In [28]:
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
from time import sleep
from random import randint

In [29]:
# Load the table of player profile links written by the link-scraping step.
df_links = pd.read_csv('../player_link_scrapping/output/player_links.csv')

# Pull the 'URL' column out as a plain Python list to iterate over later.
links = df_links['URL'].tolist()

# HTTP headers sent with every request; only a User-Agent string is set.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;Win64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
}

In [None]:
# Scrape every player page in `links` and collect one dictionary per player.

# Exceptions that all mean "the expected element is not on this page":
#   AttributeError - a find()/find_next() lookup returned None
#   IndexError     - the located text was empty, so there is nothing to index
#   TypeError      - subscripting the result of a lookup that returned None
#   KeyError       - the tag exists but lacks the requested attribute
MISSING_FIELD_ERRORS = (AttributeError, IndexError, KeyError, TypeError)

MAX_ATTEMPTS = 5              # tries per link before the link is skipped
RETRY_WAIT_SECONDS = 10       # pause between two tries for the same link
REQUEST_TIMEOUT_SECONDS = 30  # stops a stalled connection from hanging the run


def _fetch_soup(url):
    """Download `url` and return its parsed HTML.

    The request is retried on any requests-level failure (connection error,
    timeout, HTTP error status). Returns a BeautifulSoup object on success,
    or None once MAX_ATTEMPTS attempts have failed.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = rq.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
            # Turn 4xx/5xx answers into exceptions so error pages are retried
            # instead of being parsed as if they were player profiles.
            response.raise_for_status()
            return bs(response.text, 'html.parser')
        except rq.exceptions.RequestException as error:
            print('Request failed ({}) : attempt {} of {}'.format(error, attempt, MAX_ATTEMPTS))
            if attempt < MAX_ATTEMPTS:
                print('Sleeping for {} seconds before retrying'.format(RETRY_WAIT_SECONDS))
                sleep(RETRY_WAIT_SECONDS)
    return None


def _labelled_span(spans, label):
    """Return the <span> following the first span whose text contains `label`.

    Returns None when no span contains `label` or when nothing follows it.
    """
    for span in spans:
        if label in span.text:
            return span.find_next('span')
    return None


def _extract(extractor, missing_message):
    """Run `extractor()` and return its result.

    If the page lacks the element (any of MISSING_FIELD_ERRORS is raised),
    print `missing_message` and return None instead of aborting the scrape.
    """
    try:
        return extractor()
    except MISSING_FIELD_ERRORS:
        print(missing_message)
        return None


# List to store data of each player in the form of a dictionary
list_of_row_dicts = []

# Loop through each link to scrape player data
for link in links:

    soup = _fetch_soup(link)
    if soup is None:
        # Skip this player but keep everything scraped so far.
        print('Giving up on {}'.format(link))
        continue

    # All <span> tags of the page; the labelled fields are looked up in these
    title_spans = soup.find_all('span')

    # Headline text with every digit-containing word removed
    name = _extract(
        lambda: ' '.join(
            word for word in soup.find('h1').text.split()
            if not any(char.isdigit() for char in word)
        ),
        'Name not found',
    )

    date_of_birth = _extract(
        lambda: _labelled_span(title_spans, 'Date of birth:').text.strip(),
        'DOB not found',
    )

    city_of_birth = _extract(
        lambda: _labelled_span(title_spans, 'Place of birth:').text.strip(),
        'City not found',
    )

    # The country is read from the title attribute of the <img> in that span
    country_of_birth = _extract(
        lambda: _labelled_span(title_spans, 'Place of birth:').find('img')['title'].strip(),
        'Country not found',
    )

    # Multiple citizenships are separated by two non-breaking spaces
    citizenship = _extract(
        lambda: _labelled_span(title_spans, 'Citizenship:').text.strip().split('\xa0\xa0'),
        'Citizenship not found',
    )

    # Keep only the first whitespace-separated token of the height text
    height = _extract(
        lambda: _labelled_span(title_spans, 'Height:').text.split()[0],
        'Height not found',
    )

    foot = _extract(
        lambda: _labelled_span(title_spans, 'Foot:').text,
        'Foot not found',
    )

    agent = _extract(
        lambda: _labelled_span(title_spans, 'Player agent:').text.strip(),
        'Agent not found',
    )

    outfitter = _extract(
        lambda: _labelled_span(title_spans, 'Outfitter:').text,
        'Outfitter not found',
    )

    # `string=` is the current name of BeautifulSoup's former `text=` filter
    main_position = _extract(
        lambda: soup.find('dt', string='Main position:').find_next('dd').text,
        'Main Position not found',
    )

    other_position = _extract(
        lambda: [
            dd.text
            for dd in soup.find('dt', string='Other position:').find_next_siblings('dd')
        ],
        'Other Postion not found',
    )

    youth_club = _extract(
        lambda: soup.find('div', class_='box tm-player-additional-data viewport-tracking')
        .find('div', class_='content')
        .text.strip()
        .split(','),
        'Youth Club not found',
    )

    row_dic = {
        'Player_URL': link,
        'Name': name,
        'Date_of_Birth': date_of_birth,
        'City_of_Birth': city_of_birth,
        'Country_of_Birth': country_of_birth,
        'Citizenship': citizenship,
        'Height': height,
        'Foot': foot,
        'Agent': agent,
        'Outfitter': outfitter,
        'Main_Position': main_position,
        'Other_Position': other_position,
        'Youth_Club': youth_club,
    }
    list_of_row_dicts.append(row_dic)

    print(name)
    # Random 1-3 second pause between consecutive player pages
    sleep(randint(1, 3))

df = pd.DataFrame(list_of_row_dicts)

In [31]:
# Write the scraped player table to disk as CSV (the DataFrame index is included).
output_csv_path = 'output/player_info.csv'
df.to_csv(output_csv_path)