In [5]:
import os, time
from bs4 import BeautifulSoup
import numpy as np
from selenium import webdriver

In [28]:
# local directory settings
main_dir = os.getcwd()
chrome_driver_dir = main_dir+r'\\chromedriver.exe'
player_info_dir = main_dir+r'\\Player Information\\'
match_lineup_dir = main_dir+r'\\Match Lineup Information\\'

# make directory files
try:
    os.chdir(player_info_dir)
    os.chdir(main_dir)
except FileNotFoundError:
    os.mkdir('Player Information')
try:
    os.chdir(match_lineup_dir)
    os.chdir(main_dir)
except FileNotFoundError:
    os.mkdir('Match Lineup Information')
    
# data file storing player_id in column 0 and player_name in column 1
player_id_csv = player_info_dir+'player_id.csv'


def get_player_name(player_id, chrome_driver_dir=chrome_driver_dir):
    ''' (int) -> str
    Given player_id, return player name:
        - If player_id already searched, search from player_id.csv file.
        - Else, search from web.
        - Return 'Unregistered' if player_id == 0.
    '''
    # player_id = 0: no player information
    if player_id == '0': return 'Unregistered'
    csv_empty = False # boolean for if player_id_csv is empty or not
    # check if player_id is already stored in 'Player Information\player_id_csv'
    try: player_id_data = np.genfromtxt(player_id_csv, dtype='str',\
                                        delimiter=',', encoding='utf8')
    except OSError: # player_id.csv not made yet, so make one
        data = open(player_id_csv, 'w', encoding='utf8')
        data.write('player_id,player_name\n')
        data.close()
        player_id_data = np.genfromtxt(player_id_csv, dtype='str',\
                                       delimiter=',', encoding='utf8')
    
    if np.size(player_id_data) == 2: # empty csv file
        csv_empty = True
        L_player_id = np.array([]) # empty list of player id's
        L_player_name = np.array([]) # empty list of player names
    
    if not csv_empty: # code goes here only if csv file is not empty
        # fetch data from player_id_csv
        L_player_id = player_id_data[1:,0] # list of all player id's
        L_player_name = player_id_data[1:,1] # list of all player names
        # check if player_id is in player_id_csv
        if player_id in L_player_id: # player_id already stored
            i_player_id = np.where(L_player_id == player_id)[0][0]
            return L_player_name[i_player_id]
    
    # the code will continue if player_id is new to player_id_csv
    print('\t\tadding new player to the database; player_id = '+str(player_id))
    # define url's
    base_url = "https://www.whoscored.com/"
    player_url = base_url + 'Players/'+str(player_id)
    
    # access player info page
    driver = webdriver.Chrome(chrome_driver_dir)
    driver.get(player_url)
    driver.implicitly_wait(5) # wait to avoid no-load error
    time.sleep(10)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # get player name
    try: player_name = soup.find('meta', attrs={'name':'title'})['content'][:-10]
    except TypeError: return 'Unregistered' # player data not found
    
    # close driver
    driver.delete_all_cookies()
    driver.close()
    
    # save the new player_id and player_name to player_id_csv file
    L_player_id = np.concatenate((['player_id'], L_player_id, [player_id]))
    L_player_name = np.concatenate((['player_name'], L_player_name, [player_name]))
    np.savetxt(player_id_csv, np.transpose(np.array([L_player_id, L_player_name])),\
               fmt='%s', delimiter=',', encoding='utf8')
    
    return player_name


def match_lineup(match_id, driver):
    ''' (int or str, (str)) -> list, array of arrays, list, array of arrays
    Given the match_id (and directory for chromedriver.exe), return 'home team info list',
    'home team lineup array', 'away team info list', and 'away team lineup array' in the
    following data structure:
    'L_team': [team_id, team_name],
    'L_team_lineup': [(player_id, player_name, player_position, played_time/90.0)_i].
    '''
    print('\tGathering match lineup data for match; match_id = '+str(match_id))
    # define url's
    base_url = "https://www.whoscored.com/"
    match_url = base_url + 'Matches/'+str(match_id)
    player_url = base_url + 'Players/'
    
    # define player roles
    role = ['Goalkeeper', 'Defender', 'Midfielder', 'Forward']
    
    # list for storing information
    L_home, L_home_lineup, L_away, L_away_lineup = [], [], [], []
    _L_total = [L_home_lineup, L_away_lineup] # variable defined for convenience
    
    # access match_id match
    driver.get(match_url) # go to match_url
    
    # access url with lineup and substitution information
    match_soup = BeautifulSoup(driver.page_source, 'lxml')
    match_center_href = match_soup.find('a', href=True, text='Match Centre')
    lineup_url = base_url + match_center_href['href']
    driver.get(lineup_url) # go to lineup_url
    driver.implicitly_wait(5) # wait to avoid no-load error
    time.sleep(10)
    lineup_soup = BeautifulSoup(driver.page_source)

    # access team information
    home_team = lineup_soup.find_all(attrs={"class":"match-centre-header-team",\
                                            "data-field":"home"})
    #print(home_team)
    away_team = lineup_soup.find_all(attrs={"class":"match-centre-header-team",\
                                            "data-field":"away"})
    #print(away_team)
    home_team_id = str(home_team[0].find('a', class_='team-name')['href']).split('/')[2]
    #print(home_team_id)
    home_team_name = str(home_team[0].find('a', class_='team-name')).split('>')[1][:-3]
    away_team_id = str(away_team[0].find('a', class_='team-name')['href']).split('/')[2]
    away_team_name = str(away_team[0].find('a', class_='team-name')).split('>')[1][:-3]
    L_home = np.array([str(home_team_id), home_team_name])
    L_away = np.array([str(away_team_id), away_team_name])
    
    # access starting lineup information
    players_home_starters = (lineup_soup.find_all(attrs={"class":"player", "style":True,\
                                                         "data-field":"home"}))
    players_away_starters = (lineup_soup.find_all(attrs={"class":"player", "style":True,\
                                                         "data-field":"away"}))
    
    # get home & away player name, player id, and player position for starters
    for i in range(2):
        for player in [players_home_starters, players_away_starters][i]:
            player_name = player.find('div', class_='player-name-wrapper',
                                      title=True)['title'] # player name
            player_id = player['data-player-id'] # id tag for each player
            # deduce player position from team formation
            player_position = float(player['style'].split(' ')[1].\
                                    strip().replace('%','').replace(';',''))
            if player_position < 10: player_position = role[0]
            elif player_position < 36: player_position = role[1]
            elif player_position < 58.5: player_position = role[2]
            else: player_position = role[3]
            # store player information in strings; assign played_time/90 = 1 for now
            _L_total[i].append([str(player_id), str(player_name),\
                                str(player_position), '1.0'])
    
    # change data type to np.array for easier handling
    _L_total = [np.array(L_home_lineup), np.array(L_away_lineup)]

    # access substitution information
    home_sub = lineup_soup.find_all(class_='key-incident home-incident')
    away_sub = lineup_soup.find_all(class_='key-incident away-incident')
    for i in range(2):
        global events, substitutions
        substitutions = [home_sub, away_sub][i]
        #print(substitutions)
        # find substitution information
        subbings_on_id, subbings_on_time, subbings_off_id, subbings_off_time = [], [], [], []
        for events in substitutions:
            #print(events)
            sub_on = events.find_all(attrs={'data-minute':True, 'data-second':True,\
                                           'data-event-satisfier-subon':True})
            #print(sub_on)
            sub_off = events.find_all(attrs={'data-minute':True, 'data-second':True,\
                                             'data-event-satisfier-suboff':True})
            #print(sub_off)
            if sub_on != []:
                sub_minute=sub_on[0]['data-minute']
                sub_secs=sub_on[0]['data-second']
                sub_on_player_id = sub_on[0]['data-player-id']
                subbings_on_id.append(int(sub_on_player_id))
                subbings_on_time.append([int(sub_minute), int(sub_secs)])
            if sub_off != []:
                sub_minute=sub_off[0]['data-minute']
                sub_secs=sub_off[0]['data-second']
                sub_off_player_id = sub_off[0]['data-player-id']
                subbings_off_id.append(int(sub_off_player_id))
                subbings_off_time.append([int(sub_minute), int(sub_secs)])
        print('sub_on sub_off')
        print(subbings_on_id)
        print(subbings_off_id)
        sub_player_names = lineup_soup.find_all(attrs={"class":"player-name", "href":True})
        #print(sub_player_names)
        L_sub_player_id, L_sub_player_name = [], []
        # get all available player id-name information
        for player_i in sub_player_names:
            if player_i['href'] != '':
                #print(player_i)
                if (player_i['href'].split('/')[2]) !='':
                    player_i_id = int(player_i['href'].split('/')[2])
                    player_i_name = player_i.get_text()
                    L_sub_player_id.append(player_i_id)
                    L_sub_player_name.append(player_i_name)
        # search player name based on player id
        for _i in range(len(subbings_off_id)):
            # 1. find the name of the player subbing off from L_home/L_away, and
            # change the played_time of the player
            sub_off_player_id, sub_off_player_name = subbings_off_id[_i],\
                                                     L_sub_player_name[L_sub_player_id.index(subbings_off_id[_i])]
            sub_time = subbings_off_time[_i][0] + subbings_off_time[_i][0]/60.0
            print(_L_total[i][:,0], sub_off_player_id, np.where(_L_total[i][:,0] == str(sub_off_player_id)))
            sub_off_i = np.where(_L_total[i][:,0] == str(sub_off_player_id))[0][0]
            _L_total[i][sub_off_i,3] = str(sub_time/90.0)
            # 2. find the name of the player subbing on from L_home/L_away, and
            # add the player information
            sub_on_player_name = L_sub_player_name[L_sub_player_id.index(subbings_on_id[_i])] 
            # assume the playing position of subbed player is the same
            sub_on_player_position = _L_total[i][sub_off_i,2]
            sub_on_player_time = str(1.0-sub_time/90.0)
            _L_total[i] = np.concatenate((_L_total[i], [[str(subbings_on_id[_i]),\
                                                         str(sub_on_player_name),\
                                                         str(sub_on_player_position),\
                                                         str(sub_on_player_time)]]))
    L_home_lineup, L_away_lineup = _L_total

    # for fewer than 3 substitutions, add ('', '', '', '0.0')
    n_home_player_played = np.shape(L_home_lineup)[0]
    n_away_player_played = np.shape(L_away_lineup)[0]
    if n_home_player_played < 14:
        for i in range(14-n_home_player_played):
            L_home_lineup = np.concatenate((L_home_lineup, [['', '', '', '0.0']]))
    if n_away_player_played < 14:
        for i in range(14-n_away_player_played):
            L_away_lineup = np.concatenate((L_away_lineup, [['', '', '', '0.0']]))
    
    print('\tmatch_id: '+str(match_id)+', complete!')

    return L_home, L_away, L_home_lineup, L_away_lineup


def record_match_lineup(match_id, match_lineup_csv, driver):
    ''' (int, str, driver) -> None
    Given match_id, record match lineup information in match_lineup_csv, with
    columns being: "match_id, home_team_id, home_team_name,\
                    (player_id, player_name, player_position, played_time)_i=1~14,\
                    away_team_id, away_team_name,\
                    (player_id, player_name, player_position, played_time)_i=1~14".
    For fewer than 3 substitutions in match, fill in the columns with ('', '', '', '0.0').
    '''
    # get match lineup information
    L_home, L_away, L_home_lineup, L_away_lineup = match_lineup(match_id, driver)
        
    # open match_lineup_csv to add information
    data = open(match_lineup_csv, 'a', encoding='utf8')
    new_line = str(match_id)+','
    new_line += ','.join(L_home)+','
    new_line += ','.join(L_away)+','
    new_line += ','.join(np.ndarray.flatten(L_home_lineup))+','
    new_line += ','.join(np.ndarray.flatten(L_away_lineup))+'\n'
    data.write(new_line)
    data.close()
    
    print('\tmatch id: '+str(match_id)+', recorded.')
    
    return None

def check_file_exists(match_lineup_csv):
    # check if match_lineup_csv is empty or not
    try:
        data = open(match_lineup_csv, 'r', encoding='utf8')
        data.close()
    except OSError or FileNotFoundError: # match_lineup.csv not made yet, so make one
        data = open(match_lineup_csv, 'a') # 'a' instead of 'w' to be safe
        # write header
        header = 'match_id,h_id,h_name,a_id,a_name,'
        h_player_header, a_player_header = '', ''
        for i in range(1,15):
            # home player header
            h_player_i = 'h_p'+str(i)+'_'
            h_player_header += h_player_i+'id,'   # player_i id
            h_player_header += h_player_i+'name,' # player_i name
            h_player_header += h_player_i+'pos,'  # player_i position
            h_player_header += h_player_i+'t,'    # player_i (played) time
            # away player header
            a_player_i = 'a_p'+str(i)+'_'
            a_player_header += a_player_i+'id,'   # player_i id
            a_player_header += a_player_i+'name,' # player_i name
            a_player_header += a_player_i+'pos,'  # player_i position
            a_player_header += a_player_i+'t,'    # player_i (played) time
        header = header + h_player_header + a_player_header
        header = header[:-1] + '\n'
        data.write(header)
        data.close()
    return None

def open_driver(chrome_driver_dir=chrome_driver_dir):
    driver = webdriver.Chrome(chrome_driver_dir)
    return driver

def close_driver(driver):
    if driver == None: return None
    driver.delete_all_cookies()
    driver.close()
    return None

def get_match_id(match_id_csv):
    data = np.genfromtxt(match_id_csv, delimiter=',', dtype='str', encoding='utf8')
    return data[1:,-1]
    

In [None]:
# Generate match roster csv
last_id = 829716 # before crashing
# note: 317896 -> no sub-in player info for 90' Burnley sub
#829661 had an error of integer in base 10 something something

# local directory settings
main_dir = os.getcwd()
chrome_driver_dir = main_dir+r'\\chromedriver.exe'
match_id_dir = main_dir+r'\\Match ID\\'

# list of csv names for each season match info
L_csv_lineup = ['match_lineup_1415.csv',
               'match_lineup_1516.csv',
               'match_lineup_1617.csv',
               'match_lineup_1718.csv',
               'match_lineup_1819.csv']

# list of csv names for match_id
L_csv_match = ['match_id_1415.csv',
               'match_id_1516.csv',
               'match_id_1617.csv',
               'match_id_1718.csv',
               'match_id_1819.csv']

for i in range(len(L_csv_lineup)): # iterate over each each season
    csv_lineup = match_lineup_dir+L_csv_lineup[i]
    check_file_exists(csv_lineup)
    L_match_id = get_match_id(match_id_dir+L_csv_match[i])
    driver = None
    print('season: '+L_csv_lineup[i])
    _L_match_id = []
    for j in range(len(L_match_id)):
        if int(L_match_id[j]) >= last_id: _L_match_id.append(L_match_id[j])
    L_match_id = _L_match_id
    for j in range(len(L_match_id)): # iterate over each match
        if j % 10 == 0: # refresh for every 10 searches
            close_driver(driver)
            print('\trefreshing driver...')
            driver = open_driver()
        t_i = time.time()
        record_match_lineup(L_match_id[j], csv_lineup, driver)
        t_f = time.time()
        print('\t'+L_match_id[j]+', elapsed time: '+str(round(t_f-t_i,2))+' seconds')
    driver.close()

season: match_lineup_1415.csv
	refreshing driver...
	Gathering match lineup data for match; match_id = 829716
sub_on sub_off
[124316, 69738]
[845, 39308]
['73379' '11367' '6292' '23072' '27550' '845' '6321' '4522' '25244'
 '24444' '39308'] 845 (array([5], dtype=int64),)
['73379' '11367' '6292' '23072' '27550' '845' '6321' '4522' '25244'
 '24444' '39308' '124316'] 39308 (array([10], dtype=int64),)
sub_on sub_off
[35460, 5025]
[24366, 43742]
['7890' '928' '11669' '76499' '33878' '24366' '7411' '8213' '43742'
 '35237' '82102'] 24366 (array([5], dtype=int64),)
['7890' '928' '11669' '76499' '33878' '24366' '7411' '8213' '43742'
 '35237' '82102' '35460'] 43742 (array([8], dtype=int64),)
	match_id: 829716, complete!
	match id: 829716, recorded.
	829716, elapsed time: 101.28 seconds
	Gathering match lineup data for match; match_id = 829717
sub_on sub_off
[26298, 41489]
[9766, 73382]
['43948' '82394' '3841' '71268' '2223' '14268' '73382' '11235' '33833'
 '35847' '9766'] 9766 (array([10], dtype=

sub_on sub_off
[17, 105577]
[21683, 80767]
['52197' '4574' '19840' '3817' '113275' '31451' '68659' '21683' '80767'
 '97692' '8409'] 21683 (array([7], dtype=int64),)
['52197' '4574' '19840' '3817' '113275' '31451' '68659' '21683' '80767'
 '97692' '8409' '17'] 80767 (array([8], dtype=int64),)
sub_on sub_off
[41489, 35847]
[26298, 14269]
['43948' '82394' '3841' '1019' '2223' '35605' '14269' '19674' '11235'
 '73382' '26298'] 26298 (array([10], dtype=int64),)
['43948' '82394' '3841' '1019' '2223' '35605' '14269' '19674' '11235'
 '73382' '26298' '41489'] 14269 (array([6], dtype=int64),)
	match_id: 829725, complete!
	match id: 829725, recorded.
	829725, elapsed time: 65.42 seconds
	refreshing driver...
	Gathering match lineup data for match; match_id = 829726
sub_on sub_off
[29762]
[34693]
['23122' '38772' '75177' '29798' '24148' '3860' '39935' '8505' '34693'
 '26013' '3807'] 34693 (array([8], dtype=int64),)
sub_on sub_off
[71174, 4092]
[18701, 8166]
['79554' '71345' '81726' '22079' '2115' '8