# Scrape from [transfermarkt](https://www.transfermarkt.com/major-league-soccer/startseite/wettbewerb/MLS1)

In [24]:
import bs4
import sys
import requests
import re
import pandas as pd
import numpy as np

In [123]:
def get_team_links(season):
    # season = 2020
    df = pd.DataFrame(columns=['team_name', 'club', 'team_num', 'link', 'season'])
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0"
    url = f'https://www.transfermarkt.com/major-league-soccer/startseite/wettbewerb/MLS1/plus/?saison_id={season}'
    response = requests.get(url, headers={'User-Agent': user_agent})
    content = response.content.decode('utf-8')
    soup = bs4.BeautifulSoup(content, 'html.parser')
    table = soup.find('table', attrs={'class':'items'}).find('tbody')
    rows = table.find_all('tr')
    for row in rows:
        club = row.find('a').find('img')['alt']
        info = row.find('a')['href'].split('/')
        team_name = info[1]
        team_num = info[4]
        team_link = f"https://www.transfermarkt.com/{team_name}/kader/verein/{team_num}/saison_id/{season}/plus/1"
        df.loc[len(df.index)] = [team_name, club, team_num, team_link, season]
    return df


In [124]:
team_links = get_team_links(2020)

In [175]:
def get_player_bio(team_links_row):
    err = []
    df = pd.DataFrame(columns=['first_name', 'last_name', 'player', 'club', 'bday', 'join', 'value', 'season'])
    url = team_links_row['link']
    season = team_links_row['season']
    club = team_links_row['club']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0"
    response = requests.get(url, headers={'User-Agent': user_agent})
    content = response.content.decode('utf-8')
    soup = bs4.BeautifulSoup(content, 'html.parser')
    table = soup.find('table', attrs={'class':'items'}).find('tbody')
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        info = [ele.text.strip() for ele in cols]
        if len(info) > 3:
            s = [idx for idx in range(len(info[3])) if info[3][idx].isupper()][-2]
            name = info[3][:s]
            try:
                first, *last_temp = name.split()
                last = " ".join(last_temp).strip('.').strip()
            except:
                err.append([club, name])
                continue
            title = info[4]
            bday = info[5].split('(')[0].strip()
            if len(bday) < 8:
                bday = None
            height = info[7]
            foot = info[8]
            join = info[9]
            value = info[12]
            ddf = {
                'first_name': first,
                'last_name': last,
                'player': ". ".join([first[0].upper(), last]),
                'club': club,
                'bday': bday,
                'join': join,
                'value': value,
                'season': season
            }
            df = df.append(ddf, ignore_index=True)
            
    df['bday'] = pd.to_datetime(df['bday'])
    df['join'] = pd.to_datetime(df['join'])
    now = pd.Timestamp('now')
    df['age'] = (now - df['bday']).astype('<m8[Y]')
    df['join_age'] = (df['join'] - df['bday']).astype('<m8[Y]')
    
    df = df.drop(columns=['bday', 'join'])
        
    return df

In [176]:
dfs = []
for i in range(len(team_links)):
    dfs.append(get_player_bio(team_links.loc[i]))

player_df = pd.concat(dfs, ignore_index=True)

Unnamed: 0,first_name,last_name,player,club,value,season,age,join_age
0,Kenneth,Vermeer,K. Vermeer,Los Angeles FC,€800Th.,2020,35.0,34.0
1,Pablo,Sisniega,P. Sisniega,Los Angeles FC,€500Th.,2020,25.0,23.0
2,Tomás,Romero,T. Romero,Los Angeles FC,€100Th.,2020,20.0,20.0
3,Eddie,Segura,E. Segura,Los Angeles FC,€2.50m,2020,24.0,22.0
4,Jesús,Murillo,J. Murillo,Los Angeles FC,€1.20m,2020,27.0,26.0
...,...,...,...,...,...,...,...,...
723,Tate,Schmitt,T. Schmitt,Real Salt Lake City,€200Th.,2020,23.0,21.0
724,Christopher,Garcia,C. Garcia,Real Salt Lake City,€100Th.,2020,18.0,17.0
725,Douglas,Martínez,D. Martínez,Real Salt Lake City,€600Th.,2020,23.0,22.0
726,Rubio,Rubin,R. Rubin,Real Salt Lake City,€400Th.,2020,25.0,24.0


In [177]:
player_df = pd.concat(dfs, ignore_index=True)

In [192]:
player_df.to_csv('data/player_bio.csv')

In [213]:
def get_one_page(season, page=None):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0"
    url = f'https://www.mlssoccer.com/stats/season?page={page}&franchise=select&year={season}'
    response = requests.get(url, headers={'User-Agent': user_agent})
    content = response.content.decode('utf-8')
    soup = bs4.BeautifulSoup(content, 'html.parser')
    table = soup.find('tbody')
    rows = table.find_all('tr')
    cols = rows[0].find_all('td')
    df = pd.DataFrame(columns=[ele['data-title'] for ele in cols])
    for row in rows:
        cols = row.find_all('td')
        df.loc[len(df.index)] = [ele.text.strip() for ele in cols]
    return df

def get_player_stats(season):
    last = 27 # last page number, manually input here
    dfs = [get_one_page(season, page=p) for p in range(last)]
    return pd.concat(dfs, ignore_index=True)

In [216]:
stat_df = get_player_stats(2020)

In [217]:
stat_df.to_csv("data/play_time.csv")

In [223]:
sorted(player_df['club'].unique())

['Atlanta United FC',
 'Austin FC',
 'Chicago Fire FC',
 'Club de Foot Montréal',
 'Colorado Rapids',
 'Columbus Crew SC',
 'D.C. United',
 'FC Cincinnati',
 'FC Dallas',
 'Houston Dynamo FC',
 'Inter Miami CF',
 'Los Angeles FC',
 'Los Angeles Galaxy',
 'Minnesota United FC',
 'Nashville SC',
 'New England Revolution',
 'New York City FC',
 'New York Red Bulls',
 'Orlando City SC',
 'Philadelphia Union',
 'Portland Timbers',
 'Real Salt Lake City',
 'San Jose Earthquakes',
 'Seattle Sounders FC',
 'Sporting Kansas City',
 'Toronto FC',
 'Vancouver Whitecaps FC']

In [224]:
sorted(stat_df['Club'].unique())

['ATL',
 'ATX',
 'CHI',
 'CIN',
 'CLB',
 'CLT',
 'COL',
 'DAL',
 'DC',
 'HOU',
 'LA',
 'LAFC',
 'LFC',
 'MCF',
 'MIA',
 'MIN',
 'MTL',
 'NE',
 'NSH',
 'NY',
 'NYC',
 'ORL',
 'PHI',
 'POR',
 'RBNY',
 'RSL',
 'SEA',
 'SJ',
 'SKC',
 'TOR',
 'VAN']

In [229]:
stat_df['name'] = stat_df['Player']
stat_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,GWG,PKG/A,HmG,RdG,G/90min,SC%,name
0,Diego Rossi,LAFC,F,19,19,1698,14,4,71,34,1,1/1,11,3,0.74,19.7,Diego Rossi
1,Gyasi Zardes,CLB,F,21,20,1717,12,4,41,19,2,0/2,8,4,0.63,29.3,Gyasi Zardes
2,Raul Ruidiaz,SEA,F,17,17,1427,12,4,55,23,4,1/1,8,4,0.76,21.8,Raul Ruidiaz
3,Robert Beric,CHI,F,23,22,1931,12,1,62,29,1,1/2,7,5,0.56,19.4,Robert Beric
4,Jordan Morris,SEA,F,22,18,1724,10,8,39,18,3,0/0,8,2,0.52,25.6,Jordan Morris
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668,Sebastian Mendez,ORL,M,19,11,1087,0,0,17,3,0,0/0,0,0,0.00,0.0,Sebastian Mendez
669,Kelyn Rowe,SEA,M,16,10,896,0,0,17,4,0,0/0,0,0,0.00,0.0,Kelyn Rowe
670,Siem de Jong,CIN,M,15,8,793,0,0,18,5,0,0/1,0,0,0.00,0.0,Siem de Jong
671,Maxime Chanot,NYC,D,20,20,1712,0,0,19,4,0,0/0,0,0,0.00,0.0,Maxime Chanot


In [270]:
from unidecode import unidecode
player_df['name'] = player_df['first_name'] +" "+ player_df['last_name']
player_df

ModuleNotFoundError: No module named 'unidecode'

In [253]:
df_raw = pd.merge(stat_df, player_df, on='name', how='left')

In [254]:
df = df_raw.dropna(subset=['club'])

In [255]:
df.reset_index(drop=True)

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,SC%,name,first_name,last_name,player,club,value,season,age,join_age
0,Diego Rossi,LAFC,F,19,19,1698,14,4,71,34,...,19.7,Diego Rossi,Diego,Rossi,D. Rossi,Los Angeles FC,€20.00m,2020,23.0,19.0
1,Gyasi Zardes,CLB,F,21,20,1717,12,4,41,19,...,29.3,Gyasi Zardes,Gyasi,Zardes,G. Zardes,Columbus Crew SC,€3.50m,2020,29.0,26.0
2,Robert Beric,CHI,F,23,22,1931,12,1,62,29,...,19.4,Robert Beric,Robert,Beric,R. Beric,Chicago Fire FC,€3.00m,2020,29.0,28.0
3,Chris Mueller,ORL,F,22,17,1477,10,7,36,19,...,27.8,Chris Mueller,Chris,Mueller,C. Mueller,Orlando City SC,€3.00m,2020,24.0,21.0
4,Alejandro Pozuelo,TOR,M,23,23,2015,9,10,52,25,...,17.3,Alejandro Pozuelo,Alejandro,Pozuelo,A. Pozuelo,Toronto FC,€12.00m,2020,29.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,Jonathan Mensah,CLB,D,23,23,2070,0,0,10,4,...,0.0,Jonathan Mensah,Jonathan,Mensah,J. Mensah,Columbus Crew SC,€1.00m,2020,30.0,26.0
388,Marcelo Silva,RSL,D,15,15,1309,0,0,12,4,...,0.0,Marcelo Silva,Marcelo,Silva,M. Silva,Real Salt Lake City,€400Th.,2020,31.0,28.0
389,Sacha Kljestan,LA,M,15,8,769,0,0,13,4,...,0.0,Sacha Kljestan,Sacha,Kljestan,S. Kljestan,Los Angeles Galaxy,€350Th.,2020,35.0,34.0
390,Kelyn Rowe,SEA,M,16,10,896,0,0,17,4,...,0.0,Kelyn Rowe,Kelyn,Rowe,K. Rowe,Seattle Sounders FC,€600Th.,2020,29.0,29.0


In [258]:
df[df['club']=='New York City FC']

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,SC%,name,first_name,last_name,player,club,value,season,age,join_age
56,Anton Tinnerholm,NYC,D,23,22,2001,4,4,21,6,...,19.0,Anton Tinnerholm,Anton,Tinnerholm,A. Tinnerholm,New York City FC,€2.50m,2020,30.0,26.0
102,Keaton Parks,NYC,M,23,20,1733,3,1,40,16,...,7.5,Keaton Parks,Keaton,Parks,K. Parks,New York City FC,€3.00m,2020,23.0,22.0
165,Alexander Callens,NYC,D,22,22,1934,2,0,9,3,...,22.2,Alexander Callens,Alexander,Callens,A. Callens,New York City FC,€1.80m,2020,28.0,24.0
253,Tony Rocha,NYC,M,11,0,51,1,0,3,1,...,33.3,Tony Rocha,Tony,Rocha,T. Rocha,New York City FC,€450Th.,2020,27.0,25.0
372,Gudmundur Thórarinsson,NYC,D,19,7,683,0,1,5,1,...,0.0,Gudmundur Thórarinsson,Gudmundur,Thórarinsson,G. Thórarinsson,New York City FC,€800Th.,2020,28.0,27.0
374,James Sands,NYC,M,16,16,1409,0,1,5,0,...,0.0,James Sands,James,Sands,J. Sands,New York City FC,€2.50m,2020,20.0,16.0
469,Sebastien Ibeagha,NYC,D,7,2,282,0,0,0,0,...,0.0,Sebastien Ibeagha,Sebastien,Ibeagha,S. Ibeagha,New York City FC,€500Th.,2020,29.0,26.0
493,Tayvon Gray,NYC,D,0,0,0,0,0,0,0,...,0.0,Tayvon Gray,Tayvon,Gray,T. Gray,New York City FC,€50Th.,2020,18.0,17.0
503,Justin Haak,NYC,M,0,0,0,0,0,0,0,...,0.0,Justin Haak,Justin,Haak,J. Haak,New York City FC,€200Th.,2020,19.0,17.0
671,Maxime Chanot,NYC,D,20,20,1712,0,0,19,4,...,0.0,Maxime Chanot,Maxime,Chanot,M. Chanot,New York City FC,€1.00m,2020,31.0,26.0


In [267]:
df_1 = stat_df[stat_df['Club']=='NYC'].reset_index(drop=True)
df_2 = player_df[player_df['club']=='New York City FC'].reset_index(drop=True)
pd.merge(df_1, df_2, on='name', how='outer').sort_values(by='first_name')

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,SC%,name,first_name,last_name,player,club,value,season,age,join_age
7,Alexander Callens,NYC,D,22.0,22.0,1934.0,2.0,0.0,9.0,3.0,...,22.2,Alexander Callens,Alexander,Callens,A. Callens,New York City FC,€1.80m,2020.0,28.0,24.0
28,,,,,,,,,,,...,,Andres Jasson,Andres,Jasson,A. Jasson,New York City FC,€50Th.,2020.0,19.0,18.0
2,Anton Tinnerholm,NYC,D,23.0,22.0,2001.0,4.0,4.0,21.0,6.0,...,19.0,Anton Tinnerholm,Anton,Tinnerholm,A. Tinnerholm,New York City FC,€2.50m,2020.0,30.0,26.0
22,,,,,,,,,,,...,,Cody Mizell,Cody,Mizell,C. Mizell,New York City FC,€175Th.,2020.0,29.0,29.0
11,Gudmundur Thórarinsson,NYC,D,19.0,7.0,683.0,0.0,1.0,5.0,1.0,...,0.0,Gudmundur Thórarinsson,Gudmundur,Thórarinsson,G. Thórarinsson,New York City FC,€800Th.,2020.0,28.0,27.0
30,,,,,,,,,,,...,,Ismael Tajouri-ShradiI,Ismael,Tajouri-ShradiI,I. Tajouri-ShradiI,New York City FC,€1.00m,2020.0,26.0,23.0
12,James Sands,NYC,M,16.0,16.0,1409.0,0.0,1.0,5.0,0.0,...,0.0,James Sands,James,Sands,J. Sands,New York City FC,€2.50m,2020.0,20.0,16.0
29,,,,,,,,,,,...,,Jesús Medina,Jesús,Medina,J. Medina,New York City FC,€3.00m,2020.0,23.0,20.0
26,,,,,,,,,,,...,,Juan Torres,Juan,Torres,J. Torres,New York City FC,€150Th.,2020.0,21.0,19.0
17,Justin Haak,NYC,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Justin Haak,Justin,Haak,J. Haak,New York City FC,€200Th.,2020.0,19.0,17.0


In [269]:
len(df_1), len(df_2)

(21, 21)