# Scrape from [transfermarkt](https://www.transfermarkt.com/major-league-soccer/startseite/wettbewerb/MLS1)

In [24]:
import logging
import re
import sys

import bs4
import numpy as np
import pandas as pd
import requests

In [123]:
def get_team_links(season):
    """Collect the squad-page link of every club on the MLS overview page.

    Downloads the Transfermarkt MLS competition page for ``season`` and reads
    every ``<tr>`` in the body of the first ``<table class="items">``.

    Args:
        season: Value placed in the ``saison_id`` query parameter and in the
            generated squad links (e.g. ``2020``).

    Returns:
        pandas.DataFrame with the columns ``team_name``, ``club``,
        ``team_num``, ``link`` and ``season``; one row per table row.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: No response arrived within the timeout.
        ValueError: The page does not contain the expected table.
    """
    columns = ['team_name', 'club', 'team_num', 'link', 'season']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0"
    url = f'https://www.transfermarkt.com/major-league-soccer/startseite/wettbewerb/MLS1/plus/?saison_id={season}'

    # Without a timeout a stalled connection would block forever, and without
    # the status check an error page would be parsed as if it were the table.
    response = requests.get(url, headers={'User-Agent': user_agent}, timeout=30)
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
    table = soup.find('table', attrs={'class': 'items'})
    body = table.find('tbody') if table is not None else None
    if body is None:
        raise ValueError(f"no <table class='items'> body found at {url}")

    records = []
    for row in body.find_all('tr'):
        anchor = row.find('a')  # first link of the row
        club = anchor.find('img')['alt']
        # Segments 1 and 4 of the href are used as the URL slug and the club
        # id -- presumably '/<slug>/startseite/verein/<id>/...'; confirm
        # against the live page.
        info = anchor['href'].split('/')
        team_name = info[1]
        team_num = info[4]
        team_link = f"https://www.transfermarkt.com/{team_name}/kader/verein/{team_num}/saison_id/{season}/plus/1"
        records.append([team_name, club, team_num, team_link, season])

    # Build the frame once instead of enlarging it row by row.
    return pd.DataFrame(records, columns=columns)


In [124]:
# Fetch the club table for the 2020 season; every later cell that scrapes
# squads reads its rows from `team_links`.
team_links = get_team_links(2020)

In [175]:
def get_player_bio(team_links_row):
    """Scrape one club's squad page into a per-player DataFrame.

    Args:
        team_links_row: Mapping-like row (e.g. one row of the frame returned
            by ``get_team_links``) providing the keys ``link``, ``season``
            and ``club``.

    Returns:
        pandas.DataFrame with the columns ``first_name``, ``last_name``,
        ``player`` (first initial + last name), ``club``, ``value``,
        ``season``, ``age`` (whole years from birthday to now) and
        ``join_age`` (whole years from birthday to joining date). The two
        age columns are floats and are NaN where a date is missing or
        unparseable.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: No response arrived within the timeout.
        ValueError: The page does not contain the expected table.
    """
    logger = logging.getLogger(__name__)
    columns = ['first_name', 'last_name', 'player', 'club', 'bday', 'join', 'value', 'season']
    url = team_links_row['link']
    season = team_links_row['season']
    club = team_links_row['club']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0"

    # Bound the wait and fail loudly on HTTP errors instead of parsing an
    # error page.
    response = requests.get(url, headers={'User-Agent': user_agent}, timeout=30)
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
    table = soup.find('table', attrs={'class': 'items'})
    body = table.find('tbody') if table is not None else None
    if body is None:
        raise ValueError(f"no <table class='items'> body found at {url}")

    records = []
    for row in body.find_all('tr'):
        info = [cell.text.strip() for cell in row.find_all('td')]
        # Cells up to index 12 are read below, so shorter rows cannot be
        # player rows and are skipped.
        if len(info) < 13:
            continue

        name_cell = info[3]
        try:
            # The full name is taken to end where the second-to-last
            # uppercase letter starts -- presumably the cell text is the full
            # name immediately followed by a short form such as
            # "K. Vermeer"; confirm against the live page.
            split_at = [i for i, ch in enumerate(name_cell) if ch.isupper()][-2]
            first, *last_parts = name_cell[:split_at].split()
        except (IndexError, ValueError):
            # IndexError: fewer than two uppercase letters in the cell.
            # ValueError: nothing left in front of the split point.
            logger.warning("skipping player of %s: cannot split name %r", club, name_cell)
            continue
        last = " ".join(last_parts).strip('.').strip()

        # Drop the trailing "(...)" part of the birthday cell; anything
        # shorter than 8 characters is treated as "no date".
        bday = info[5].split('(')[0].strip()
        if len(bday) < 8:
            bday = None

        records.append({
            'first_name': first,
            'last_name': last,
            'player': f"{first[0].upper()}. {last}",
            'club': club,
            'bday': bday,
            'join': info[9],
            'value': info[12],
            'season': season,
        })

    # DataFrame.append was removed in pandas 2.0, so the frame is built once
    # from the collected records.
    df = pd.DataFrame(records, columns=columns)

    # Unparseable dates become NaT so a single odd cell does not abort the
    # whole club.
    df['bday'] = pd.to_datetime(df['bday'], errors='coerce')
    df['join'] = pd.to_datetime(df['join'], errors='coerce')

    # pandas 2.x rejects .astype('<m8[Y]'); dividing by the mean Gregorian
    # year and flooring gives whole years as floats (NaN for NaT).
    year = pd.Timedelta(days=365.2425)
    now = pd.Timestamp('now')
    df['age'] = np.floor((now - df['bday']) / year)
    df['join_age'] = np.floor((df['join'] - df['bday']) / year)

    return df.drop(columns=['bday', 'join'])

In [176]:
# Scrape every club's squad page. iterrows() hands over each row as a Series,
# so this no longer depends on `team_links` having index labels 0..n-1.
dfs = [get_player_bio(team_row) for _, team_row in team_links.iterrows()]

# Stack the per-club frames into one table with a fresh 0..n-1 index.
player_df = pd.concat(dfs, ignore_index=True)

Unnamed: 0,first_name,last_name,player,club,value,season,age,join_age
0,Kenneth,Vermeer,K. Vermeer,Los Angeles FC,€800Th.,2020,35.0,34.0
1,Pablo,Sisniega,P. Sisniega,Los Angeles FC,€500Th.,2020,25.0,23.0
2,Tomás,Romero,T. Romero,Los Angeles FC,€100Th.,2020,20.0,20.0
3,Eddie,Segura,E. Segura,Los Angeles FC,€2.50m,2020,24.0,22.0
4,Jesús,Murillo,J. Murillo,Los Angeles FC,€1.20m,2020,27.0,26.0
...,...,...,...,...,...,...,...,...
723,Tate,Schmitt,T. Schmitt,Real Salt Lake City,€200Th.,2020,23.0,21.0
724,Christopher,Garcia,C. Garcia,Real Salt Lake City,€100Th.,2020,18.0,17.0
725,Douglas,Martínez,D. Martínez,Real Salt Lake City,€600Th.,2020,23.0,22.0
726,Rubio,Rubin,R. Rubin,Real Salt Lake City,€400Th.,2020,25.0,24.0


In [177]:
# Rebuild the combined player table from the per-club frames in `dfs`
# (same statement as at the end of the scraping cell).
player_df = pd.concat(dfs, ignore_index=True)

In [192]:
# Persist the combined table. `index` is left at its default, so the row index
# is written as an unnamed first column. The `data/` directory must already
# exist; to_csv does not create it.
player_df.to_csv('data/player_bio.csv')

In [198]:
# Work in progress: this cell is meant to become `get_play_time(season)`.
# For now it only downloads the MLS season stats page and leaves the rows of
# the first <tbody> in `rows` for inspection.
season = 2020
# NOTE(review): the column list is the one used by get_team_links and `df` is
# not filled in yet -- adjust once the stats columns are decided.
df = pd.DataFrame(columns=['team_name', 'club', 'team_num', 'link', 'season'])
user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0"
url = f'https://www.mlssoccer.com/stats/season?franchise=select&year={season}'
# Bound the wait and surface HTTP errors instead of parsing an error page.
response = requests.get(url, headers={'User-Agent': user_agent}, timeout=30)
response.raise_for_status()
content = response.content.decode('utf-8')
soup = bs4.BeautifulSoup(content, 'html.parser')
table = soup.find('tbody')
rows = table.find_all('tr')
# TODO: fill `df` from `rows`, then wrap this cell in
# `def get_play_time(season):` and return `df`.

In [200]:
# Display the first scraped <tr> to inspect which `data-title` cells a stats
# row carries.
rows[0]

<tr class="odd"><td data-title="Player"><a href="/players/diego-rossi">Diego Rossi</a></td><td data-title="Club">LAFC</td><td data-title="POS">F</td><td data-title="GP">19</td><td data-title="GS">19</td><td data-title="MINS">1698</td><td data-title="G">14</td><td data-title="A">4</td><td data-title="SHTS">71</td><td data-title="SOG">34</td><td data-title="GWG">1</td><td data-title="PKG/A">1/1</td><td data-title="HmG">11</td><td data-title="RdG">3</td><td data-title="G/90min">0.74</td><td data-title="SC%">19.7</td> </tr>