In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

In [2]:
base_url = 'https://sofifa.com'

ids = {'Manchester City': 3466, 'Chelsea': 3475, 'Liverpool': 3462, 
       'Manchester United': 3457, 'Tottenham Hotspur': 3470, 'Liverpool': 3459,
       'Everton': 3467, 'Burnley': 4234, 'Leicester City': 8021, 'West Ham United': 3463,
       'Southampton': 6504, 'Crystal Palace': 7261, 'Watford': 8784, 'Fulham': 3474,
       'Newcastle United': 3458, 'Wolverhampton Wanderers': 4225,
       'Brighton & Hove Albion': -1, 'Bournemouth': 8779, 'Huddersfield Town': -1, 
       'Cardiff City': 8344, 'Arsenal': 3459, 'Stoke City': 3472,
       'West Bromwich Albion': 3460, 'Swansea City': 5744}
       
       
team_api_ids = {'Manchester City': 8456, 'Chelsea': 8455, 'Liverpool': 8650, 
                'Manchester United': 10260, 'Tottenham Hotspur': 8586, 'Liverpool': 9825,
                'Everton': 8668, 'Burnley': 8191, 'Leicester City': 8197, 'West Ham United': 8654,
                'Southampton': 8466, 'Crystal Palace': 9826, 'Watford': 9817, 'Fulham': 9879,
                'Newcastle United': 10261, 'Wolverhampton Wanderers': 8602, 
                'Brighton & Hove Albion': -1, 'Bournemouth': 8678, 'Huddersfield Town': -1, 
                'Cardiff City': 7276, 'Arsenal': 9825, 'Stoke City': 10194, 
                'West Bromwich Albion': 8659, 'Swansea City': 10003}

column_headers = ['id', 'team_fifa_api_id', 'date', 'buildUpPlaySpeed', 'buildUpPlaySpeedClass',
                 'BuildUpPlayDribbling', 'BuildUpPlayDribblingClass', 'BuildupPlayPositioningClass',
                 'buildUpPlayPassing', 'buildUpPlayPassingClass', 'buildUpPlayPositioningClass',
                 'chanceCreationClass', 'chanceCreationCrossing', 'chanceCreationCrossingClass',
                 'chanceCreationShooting', 'chanceCreationShootingClass',
                 'chanceCreationPositioningClass', 'defencePressure', 'defencePressureClass',
                 'defenceAgression', 'defenceAgressionClass', 'defenceTeamWidth',
                 'defenceTeamWidthClass', 'defenceDefenderLineClass']

In [6]:
def soup_maker(url):
    r = requests.get(url)
    markup = r.content
    soup = bs(markup, 'lxml')
    return soup


def find_team_links(soup):
    table = soup.find('table', {'class': 'table table-hover persist-area'})  # Table of teams
    tbody = table.find('tbody')
    all_a = tbody.find_all('a', {'class': ''})
    return [base_url + link['href'] + year_extension for link in all_a if '/team/' in link['href']]


def get_team_attributes(soup):
    df_row = pd.DataFrame(columns=column_headers)
    info = soup.find('div', {'class': 'info'}).find('h1').text.split('(')
    team_name = info[0].strip()
    team_fifa_api_id = info[1].split(': ')[1].split(')')[0]
    
    df_row.loc[0, 'id'] = str(ids[team_name])
    df_row.loc[0, 'team_api_id'] = str(team_api_ids[team_name])
    df_row.loc[0, 'team_fifa_api_id'] = team_fifa_api_id
    
    card_div = soup.find('div', {'class': 'card mb-2'})
    attributes_div = card_div.find('div', {'class': 'card-body'})
    data = attributes_div.find_all('dd')
    start = False
    i = 3
    for datum in data:
        category = datum.find('span', {'class': ['tooltip', 'multiline']})
        if start or 'Speed' in category.text:
            start = True
            category = 'Dribbling' if category is None else category.text
            vals = datum.find('span', {'class': 'float-right'}).text
            if category != 'Positioning' and category != 'Defender Line':
                vals = vals.split()
            else:
                vals = [vals]
            for val in vals:
                df_row.loc[0, column_headers[i]] = val
                i += 1 
    return df_row


def get_all_team_attributes(team_urls):
    team_attrs = pd.DataFrame(columns=column_headers)
    for team_url in team_urls:
        team_soup = soup_maker(team_url)
        df_row = get_team_attributes(team_soup)
        team_attrs = team_attrs.append(df_row, ignore_index=True)
    return team_attrs

In [7]:
team_attributes = pd.DataFrame(columns=column_headers)
years = [2018, 2019]
for year in years:
    if year == 2018:
        year_extension = '?lg%5B0%5D=13&v=18&e=158865&set=true' # BPL Fifa 18, Sept 28 2017
        date = '2018-09-28 00:00:00'
    elif year == 2019:
        year_extension = '?lg%5B0%5D=13&v=19&e=159229&set=true'  # BPL Fifa 19, Sept 27 2018
        date = '2019-07-28 00:00:00'
        
    teams_url = base_url + '/teams' + year_extension
    teams_soup = soup_maker(teams_url)
    team_urls = find_team_links(teams_soup)
    df = get_all_team_attributes(team_urls)
    df['date'] = date
    team_attributes = team_attributes.append(df, ignore_index=True)

In [10]:
team_attributes

Unnamed: 0,BuildUpPlayDribbling,BuildUpPlayDribblingClass,BuildupPlayPositioningClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,buildUpPlaySpeed,buildUpPlaySpeedClass,chanceCreationClass,chanceCreationCrossing,...,defenceAgression,defenceAgressionClass,defenceDefenderLineClass,defencePressure,defencePressureClass,defenceTeamWidth,defenceTeamWidthClass,id,team_api_id,team_fifa_api_id
0,50,Normal,38,Mixed,Organised,31,36,Balanced,Safe,41,...,51,Press,Cover,67,High,51,Normal,3475,8455,5
1,47,Normal,30,Short,Organised,33,29,Slow,Safe,53,...,53,Press,Cover,67,High,33,Narrow,3459,9825,1
2,50,Normal,28,Short,Free Form,37,30,Slow,Normal,36,...,52,Press,Cover,60,Medium,66,Normal,3466,8456,10
3,52,Normal,49,Mixed,Organised,36,50,Balanced,Normal,62,...,52,Press,Cover,66,Medium,52,Normal,3457,10260,11
4,76,Lots,53,Mixed,Organised,50,49,Balanced,Normal,56,...,66,Press,Cover,67,High,66,Normal,3459,9825,9
5,22,Little,38,Mixed,Organised,33,38,Balanced,Safe,33,...,63,Press,Cover,68,High,63,Normal,3470,8586,18
6,25,Little,40,Mixed,Organised,31,36,Balanced,Safe,41,...,51,Press,Cover,67,High,51,Normal,3467,8668,7
7,38,Normal,51,Mixed,Organised,44,51,Balanced,Normal,78,...,44,Press,Cover,47,Medium,53,Normal,3463,8654,19
8,48,Normal,73,Long,Organised,64,73,Fast,Normal,48,...,54,Press,Cover,47,Medium,54,Normal,8021,8197,95
9,43,Normal,40,Mixed,Organised,39,39,Balanced,Normal,39,...,59,Press,Cover,68,High,39,Normal,6504,8466,17
