In [46]:
from bs4 import BeautifulSoup as bs
import numpy as np
import requests
import pandas as pd
import re
from sklearn import preprocessing

In [47]:
base_url = 'https://sofifa.com'

ids = {'Manchester City': 3466, 'Chelsea': 3475, 'Liverpool': 3462, 
       'Manchester United': 3457, 'Tottenham Hotspur': 3470, 'Liverpool': 3459,
       'Everton': 3467, 'Burnley': 4234, 'Leicester City': 8021, 'West Ham United': 3463,
       'Southampton': 6504, 'Crystal Palace': 7261, 'Watford': 8784, 'Fulham': 3474,
       'Newcastle United': 3458, 'Wolverhampton Wanderers': 4225,
       'Brighton & Hove Albion': -1, 'Bournemouth': 8779, 'Huddersfield Town': -2, 
       'Cardiff City': 8344, 'Arsenal': 3459, 'Stoke City': 3472,
       'West Bromwich Albion': 3460, 'Swansea City': 5744}
       
       
team_api_ids = {'Manchester City': 8456, 'Chelsea': 8455, 'Liverpool': 8650, 
                'Manchester United': 10260, 'Tottenham Hotspur': 8586, 'Liverpool': 9825,
                'Everton': 8668, 'Burnley': 8191, 'Leicester City': 8197, 'West Ham United': 8654,
                'Southampton': 8466, 'Crystal Palace': 9826, 'Watford': 9817, 'Fulham': 9879,
                'Newcastle United': 10261, 'Wolverhampton Wanderers': 8602, 
                'Brighton & Hove Albion': -1, 'Bournemouth': 8678, 'Huddersfield Town': -2, 
                'Cardiff City': 7276, 'Arsenal': 9825, 'Stoke City': 10194, 
                'West Bromwich Albion': 8659, 'Swansea City': 10003}

column_headers = ['id', 'team_fifa_api_id', 'date', 'buildUpPlaySpeed', 'buildUpPlaySpeedClass',
                 'buildUpPlayDribbling', 'buildUpPlayDribblingClass',
                 'buildUpPlayPassing', 'buildUpPlayPassingClass', 'buildUpPlayPositioningClass',
                 'chanceCreationPassing', 'chanceCreationPassingClass', 'chanceCreationCrossing', 
                 'chanceCreationCrossingClass',
                 'chanceCreationShooting', 'chanceCreationShootingClass',
                 'chanceCreationPositioningClass', 'defencePressure', 'defencePressureClass',
                 'defenceAggression', 'defenceAggressionClass', 'defenceTeamWidth',
                 'defenceTeamWidthClass', 'defenceDefenderLineClass']

In [48]:
def soup_maker(url):
    r = requests.get(url)
    markup = r.content
    soup = bs(markup, 'lxml')
    return soup


def find_team_links(soup):
    table = soup.find('table', {'class': 'table table-hover persist-area'})  # Table of teams
    tbody = table.find('tbody')
    all_a = tbody.find_all('a', {'class': ''})
    return [base_url + link['href'] + year_extension for link in all_a if '/team/' in link['href']]


def get_team_attributes(soup):
    df_row = pd.DataFrame(columns=column_headers)
    info = soup.find('div', {'class': 'info'}).find('h1').text.split('(')
    team_name = info[0].strip()
    team_fifa_api_id = info[1].split(': ')[1].split(')')[0]
    
    df_row.loc[0, 'id'] = str(ids[team_name])
    df_row.loc[0, 'team_api_id'] = str(team_api_ids[team_name])
    df_row.loc[0, 'team_fifa_api_id'] = team_fifa_api_id
    
    card_div = soup.find('div', {'class': 'card mb-2'})
    attributes_div = card_div.find('div', {'class': 'card-body'})
    data = attributes_div.find_all('dd')
    start = False
    i = 3
    for datum in data:
        category = datum.find('span', {'class': ['tooltip', 'multiline']})
        if start or 'Speed' in category.text:
            start = True
            category = 'Dribbling' if category is None else category.text
            vals = datum.find('span', {'class': 'float-right'}).text
            if category != 'Positioning' and category != 'Defender Line':
                vals = vals.split()
            else:
                vals = [vals]
            for val in vals:
                df_row.loc[0, column_headers[i]] = val
                i += 1 
    return df_row


def get_all_team_attributes(team_urls):
    team_attrs = pd.DataFrame(columns=column_headers)
    for team_url in team_urls:
        team_soup = soup_maker(team_url)
        df_row = get_team_attributes(team_soup)
        team_attrs = team_attrs.append(df_row, ignore_index=True)
    return team_attrs

In [49]:
team_attributes = pd.DataFrame(columns=column_headers)
years = [2018, 2019]
for year in years:
    if year == 2018:
        year_extension = '?lg%5B0%5D=13&v=18&e=158865&set=true' # BPL Fifa 18, Sept 28 2017
        date = '2018-09-28 00:00:00'
    elif year == 2019:
        year_extension = '?lg%5B0%5D=13&v=19&e=159229&set=true'  # BPL Fifa 19, Sept 27 2018
        date = '2019-07-28 00:00:00'
        
    teams_url = base_url + '/teams' + year_extension
    teams_soup = soup_maker(teams_url)
    team_urls = find_team_links(teams_soup)
    df = get_all_team_attributes(team_urls)
    df['date'] = date
    team_attributes = team_attributes.append(df, ignore_index=True)

In [50]:
desired_cols = ['home_team_goal', 'away_team_goal', 'home_team_api_id', 'away_team_api_id',
                  'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD',
                  'LBA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'SJH', 'SJD', 'SJA', 'VCH', 'VCD',
                  'VCA', 'GBH', 'GBD', 'GBA', 'BSH', 'BSD', 'BSA']

def clean_odds_data(odds_data):

    column_mappings = {'FTHG': 'home_team_goal', 'FTAG': 'away_team_goal'}
    odds_data = odds_data.rename(columns=column_mappings)

    # Hacky way to find ID in map from abbreviated name
    def get_api_id(team_name):
        for name, api_id in team_api_ids.items():
            all_found = True
            for piece in team_name.split():
                if piece == 'Wolves':
                    piece = 'Wolverhampton'
                if piece not in name:
                    all_found = False
            if all_found:
                return name

    for i in range(len(odds_data)):
        home_api_id = get_api_id(odds_data.at[i, 'HomeTeam'])
        away_api_id = get_api_id(odds_data.at[i, 'AwayTeam'])
        odds_data.loc[i, 'away_team_api_id'] = str(team_api_ids[home_api_id])
        odds_data.loc[i, 'home_team_api_id'] = str(team_api_ids[away_api_id])
        
    missing_cols = [col for col in desired_cols if col not in odds_data.columns]
    for col in missing_cols:
        odds_data[col] = np.NaN
    odds_data = odds_data[desired_cols]
    return odds_data

In [51]:
odds_17_18 = pd.read_csv('17-18_odds.csv')
odds_18_19 = pd.read_csv('18-19_odds.csv')

odds_17_18 = clean_odds_data(odds_17_18)
odds_18_19 = clean_odds_data(odds_18_19)

In [52]:
def combine_team_and_match_data(matches):
    ### Add blank columns for team attributes to be filled in for each match
    for column in list(team_attributes.columns.values):
        matches['__home_' + column] = np.nan

    for column in list(team_attributes.columns.values):
        matches['__away_' + column] = np.nan

    ### To assist in filling values later (note the underscores leading __underscoes added above & used here 
    ### so we don't collide with existing column names)
    home_column_indexes = [matches.columns.get_loc('__home_' + col_name) for col_name in team_attributes.columns.values]
    away_column_indexes = [matches.columns.get_loc('__away_' + col_name) for col_name in team_attributes.columns.values]

    for index, match in matches.iterrows():
        ### For each match, we find the home and away team, and add their data to the dataframe
        home_team_id = match['home_team_api_id']
        away_team_id = match['away_team_api_id']
        home_team_atts = team_attributes.loc[team_attributes['team_api_id'] == home_team_id]
        away_team_atts = team_attributes.loc[team_attributes['team_api_id'] == away_team_id]    

        matches.iloc[index, home_column_indexes] = home_team_atts.values[0]
        matches.iloc[index, away_column_indexes] = away_team_atts.values[0]
    return matches

In [53]:
combined_odds_17_18 = combine_team_and_match_data(odds_17_18)
combined_odds_18_19 = combine_team_and_match_data(odds_18_19)
combined_odds = combined_odds_17_18.append(combined_odds_18_19, ignore_index=True)

In [54]:
#Enumerate the columns if they have string values
newCol = {}
for col in combined_odds.columns.values:
    if re.search('Class', col):
            enum_dict = { k: v for v, k in dict(enumerate(list(set(combined_odds[col])))).items()}
            newCol[col] = combined_odds[col].map(enum_dict)
            
for colName in newCol.keys():
    combined_odds[colName] = newCol[colName]

In [55]:
#fill in missing data with na with -1
#CHANGE LATER TO BE MORE ROBUST
matches = combined_odds[['B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'SJH', 'SJD', 'SJA', 'VCH', 'VCD', 'VCA', 'GBH', 'GBD', 'GBA', 'BSH', 'BSD', 'BSA', '__home_team_fifa_api_id', '__home_buildUpPlaySpeed', '__home_buildUpPlaySpeedClass', '__home_buildUpPlayDribbling', '__home_buildUpPlayDribblingClass', '__home_buildUpPlayPassing', '__home_buildUpPlayPassingClass', '__home_buildUpPlayPositioningClass', '__home_chanceCreationPassing', '__home_chanceCreationPassingClass', '__home_chanceCreationCrossing', '__home_chanceCreationCrossingClass', '__home_chanceCreationShooting', '__home_chanceCreationShootingClass', '__home_chanceCreationPositioningClass', '__home_defencePressure', '__home_defencePressureClass', '__home_defenceAggression', '__home_defenceAggressionClass', '__home_defenceTeamWidth', '__home_defenceTeamWidthClass', '__home_defenceDefenderLineClass', '__away_team_fifa_api_id', '__away_buildUpPlaySpeed', '__away_buildUpPlaySpeedClass', '__away_buildUpPlayDribbling', '__away_buildUpPlayDribblingClass', '__away_buildUpPlayPassing', '__away_buildUpPlayPassingClass', '__away_buildUpPlayPositioningClass', '__away_chanceCreationPassing', '__away_chanceCreationPassingClass', '__away_chanceCreationCrossing', '__away_chanceCreationCrossingClass', '__away_chanceCreationShooting', '__away_chanceCreationShootingClass', '__away_chanceCreationPositioningClass', '__away_defencePressure', '__away_defencePressureClass', '__away_defenceAggression', '__away_defenceAggressionClass', '__away_defenceTeamWidth', '__away_defenceTeamWidthClass', '__away_defenceDefenderLineClass']]
matches = matches.fillna(-1)
matches.to_csv('recent_seasons_unnormalized.csv')

In [56]:
cols_to_normalize = ['__home_buildUpPlaySpeed', '__home_buildUpPlayDribbling',
                     '__home_buildUpPlayPassing',
                     '__home_chanceCreationPassing', '__home_chanceCreationCrossing',
                     '__home_chanceCreationShooting', '__home_defencePressure',
                     '__home_defenceAggression', '__home_defenceTeamWidth',
                     '__away_buildUpPlaySpeed', 
                     '__away_buildUpPlayDribbling', '__away_buildUpPlayPassing',
                     '__away_chanceCreationPassing', '__away_chanceCreationCrossing',
                     '__away_chanceCreationShooting', '__away_defencePressure',
                     '__away_defenceAggression', '__away_defenceTeamWidth']

normed_matches = matches

# Normalize columns
# ###################
for column in cols_to_normalize:
    
    vals = matches[[column]].values.astype(float)
    
    # Create a minimum and maximum processor object
    min_max_scaler = preprocessing.MinMaxScaler()

    # Create an object to transform the data to fit minmax processor
    vals_scaled = min_max_scaler.fit_transform(vals)

    # Run the normalizer on the dataframe
    normed_matches[column] = vals_scaled
    
normed_matches.to_csv('recent_seasons_normalized.csv')
normed_matches

Unnamed: 0,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,...,__away_chanceCreationShooting,__away_chanceCreationShootingClass,__away_chanceCreationPositioningClass,__away_defencePressure,__away_defencePressureClass,__away_defenceAggression,__away_defenceAggressionClass,__away_defenceTeamWidth,__away_defenceTeamWidthClass,__away_defenceDefenderLineClass
0,1.53,4.50,6.50,1.50,4.60,6.75,1.47,4.50,6.50,1.44,...,0.277778,2,0,0.82,1,0.600,1,0.000000,1,0
1,11.00,5.50,1.33,11.00,5.25,1.30,8.00,5.30,1.35,10.00,...,0.444444,2,1,0.36,0,0.200,1,0.411765,2,0
2,1.25,6.50,15.00,1.22,6.50,12.50,1.22,6.20,13.50,1.25,...,0.277778,2,1,0.82,1,0.550,1,0.529412,2,0
3,1.83,3.60,5.00,1.80,3.50,4.75,1.85,3.50,4.30,1.80,...,0.972222,2,1,0.96,1,0.875,1,0.617647,2,0
4,1.70,3.80,5.75,1.70,3.60,5.50,1.70,3.70,5.00,1.67,...,0.277778,2,1,0.82,1,0.550,1,0.529412,2,0
5,1.62,4.00,6.50,1.57,4.00,6.00,1.65,3.80,5.30,1.60,...,0.222222,2,1,0.84,1,0.750,1,0.176471,2,0
6,6.00,4.20,1.62,6.00,4.20,1.55,5.50,4.00,1.60,5.80,...,0.750000,2,1,0.24,0,0.625,1,0.352941,2,0
7,2.40,3.30,3.30,2.40,3.20,3.10,2.30,3.30,3.15,2.40,...,0.944444,2,1,0.00,2,0.000,2,0.000000,1,0
8,1.30,5.75,12.00,1.28,5.50,11.00,1.33,5.30,8.70,1.33,...,0.000000,1,0,0.80,0,0.575,1,0.558824,2,0
9,5.50,4.00,1.70,5.25,3.80,1.67,4.80,3.60,1.75,5.00,...,0.277778,2,1,0.40,0,0.675,1,0.235294,2,0


In [57]:
index = range(0, combined_odds.shape[0]) # number rows
columns = ['Home', 'Draw', 'Away']
new_to_remove = []
labels =  pd.DataFrame(index=index, columns=columns)

for index, match in combined_odds.iterrows():
    if index in combined_odds.index.values:
        if int(match['home_team_goal']) > int(match['away_team_goal']):
            labels.at[index, 'Home'] = 1
        elif int(match['home_team_goal']) == int(match['away_team_goal']):
            labels.at[index, 'Draw'] = 1
        else:
            labels.at[index, 'Away'] = 1
    else:
        new_to_remove.append(index)
        
labels = labels.drop(new_to_remove, axis=0)
labels = labels.fillna(0)
print(labels, labels.shape[0] == combined_odds.shape[0])
assert(labels.index.values.all() == matches.index.values.all())
labels.to_csv('recent_seasons_labels.csv')

     Home  Draw  Away
0       1     0     0
1       0     0     1
2       0     0     1
3       0     0     1
4       1     0     0
5       0     1     0
6       0     1     0
7       1     0     0
8       1     0     0
9       0     0     1
10      0     0     1
11      0     0     1
12      1     0     0
13      1     0     0
14      1     0     0
15      1     0     0
16      0     0     1
17      1     0     0
18      0     0     1
19      0     1     0
20      0     0     1
21      0     0     1
22      0     1     0
23      1     0     0
24      1     0     0
25      0     1     0
26      1     0     0
27      1     0     0
28      0     1     0
29      0     1     0
..    ...   ...   ...
480     0     1     0
481     0     0     1
482     0     0     1
483     1     0     0
484     1     0     0
485     1     0     0
486     0     0     1
487     1     0     0
488     1     0     0
489     1     0     0
490     1     0     0
491     0     0     1
492     0     1     0
493     0 