This code carries out analysis on the two dataframes we have scraped.

In [1]:
import pandas as pd
import numpy as np
from unidecode import unidecode

In [2]:
#Read the two scraped tables from disk: player ratings first, then match lineups/results
players, matches = (pd.read_csv(csv_path) for csv_path in ('ratings.csv', 'matches.csv'))

In [3]:
#Preview the first ten rows of the scraped match table
matches.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,match_id,home_team,away_team,home_score,away_score,home_formation,away_formation
0,Nick Pope,Ben Mee,Matthew Lowton,James Tarkowski,Erik Pieters,Robbie Brady,Jóhann Gudmundsson,Josh Brownhill,Ashley Westwood,Ashley Barnes,...,Daniel James,Scott McTominay,Mason Greenwood,58896,Burnley,Manchester United,0,1,4-4-2,4-2-3-1
1,Vicente Guaita,Scott Dann,Cheikhou Kouyaté,Joel Ward,Tyrick Mitchell,Andros Townsend,James McCarthy,James McArthur,Jeffrey Schlupp,Wilfried Zaha,...,Michael Obafemi,Buffer,Buffer,58897,Crystal Palace,Southampton,1,0,4-4-2,4-4-2
2,Marek Rodák,Joe Bryan,Denis Odoi,Michael Hector,Tim Ream,Josh Onomah,Ivan Cavaleiro,Neeskens Kebano,Harrison Reed,Tom Cairney,...,Eddie Nketiah,Buffer,Buffer,58898,Fulham,Arsenal,0,3,4-2-3-1,3-4-3
3,Alisson,Joseph Gomez,Trent Alexander-Arnold,Virgil van Dijk,Andrew Robertson,Jordan Henderson,Naby Keïta,Georginio Wijnaldum,Sadio Mané,Mohamed Salah,...,Ian Poveda-Ocampo,Buffer,Buffer,58899,Liverpool,Leeds United,4,3,4-3-3,4-1-4-1
4,Ederson,João Cancelo,John Stones,Rúben Dias,Kyle Walker,Kevin De Bruyne,Rodri,Ilkay Gündogan,Bernardo Silva,Phil Foden,...,Jacob Ramsey,Anwar El Ghazi,Keinan Davis,58900,Manchester City,Aston Villa,2,0,4-3-3,4-2-3-1
5,Hugo Lloris,Ben Davies,Matt Doherty,Toby Alderweireld,Eric Dier,Son Heung-Min,Lucas Moura,Dele Alli,Pierre-Emile Højbjerg,Harry Winks,...,Theo Walcott,Buffer,Buffer,58901,Tottenham Hotspur,Everton,0,1,4-2-3-1,4-3-3
6,Sam Johnstone,Kieran Gibbs,Dara O'Shea,Semi Ajayi,Darnell Furlong,Kyle Bartley,Romaine Sawyers,Jake Livermore,Matheus Pereira,Grady Diangana,...,Kelechi Iheanacho,Buffer,Buffer,58902,West Bromwich Albion,Leicester City,0,3,5-4-1,4-1-4-1
7,Lukasz Fabianski,Aaron Cresswell,Issa Diop,Angelo Ogbonna,Ryan Fredericks,Pablo Fornals,Jarrod Bowen,Mark Noble,Declan Rice,Tomas Soucek,...,Joelinton,Buffer,Buffer,58903,West Ham United,Newcastle United,0,2,4-2-3-1,4-4-2
8,Mat Ryan,Adam Webster,Lewis Dunk,Benjamin White,Adam Lallana,Tariq Lamptey,Solly March,Steven Alzate,Yves Bissouma,Leandro Trossard,...,Olivier Giroud,Buffer,Buffer,58904,Brighton and Hove Albion,Chelsea,1,3,3-5-2,4-2-2-2
9,Aaron Ramsdale,John Egan,Jack O'Connell,Chris Basham,John Fleck,John Lundstram,George Baldock,Oliver Norwood,Enda Stevens,Oliver McBurnie,...,Fábio Silva,Buffer,Buffer,58905,Sheffield United,Wolverhampton Wanderers,0,2,3-5-2,3-4-3


In [4]:
#Sanity checks
#print(matches['home_team'].value_counts())
#print(matches['away_team'].value_counts())
#print(players['Club'].value_counts())

We first get a feel for how the match variables are distributed.

In [5]:
#matches['home_formation'].value_counts()

In [6]:
#matches['away_formation'].value_counts()

In [7]:
#Missing values anywhere in the frame become 'other'; formations seen fewer than
#10 times on a given side are then folded into the same 'other' bucket.
matches = matches.fillna('other')
for formation_col in ('home_formation', 'away_formation'):
    formation_counts = matches[formation_col].value_counts()
    #Look up, row by row, how often that row's formation occurs in the column
    is_rare = formation_counts[matches[formation_col]].values < 10
    matches.loc[is_rare, formation_col] = "other"

We need a way of storing the starting and substitute lineups. 
We first try only using numerical ratings, assuming that the positional data is encoded by the formation.

In [8]:
def _lookup_rating(candidate, club, df):
    """Return the rating for an exact ``Name`` match, or ``None`` if unresolved.

    A single row whose ``Name`` equals ``candidate`` wins outright. When several
    rows share that name, the rows are narrowed by ``Club``; if that does not
    leave exactly one row, the ambiguous rows are printed together with the
    club and ``None`` is returned so the caller can try another spelling.
    """
    hits = df[df['Name'] == candidate]
    if hits.shape[0] == 0:
        return None
    if hits.shape[0] == 1:
        return hits['Rating'].values[0]
    #More than one player with this name: disambiguate by club
    same_club = hits[hits['Club'] == club]
    if same_club.shape[0] == 1:
        return same_club['Rating'].values[0]
    print(hits, club)
    return None


def get_rating(name, club, df):
    """Look up a player's rating, trying progressively looser name spellings.

    The spellings are tried in this order, and the first one that resolves to
    a single row of ``df`` supplies the result:

    1. the full name as given (with whitespace normalised),
    2. first initial plus the remaining words, e.g. ``"V. van Dijk"``,
    3. everything after the first word, with its first letter upper-cased,
    4. the first word on its own.

    Steps 2-4 only apply to names made of at least two words.

    Parameters
    ----------
    name : str
        Player name as it appears in the lineup.
    club : str
        Club used to break ties when several rows share a name.
    df : pandas.DataFrame
        Ratings table with ``Name``, ``Club`` and ``Rating`` columns.

    Returns
    -------
    The matching ``Rating`` value, or ``0`` when no spelling resolves to a
    single row. ``0`` is only a "not found" marker; this function does not
    substitute a mean itself.
    """
    #Split on any run of whitespace so that a stray leading/trailing/double
    #space cannot produce empty words (which used to raise IndexError below).
    words = name.split()
    name = " ".join(words)

    candidates = [name]
    if len(words) >= 2:
        rest = " ".join(words[1:])
        #First-name initial, e.g. "Virgil van Dijk" -> "V. van Dijk"
        candidates.append(words[0][0] + '. ' + rest)
        #Last name only (Spanish players?)
        candidates.append(rest[0].upper() + rest[1:])
        #First name only
        candidates.append(words[0])

    for candidate in candidates:
        rating = _lookup_rating(candidate, club, df)
        if rating is not None:
            return rating

    #Nothing matched: signal "unknown" with 0
    return 0

Before modelling, we convert the lineups into a friendlier format: a matrix of player ratings.

In [9]:
#One row per match, one column per lineup slot: slots 0-19 are looked up against
#the home club, slots 20-39 against the away club.
n_matches = matches.shape[0]
ratings = np.zeros((n_matches, 40))
for match_idx in range(n_matches):
    for slot in range(40):
        side = 'home_team' if slot < 20 else 'away_team'
        club = matches.loc[match_idx, side]
        #Just one player left
        if club == 'Brighton and Hove Albion':
            club = "Brighton & Hove Albion"
        #Strip accents before the lookup
        player_name = unidecode(matches.loc[match_idx, str(slot)])
        ratings[match_idx, slot] = get_rating(player_name, club, players)

In [10]:
#Problem 1: players arriving in January (solved by importing ratings from a later version)
#Problem 2: Non-english characters in name (solved by unidecode)

In [11]:
#print(ratings[2,])

In [12]:
#print(matches.loc[2, :])

In [13]:
#Count the lineup slots that are still unmatched (rating 0); few enough to be acceptable.
print((ratings == 0).sum())

996


In [14]:
#Replace every unmatched (zero) rating with the mean of the matched ratings of the same match
is_missing = ratings == 0
matched_per_match = (~is_missing).sum(axis = 1)
#Zeros add nothing to the row sum, so sum / matched count is the mean of the matched slots
means = (ratings.sum(axis = 1) / matched_per_match).reshape(-1, 1)
replaced_ratings = ratings + is_missing * means
#Name the columns '0'..'39' to line up with the lineup columns of the match table
columns = [str(slot) for slot in range(40)]
clean_df = pd.DataFrame(replaced_ratings, columns = columns)
clean_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,82.0,82.0,75.0,80.0,74.0,75.0,73.0,72.0,78.0,77.0,...,84.0,80.0,73.0,84.0,81.0,79.0,83.0,77.0,79.0,77.0
1,80.0,73.0,77.0,76.0,54.0,77.0,76.0,75.0,74.0,66.0,...,80.0,76.0,77.0,62.0,71.0,61.0,82.0,69.0,73.638889,73.638889
2,74.0,83.0,71.0,72.0,70.0,67.0,75.0,56.0,73.0,76.0,...,87.0,71.0,78.0,75.0,80.0,71.0,80.0,71.0,75.514286,75.514286
3,90.0,83.0,87.0,90.0,65.0,86.0,81.0,85.0,90.0,81.0,...,74.0,74.0,57.0,82.0,73.0,80.0,70.0,76.828571,76.828571,76.828571
4,88.0,90.0,80.0,82.0,70.0,82.0,81.0,71.0,87.0,79.0,...,82.0,78.0,56.0,64.0,76.0,77.076923,75.0,55.0,71.0,68.0


Now that we've got the rating data, we merge it with the team, formation and score to get the dataframe we do processing on.

In [15]:
#Assemble the modelling frame: teams and formations, then the 40 rating columns, then the scores
team_formation = matches.loc[:, ['home_team', 'away_team', 'home_formation', 'away_formation']]
scores = matches.loc[:, ['home_score', 'away_score']]
df = pd.concat((team_formation, clean_df, scores), axis = 'columns')

We carry out normalisation and one-hot encoding.

In [25]:
#Split into train and validation (70/30, fixed seed so the split is repeatable)
df_train = df.sample(frac=0.7, random_state=0)
df_valid = df.drop(df_train.index)

#Scale
#NOTE: every non-object column is standardised here, which includes home_score
#and away_score - the targets are therefore in standardised units as well.
#The scaler is fitted on the training rows only and reused for validation.
from sklearn.preprocessing import StandardScaler
df_train_num = df_train.select_dtypes(exclude=['object'])
df_valid_num = df_valid.select_dtypes(exclude=['object'])
num_columns = df_train_num.columns
scaler = StandardScaler()
train_num_scaled = scaler.fit_transform(df_train_num)
valid_num_scaled = scaler.transform(df_valid_num)
df_train_num2 = pd.DataFrame(train_num_scaled, columns = num_columns)
df_valid_num2 = pd.DataFrame(valid_num_scaled, columns = num_columns)

#One-hot
#Fit the encoder once on the training rows and reuse both the encoded array and
#the generated column names.
from sklearn.preprocessing import OneHotEncoder
df_train_cat = df_train.select_dtypes(['object'])
df_valid_cat = df_valid.select_dtypes(['object'])
one_hot = OneHotEncoder(sparse = False)
train_cat_encoded = one_hot.fit_transform(df_train_cat)
print(train_cat_encoded.shape)
df_train_cat2 = pd.DataFrame(train_cat_encoded, columns = one_hot.get_feature_names())
df_valid_cat2 = pd.DataFrame(one_hot.transform(df_valid_cat), columns = one_hot.get_feature_names())

#Put the dfs together
#Both halves were rebuilt from plain arrays, so they share a fresh 0..n-1 index
#and the column-wise concat lines the rows up.
df_train2 = pd.concat([df_train_num2, df_train_cat2], axis = 1)
df_valid2 = pd.concat([df_valid_num2, df_valid_cat2], axis = 1)

(266, 62)


In [51]:
#Separate the two score targets from the features; everything is cast to float32 for PyTorch
target_cols = ['home_score', 'away_score']
X_train = df_train2.drop(columns = target_cols).astype(np.float32)
X_valid = df_valid2.drop(columns = target_cols).astype(np.float32)
y_train = df_train2[target_cols].astype(np.float32)
y_valid = df_valid2[target_cols].astype(np.float32)

We use PyTorch to train a neural network model. First convert the dataset into a torch tensor format.

In [37]:
import torch

In [44]:
from torch.utils.data import Dataset, DataLoader
class FootballDataset(Dataset):
    """Map-style dataset pairing each feature row with its score row.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Feature rows, one per match.
    y_df : pandas.DataFrame
        Target rows, in the same order as ``x_df``.

    Items are addressed by position (0 .. len - 1), which is what a
    ``DataLoader`` sampler hands to ``__getitem__``; the frames' index labels
    are deliberately ignored.
    """

    def __init__(self, x_df, y_df):
        self.x_df = x_df
        self.y_df = y_df

    def __len__(self):
        """Number of rows in the feature frame."""
        return self.x_df.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, scores)`` tensors for the row at position ``idx``."""
        #.iloc (positional) instead of .loc (label-based): with .loc a frame
        #whose index is not exactly 0..n-1 raises KeyError or yields the wrong row.
        info = torch.as_tensor(self.x_df.iloc[idx, :].values)
        scores = torch.as_tensor(self.y_df.iloc[idx, :].values)
        return info, scores

In [52]:
#Wrap the train/validation frames as datasets and batch them four rows at a time
train_dataset = FootballDataset(X_train, y_train)
valid_dataset = FootballDataset(X_valid, y_valid)
train_dataloader = DataLoader(train_dataset, batch_size = 4, shuffle = True)
#Validation is not shuffled: valid_loop averages per-batch mean losses, so when
#the last batch is smaller than the rest a shuffled order would make the
#reported validation loss change from one pass to the next for the same model.
valid_dataloader = DataLoader(valid_dataset, batch_size = 4, shuffle = False)

Now we create a simple neural network model.

In [65]:
from torch import nn
class NeuralNetwork(nn.Module):
    """Fully connected network: two hidden ReLU layers followed by a linear output.

    Parameters
    ----------
    in_features : int, default 102
        Width of one input row. The default is the width this notebook was run
        with (40 rating columns plus the 62 one-hot columns reported by the
        preprocessing cell's output); pass the real feature count if the
        encoding produces a different number of columns.
    hidden_features : int, default 512
        Width of both hidden layers.
    out_features : int, default 2
        Number of outputs (home score and away score).
    """

    def __init__(self, in_features=102, hidden_features=512, out_features=2):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Flatten each sample to one dimension and return the raw network outputs."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

#Create the network instance that the training cells below optimise
model = NeuralNetwork()

In [66]:
#Hyperparameters
learning_rate = 1e-3  #step size passed to the SGD optimizer
batch_size = 4  #NOTE(review): the DataLoader cell hard-codes batch_size = 4 and does not read this value
epochs = 100  #number of train + validation passes in the training cell

In [67]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Parameters
    ----------
    dataloader : iterable of ``(X, y)`` batches exposing ``.dataset`` and ``len()``
    model : torch.nn.Module being trained (updated in place)
    loss_fn : callable mapping ``(pred, y)`` to a scalar loss tensor
    optimizer : optimizer over ``model``'s parameters

    Returns
    -------
    float
        Average of the per-batch losses, as a plain Python number so it can be
        stored or plotted without holding on to autograd state.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    total_loss = 0.0
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the detached value: summing the loss tensors themselves
        # keeps every batch's autograd history alive and returns a tensor.
        batch_loss = loss.item()
        total_loss += batch_loss
        if batch % 100 == 0:
            current = batch * len(X)
            print(f"loss: {batch_loss:>7f}  [{current:>5d}/{size:>5d}]")
    return total_loss / num_batches

def valid_loop(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and return the mean batch loss.

    The model is switched to evaluation mode for the duration of the pass and
    put back into whatever mode it was in afterwards; no gradients are
    recorded. The average loss is also printed.

    Parameters
    ----------
    dataloader : iterable of ``(X, y)`` batches supporting ``len()``
    model : torch.nn.Module to evaluate
    loss_fn : callable mapping ``(pred, y)`` to a scalar loss tensor

    Returns
    -------
    float
        Average of the per-batch losses.
    """
    num_batches = len(dataloader)
    valid_loss = 0

    # Remember the current mode so evaluation does not leave the model in
    # eval mode for a training pass that follows.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X, y in dataloader:
                pred = model(X)
                valid_loss += loss_fn(pred, y).item()
    finally:
        model.train(was_training)

    valid_loss /= num_batches
    print(f"Test Error: \n Avg loss: {valid_loss:>8f} \n")
    return valid_loss

In [68]:
#Optimise mean-squared error with plain SGD, recording both losses after every epoch
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
train_loss, valid_loss = [], []
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    epoch_train_loss = train_loop(train_dataloader, model, loss_fn, optimizer)
    train_loss.append(epoch_train_loss)
    epoch_valid_loss = valid_loop(valid_dataloader, model, loss_fn)
    valid_loss.append(epoch_valid_loss)
print("Done!")

Epoch 1
-------------------------------
loss: 1.040676  [    0/  266]
Test Error: 
 Avg loss: 0.909515 

Epoch 2
-------------------------------
loss: 0.396998  [    0/  266]
Test Error: 
 Avg loss: 0.926630 

Epoch 3
-------------------------------
loss: 0.773565  [    0/  266]
Test Error: 
 Avg loss: 0.904246 

Epoch 4
-------------------------------
loss: 0.897111  [    0/  266]
Test Error: 
 Avg loss: 0.934733 

Epoch 5
-------------------------------
loss: 0.688482  [    0/  266]
Test Error: 
 Avg loss: 0.909725 

Epoch 6
-------------------------------
loss: 0.673862  [    0/  266]
Test Error: 
 Avg loss: 0.904208 

Epoch 7
-------------------------------
loss: 0.854714  [    0/  266]
Test Error: 
 Avg loss: 0.897071 

Epoch 8
-------------------------------
loss: 1.003249  [    0/  266]
Test Error: 
 Avg loss: 0.888345 

Epoch 9
-------------------------------
loss: 1.274527  [    0/  266]
Test Error: 
 Avg loss: 0.895979 

Epoch 10
-------------------------------
loss: 0.555019

Test Error: 
 Avg loss: 0.957564 

Epoch 79
-------------------------------
loss: 0.643794  [    0/  266]
Test Error: 
 Avg loss: 0.961512 

Epoch 80
-------------------------------
loss: 0.441374  [    0/  266]
Test Error: 
 Avg loss: 0.961888 

Epoch 81
-------------------------------
loss: 0.815199  [    0/  266]
Test Error: 
 Avg loss: 0.966960 

Epoch 82
-------------------------------
loss: 1.174533  [    0/  266]
Test Error: 
 Avg loss: 0.974239 

Epoch 83
-------------------------------
loss: 0.403986  [    0/  266]
Test Error: 
 Avg loss: 0.973675 

Epoch 84
-------------------------------
loss: 0.763809  [    0/  266]
Test Error: 
 Avg loss: 0.973196 

Epoch 85
-------------------------------
loss: 0.272159  [    0/  266]
Test Error: 
 Avg loss: 0.988143 

Epoch 86
-------------------------------
loss: 0.251273  [    0/  266]
Test Error: 
 Avg loss: 0.978097 

Epoch 87
-------------------------------
loss: 0.676591  [    0/  266]
Test Error: 
 Avg loss: 1.081561 

Epoch 88
--

In [62]:
import matplotlib.pyplot as plt
#The training cell appends one value per epoch, so this is the number of epochs recorded
print(len(train_loss))
#Loss-curve plot (train vs. validation), currently disabled:
#plt.Figure()
#plt.plot(train_loss)
#plt.plot(valid_loss)
#plt.show()

[array(0.05011411, dtype=float32)]
