## Pitch Predict: English Premier League Match Results Prediction

### Data Sources
*Click to follow links.*

Match Statistics: [matches.jsonl](https://www.kaggle.com/datasets/hugomathien/soccer/data) and [players.csv](https://www.kaggle.com/datasets/hugomathien/soccer/data)

Player Statistics: [All Files in player_statistics folder](https://www.kaggle.com/datasets/davidantonioteixeira/premier-league-player-statistics-1992-2022?resource=download)


### Training Data Construction


### Data Preprocessing


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('final_dataset.csv')
X = data.drop(['MatchID', 'Result', 'Home Goal Difference'], axis=1)
y = data['Result']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=11)

# The Results column has 3 classes: -1, 0, 1. We will convert them to 0, 1, 2
y_train = y_train.map({-1: 0, 0: 1, 1: 2})
y_test = y_test.map({-1: 0, 0: 1, 1: 2})

X_train_tensor = torch.FloatTensor(X_train)
X_test_tensor = torch.FloatTensor(X_test)
y_train_tensor = torch.LongTensor(y_train.values)
y_test_tensor = torch.LongTensor(y_test.values)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

### Exploratory Data Analysis

### Model Initialization

In [15]:
class MatchPredictor(nn.Module):
    def __init__(self):
        super(MatchPredictor, self).__init__()
        self.layer1 = nn.Linear(X_train.shape[1], 64)
        self.layer2 = nn.Linear(64, 32)
        self.out_layer = nn.Linear(32, 3)  # 3 classes for Home Win, Draw, Home Loss

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.out_layer(x)
        return x

model = MatchPredictor()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1000 + 1):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 1.1168439388275146
Epoch 100, Loss: 0.7335348129272461
Epoch 200, Loss: 0.13779860734939575
Epoch 300, Loss: 0.013172315433621407
Epoch 400, Loss: 0.004608034621924162
Epoch 500, Loss: 0.0023675141856074333
Epoch 600, Loss: 0.0014146986650303006
Epoch 700, Loss: 0.0009380851406604052
Epoch 800, Loss: 0.0006680914666503668
Epoch 900, Loss: 0.0005013535264879465
Epoch 1000, Loss: 0.0003896205744240433


### Model Validation and Selection

In [16]:
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Instructs PyTorch not to calculate gradients
    correct = 0
    total = 0
    predictions = []
    for inputs, labels in test_loader:  # Assuming you have a DataLoader for your test set
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        predictions.extend(predicted.tolist())

accuracy = correct / total
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 46.84%


In [17]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Assuming y_test_tensor is your true labels for the test set and predictions is the list from above
y_true = y_test_tensor.numpy()
conf_matrix = confusion_matrix(y_true, predictions)
print(conf_matrix)

[[55 23 43]
 [30 25 41]
 [34 31 98]]


In [18]:
report = classification_report(y_true, predictions, target_names=['Home Loss', 'Draw', 'Home Win'])
print(report)

              precision    recall  f1-score   support

   Home Loss       0.46      0.45      0.46       121
        Draw       0.32      0.26      0.29        96
    Home Win       0.54      0.60      0.57       163

    accuracy                           0.47       380
   macro avg       0.44      0.44      0.44       380
weighted avg       0.46      0.47      0.46       380

