# Sports Dataset

10,000 random photos from soccer matches captured at a single timestamp make up the dataset. Every snapshot has a number of elements that offer details on the current match. The objective of the dataset is to capture the evolving dynamics of a soccer game, and to make predictions for the number of goals that will be scored for the remaining playing time.

# Dataset Structure

The dataset is presented in a single file with 10,000 rows and 20 columns, each row representing a snapshot of a soccer match, and each column representing a specific attribute or feature of that snapshot.

Number of Instances (Rows): 10,000 instances 
Number of Features (Columns): 20 features/columns 

# Feature Representation

The features include identifiers such as uuid, and dynamic match-related details such as the current minute of play, home and away team scores, yellow and red cards for both teams, the number of attacks and dangerous attacks by each team, corners awarded, shots on and off target for both teams, and the ball possession percentage for both the home and away teams until the current minute.

# Libraries

- pandas
- numpy
- matplotlib
- pickle
- os
- sklearn.neighbors KNeighborsClassifier
- sklearn.metrics accuracy_score, classification_report, 
- sklearn.model_selection cross_val_predict, cross_val_score, train_test_split
- sklearn preprocessing 

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, random_split

In [2]:
pd.set_option("display.expand_frame_repr", True)
pd.set_option('display.colheader_justify','right')
pd.set_option('display.width', 100)
pd.set_option("display.max_colwidth", 30)
sports_df = pd.read_csv(filepath_or_buffer = 'sports.csv', header = 0)
pd.set_option("display.max_rows", sports_df.shape[0])
sports_df.sort_values(by = "current_minute", axis = 0)

Unnamed: 0,uuid,current_minute,home_score,away_score,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards,home_attacks,away_attacks,...,away_dangerous_attacks,home_corners,away_corners,home_off_target,away_off_target,home_on_target,away_on_target,home_possession,away_possession,final_delta
1071,f36ec793-d162-437c-acb1-e6...,-348,0,3,1,2,0,0,90,50,...,25,4,3,12,2,5,4,60,40,1
3360,6bbf7f66-3e0c-4f4f-a258-05...,-253,0,0,0,0,0,0,34,35,...,7,4,2,0,0,3,1,55,45,0
4989,6868d172-df91-4db2-835e-d1...,-253,0,0,0,0,0,0,34,35,...,7,4,2,0,0,3,1,55,45,0
5446,b08075cf-3c52-4703-818c-95...,-219,0,0,0,0,0,0,44,47,...,10,4,4,0,2,3,1,53,47,0
5410,0d3426bd-a197-4ca6-815a-c8...,-139,0,0,0,0,0,0,9,2,...,1,0,0,0,0,0,0,75,25,3
1980,e98853d5-db8f-4d93-882b-78...,-96,1,0,0,1,0,0,23,18,...,10,0,0,1,0,1,0,58,42,1
5760,79fa691c-284f-4921-88b3-59...,-12,0,0,0,0,0,0,46,56,...,35,5,2,3,5,0,1,45,55,2
4639,a1f36e6d-1b3e-4c76-87fd-6d...,-7,0,1,0,0,0,0,23,24,...,14,2,0,2,1,0,2,58,42,2
5734,2ec324f7-7776-4b50-96a3-99...,-2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,100,2
4863,5c5d6983-a830-4756-a210-96...,-1,0,0,0,0,0,0,5,2,...,1,0,0,0,0,0,0,50,50,1


# Data Preprocessing

In [3]:
sports_df.drop(['uuid'], axis=1).head()

Unnamed: 0,current_minute,home_score,away_score,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards,home_attacks,away_attacks,home_dangerous_attacks,away_dangerous_attacks,home_corners,away_corners,home_off_target,away_off_target,home_on_target,away_on_target,home_possession,away_possession,final_delta
0,94,1,2,0,0,0,0,39,45,29,26,4,1,4,0,4,6,48,52,1
1,91,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,50,50,1
2,3,0,0,0,0,0,0,3,1,1,0,0,0,0,0,0,0,62,38,4
3,17,0,0,1,0,0,0,41,46,11,25,1,0,0,2,0,0,55,45,2
4,76,1,0,1,0,0,0,69,78,21,46,1,1,0,3,4,4,45,55,0


In [4]:
sports_df['actual_match_time'] = sports_df['current_minute'].apply(lambda x: x - 15 if x > 15 else x)
sports_df = sports_df.dropna()

In [5]:
X = sports_df.drop(['actual_match_time'], axis=1)  # Features
y = sports_df['actual_match_time']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Exploratory Data Analysis

# Model Training

## Neural Network

In [6]:
# convert the data from the dataframe to pytorch tensors
X = sports_df.drop(['uuid'], axis=1)
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)

In [7]:
# split train-validation-test
dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.7 * len(dataset))
val_size = (len(dataset) - train_size) // 2
test_size = len(dataset) - train_size - val_size
train_data, val_data, test_data = random_split(dataset, [train_size, val_size, test_size])

In [8]:
batch_size = 32
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [9]:
# neural network
class RegressionNN(nn.Module):
    def __init__(self, input_size):
        super(RegressionNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [10]:
# instantiating the model, loss function, and optimizer
input_size = X.shape[1]
model = RegressionNN(input_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [11]:
# model training
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # validation
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for inputs, targets in val_loader:
            outputs = model(inputs)
            val_loss += criterion(outputs, targets)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Validation Loss: {val_loss/len(val_loader)}')

Epoch 1/50, Loss: 38.932029724121094, Validation Loss: 56.01861572265625
Epoch 2/50, Loss: 1.505285620689392, Validation Loss: 9.148215293884277
Epoch 3/50, Loss: 2.1072185039520264, Validation Loss: 5.790160655975342
Epoch 4/50, Loss: 10.10545825958252, Validation Loss: 4.571538925170898
Epoch 5/50, Loss: 2.7055606842041016, Validation Loss: 3.1504299640655518
Epoch 6/50, Loss: 0.3030664324760437, Validation Loss: 2.554990530014038
Epoch 7/50, Loss: 0.456539511680603, Validation Loss: 1.8199822902679443
Epoch 8/50, Loss: 0.17136618494987488, Validation Loss: 1.715239405632019
Epoch 9/50, Loss: 0.4104134142398834, Validation Loss: 1.3643138408660889
Epoch 10/50, Loss: 0.14978082478046417, Validation Loss: 1.172978162765503
Epoch 11/50, Loss: 0.24378885328769684, Validation Loss: 1.0843372344970703
Epoch 12/50, Loss: 0.7309632301330566, Validation Loss: 0.9967719316482544
Epoch 13/50, Loss: 1.452566146850586, Validation Loss: 0.839698314666748
Epoch 14/50, Loss: 1.8066715002059937, Vali

In [21]:
# evaluation (training set)
model.eval()
predictions = []
targets_list = []
with torch.no_grad():
    for inputs, targets in train_loader:
        outputs = model(inputs)
        predictions.extend(outputs.numpy())
        targets_list.extend(targets.numpy())

train_accuracy = accuracy_score(targets_list, np.round(predictions))
train_loss = 1-train_accuracy
print(f'Training Loss: {train_loss}')
print(f'Training Accuracy:{train_accuracy}')


Training Loss: 0.02897959183673471
Training Accuracy:0.9710204081632653


In [23]:
# evaluation (test set)
model.eval()
predictions = []
targets_list = []
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        predictions.extend(outputs.numpy())
        targets_list.extend(targets.numpy())
test_accuracy = accuracy_score(targets_list, np.round(predictions))
test_loss = 1 - test_accuracy
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

Test Loss: 0.033333333333333326
Test Accuracy: 0.9666666666666667


# Hyperparameter Tuning