In [20]:
import csv
import datetime
import glob

import pandas as pd
import torch as th
import torch.nn as nn

In [None]:
# Collect every Berrar-ratings CSV. glob returns paths in arbitrary,
# OS-dependent order, so sort them to make the row order reproducible.
file_pattern = 'berrar_ratings/data_recent_and_val_*.csv'
files = sorted(glob.glob(file_pattern))

if not files:
    # Fail early with a clear message; otherwise the column drop below would
    # raise an unrelated KeyError on an empty frame.
    raise FileNotFoundError(f"no CSV files match {file_pattern!r}")

frames = []
for file in files:
    df = pd.read_csv(file)
    # change the format of the Date column (day-first strings -> timestamps)
    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
    frames.append(df)

# Concatenate once: growing a DataFrame inside the loop re-copies all
# previously loaded rows on every iteration.
full_df = pd.concat(frames)

# drop the Sea, Lge, GD, WDL columns
full_df = full_df.drop(columns=['Sea', 'Lge', 'GD', 'WDL'])

In [16]:
# convert the teams to index
# Build the vocabulary from BOTH team columns: a team that only ever appears
# as the away side would otherwise be missing from the mapping and its 'AT'
# entries would silently become NaN.
teams = pd.concat([full_df['HT'], full_df['AT']]).unique()
teams.sort()

# create a dictionary to map the team names to integers (and back)
team_to_idx = {team: idx for idx, team in enumerate(teams)}
idx_to_team = {idx: team for team, idx in team_to_idx.items()}

# add the team index to the dataframe
full_df['HT'] = full_df['HT'].map(team_to_idx)
full_df['AT'] = full_df['AT'].map(team_to_idx)

In [17]:
 # sort the dataframe by date
full_df = full_df.sort_values('Date')

# matches played strictly before this day are used for training,
# matches on or after it are held out
split_date = datetime.datetime.strptime("14/04/2023", "%d/%m/%Y")

df_train = full_df.loc[full_df['Date'] < split_date]
df_val = full_df.loc[full_df['Date'] >= split_date]



In [22]:
# sanity check: show the first five rows of the training split
print(df_train.head(n=5))

        Date    HT    AT  HS  AS  HT_H_Off_Rating  HT_H_Def_Rating  \
2 2019-01-26    50   167   2   1              0.0              0.0   
0 2019-01-26   261   627   2   3              0.0              0.0   
1 2019-01-26   534   618   0   5              0.0              0.0   
3 2019-01-26   749   258   2   1              0.0              0.0   
7 2019-01-27  1049  1052   4   1              0.0              0.0   

   HT_A_Off_Rating  HT_A_Def_Rating  AT_H_Off_Rating  AT_H_Def_Rating  \
2              0.0              0.0              0.0              0.0   
0              0.0              0.0              0.0              0.0   
1              0.0              0.0              0.0              0.0   
3              0.0              0.0              0.0              0.0   
7              0.0              0.0              0.0              0.0   

   AT_A_Off_Rating  AT_A_Def_Rating     HT_EG     AT_EG  
2              0.0              0.0  1.239521  1.145053  
0              0.0      

In [30]:
# convert the dataframe into pytorch tensors

# Rating columns, in the order they appear in the model input.
RATING_COLUMNS = [
    'HT_H_Off_Rating', 'HT_H_Def_Rating',
    'HT_A_Off_Rating', 'HT_A_Def_Rating',
    'AT_H_Off_Rating', 'AT_H_Def_Rating',
    'AT_A_Off_Rating', 'AT_A_Def_Rating',
]


def _frame_to_tensors(frame):
    """Convert a match dataframe into model tensors.

    Args:
        frame: dataframe holding the 'HT', 'AT', 'HS', 'AS' columns and every
            column listed in ``RATING_COLUMNS``.

    Returns:
        A tuple ``(teams, ratings, data, labels)`` where
        ``teams`` is the int64 (rows, 2) tensor of home/away team indices,
        ``ratings`` is the float32 (rows, 8) tensor of rating columns,
        ``data`` is the float32 (rows, 10) concatenation ``[teams, ratings]``,
        ``labels`` is the float32 (rows, 2) tensor of home/away scores.
    """
    teams = th.tensor(frame[['HT', 'AT']].values, dtype=th.int64)
    ratings = th.tensor(frame[RATING_COLUMNS].values, dtype=th.float32)
    # Cast the indices explicitly instead of relying on th.cat's implicit
    # int64 -> float32 promotion; the result is float32 either way.
    data = th.cat([teams.to(th.float32), ratings], dim=1)
    labels = th.tensor(frame[['HS', 'AS']].values, dtype=th.float32)
    return teams, ratings, data, labels


(training_teams, training_ratings,
 training_data, training_labels) = _frame_to_tensors(df_train)

(testing_teams, testing_ratings,
 testing_data, testing_labels) = _frame_to_tensors(df_val)

# separate the training set into training and validation sets
# (first 80% of the rows train, remaining 20% validate; df_train row order is kept)
split = int(0.8 * training_data.shape[0])
train_data, val_data = training_data[:split], training_data[split:]
train_labels, val_labels = training_labels[:split], training_labels[split:]



0.6312885348943215


In [31]:
# training_expected_labels = th.tensor(df_train[['HT_EG', 'AT_EG']].values, dtype=th.float32)

# total = 0
# count = 0

# for i in range(training_labels.shape[0]):
#     home_score = training_labels[i][0]
#     away_score = training_labels[i][1]
#     home_expected_goals = training_expected_labels[i][0]
#     away_expected_goals = training_expected_labels[i][1]
#     if home_score > away_score:
#         total += 1
#         if home_expected_goals > away_expected_goals:
#             count += 1
#     elif home_score < away_score:
#         total += 1
#         if home_expected_goals < away_expected_goals:
#             count += 1
#     else:
#         pass

# print(count/total)

0.6312885348943215


In [None]:
# inception block
class InceptionBlock(nn.Module):
    """Bank of parallel 2-D convolutions whose outputs are averaged.

    ``num_kernel`` convolutions with kernel sizes 3, 5, ..., ``2 * num_kernel + 1``
    are applied to the same input. Each uses padding ``(kernel_size - 1) / 2`` so
    every branch keeps the input's spatial size, and the branch outputs are
    averaged element-wise.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by each branch and therefore
            by the block.
        num_kernel: number of parallel convolution branches.
        init_weights: when true, convolution weights are re-initialised with
            Kaiming-normal (fan-out) and biases are zeroed; when false, the
            PyTorch default initialisation is kept.

    Shape:
        input ``(N, in_channels, H, W)`` -> output ``(N, out_channels, H, W)``.
    """

    def __init__(self, in_channels, out_channels, num_kernel=4, init_weights=True):
        super(InceptionBlock, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_kernel = num_kernel
        self.kernels = nn.ModuleList([
            nn.Conv2d(in_channels, out_channels, kernel_size=2 * i + 1, padding=i)
            for i in range(1, num_kernel + 1)
        ])
        # not applied in forward(); kept so the attribute stays available
        self.relu = nn.ReLU()
        if init_weights:
            self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal init for every convolution weight, zeros for biases."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Run every branch on ``x`` and return the element-wise mean."""
        branch_outputs = [kernel(x) for kernel in self.kernels]
        # Stack on a NEW leading axis and average over it. Concatenating on the
        # channel axis and averaging there would collapse the channels and
        # drop ``out_channels`` from the result.
        return th.stack(branch_outputs, dim=0).mean(dim=0)

In [None]:
class LSTMBlock(nn.Module):
    """Thin wrapper around a batch-first ``nn.LSTM`` returning its output sequence.

    ``output_size`` is accepted for interface compatibility but is not used.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        # nn.LSTM returns (output, (h_n, c_n)); only the output is kept
        sequence_output = self.lstm(x)[0]

        # open question: do i want to use the last layer only (??)
        return sequence_output

In [None]:
# configuration of the model
from dataclasses import dataclass

@dataclass
class FootBallPredictionConfig:
    """Container for the settings of the football prediction model.

    No field declares a default, so all ten values must be supplied when the
    config is constructed (positionally in the order below, or by keyword).
    """
    # NOTE(review): presumably the number of distinct teams, i.e. the size of
    # the team-embedding table — confirm against the model
    num_teams: int
    # NOTE(review): presumably the number of non-team input columns — confirm
    num_features: int
    # NOTE(review): only referenced by the commented-out feedforward head in
    # the model cell; presumably the number of predicted targets — confirm
    num_labels: int
    # how many InceptionBlock instances the model is meant to stack
    num_inception_blocks: int
    # NOTE(review): not referenced by any visible code — presumably meant for
    # LSTMBlock's num_layers; confirm
    num_lstm_layers: int
    # input width of the model's final linear layer
    hidden_size: int
    # output width of the model's final linear layer
    output_size: int
    # NOTE(review): training-loop settings; no training loop is visible in
    # this notebook section, so their exact use cannot be confirmed here
    num_epochs: int
    learning_rate: float
    batch_size: int

In [None]:
# need an embedding layer for the teams before the feedforward network

class FootballPrediction(nn.Module):
    """Score predictor: team embeddings -> inception blocks -> linear head.

    Each input row is laid out as
    ``[home_team_idx, away_team_idx, feature_0, ..., feature_{num_features-1}]``.
    The two team indices are replaced by learned embeddings, the resulting
    vector is passed through ``num_inception_blocks`` InceptionBlocks and a
    two-layer head maps it to ``output_size`` values.

    Args:
        config: object exposing ``num_teams``, ``num_features``,
            ``num_inception_blocks``, ``hidden_size`` and ``output_size``
            (e.g. a ``FootBallPredictionConfig``).

    Notes:
        * Every InceptionBlock is expected to map ``(N, in_channels, H, W)`` to
          ``(N, out_channels, H, W)``.
        * NOTE(review): the flat feature vector is fed to the 2-D convolutions
          as a ``(N, C, 1, 1)`` map, i.e. without any sequence dimension. This
          is the minimal shape that makes the pipeline run; revisit once the
          sequential input format is decided.
    """

    EMBEDDING_DIM = 10       # width of each team embedding
    INCEPTION_CHANNELS = 64  # out_channels of every inception block

    def __init__(self, config):
        super(FootballPrediction, self).__init__()
        # Read hyper-parameters from ``config`` instead of undefined globals.
        self.config = config
        self.team_embedding = nn.Embedding(config.num_teams, self.EMBEDDING_DIM)

        # forward() concatenates the features with BOTH the home and the away
        # embedding, so the first block sees num_features + 2 * EMBEDDING_DIM
        # channels; later blocks consume the previous block's output channels.
        channels = config.num_features + 2 * self.EMBEDDING_DIM
        blocks = []
        for _ in range(config.num_inception_blocks):
            blocks.append(InceptionBlock(channels, self.INCEPTION_CHANNELS))
            channels = self.INCEPTION_CHANNELS
        # ModuleList (not a plain list) so the blocks' parameters are
        # registered: visible to optimizers, .to(device) and state_dict().
        self.inception_block = nn.ModuleList(blocks)

        # Head: project to hidden_size, ReLU, then the final output layer.
        self.hidden = nn.Linear(channels, config.hidden_size)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(config.hidden_size, config.output_size)

    def forward(self, x):
        """Predict from a ``(N, 2 + num_features)`` batch; returns ``(N, output_size)``."""
        # replace the team indexes with the embeddings
        home_team = self.team_embedding(x[:, 0].long())
        away_team = self.team_embedding(x[:, 1].long())
        x = th.cat([x[:, 2:], home_team, away_team], dim=1)

        # Conv2d needs (N, C, H, W): treat every feature as a channel of a
        # 1x1 map, run the inception blocks, then flatten back to (N, C).
        x = x[:, :, None, None]
        for block in self.inception_block:
            x = block(x)
        x = x.flatten(start_dim=1)

        return self.fc(self.relu(self.hidden(x)))

    def predict(self, x):
        """Alias for :meth:`forward`; the input has the same layout."""
        return self.forward(x)
    