In [16]:
import glob
import datetime
import csv

# read the data from the csv files
file_pattern = 'berrar_rating/trainset_22-23_exact_44_*.csv'
# glob.glob returns paths in arbitrary order; sort them so repeated runs build
# raw_data in the same order
files = sorted(glob.glob(file_pattern))

# dictionary to convert the WLD to an integer
wld_to_idx = {'W': 0, 'D': 1, 'L': 2}

# Rating and expected-goal columns, in the order they are stored in each row.
# They are looked up by header name instead of by position: the CSV header has a
# `Round` column between `WDL` and the first rating, so reading positions 10..19
# would load `Round` as the first rating and never reach `AT_EG`.
rating_columns = [
    'HT_H_Off_Rating', 'HT_H_Def_Rating', 'HT_A_Off_Rating', 'HT_A_Def_Rating',
    'AT_H_Off_Rating', 'AT_H_Def_Rating', 'AT_A_Off_Rating', 'AT_A_Def_Rating',
    'HT_EG', 'AT_EG',
]


def parse_match_row(row):
    """Convert one CSV record (a mapping keyed by header name) into a typed list.

    The returned list has 17 entries, in this order:
    date (datetime parsed from "dd/mm/YYYY"), home team, away team, home score,
    away score, goal difference, win/draw/loss index (see `wld_to_idx`), then the
    ten floats named in `rating_columns`.

    Raises:
        KeyError: if an expected column is missing or the WDL value is unknown.
        ValueError: if the date or a numeric field cannot be parsed.
    """
    return [
        # convert to a datetime object; the Date column has format "17/08/2019"
        datetime.datetime.strptime(row['Date'], '%d/%m/%Y'),
        row['HT'],
        row['AT'],
        int(row['HS']),
        int(row['AS']),
        int(row['GD']),
        wld_to_idx[row['WDL']],
        *(float(row[column]) for column in rating_columns),
    ]


raw_data = []

for file in files:
    # newline='' is the mode the csv module expects for files it reads
    with open(file, 'r', newline='') as f:
        # DictReader consumes the header line and keys every record by column name
        raw_data.extend(parse_match_row(row) for row in csv.DictReader(f))

print(f'Found {len(raw_data)} lines in {len(files)} files')


Found 49930 lines in 42 files


In [None]:
# Header has the following format
# index,Sea,Lge,Date,HT,AT,HS,AS,GD,WDL,Round,HT_H_Off_Rating,HT_H_Def_Rating,HT_A_Off_Rating,HT_A_Def_Rating,AT_H_Off_Rating,AT_H_Def_Rating,AT_A_Off_Rating,AT_A_Def_Rating,HT_EG,AT_EG
# where:

# HT_H_Off_Rating - Home team home offensive rating
# HT_H_Def_Rating - Home team home defensive rating
# HT_A_Off_Rating - Home team away offensive rating
# HT_A_Def_Rating - Home team away defensive rating
# AT_H_Off_Rating - Away team home offensive rating
# AT_H_Def_Rating - Away team home defensive rating
# AT_A_Off_Rating - Away team away offensive rating
# AT_A_Def_Rating - Away team away defensive rating
# AT_EG - Away team expected goals
# HT_EG - Home team expected goals

In [17]:
import numpy as np
import pandas as pd
import torch as th
import torch.nn as nn

In [15]:
# sort by date (list.sort is stable, so same-day matches keep their load order)
raw_data.sort(key=lambda x: x[0])

# change the teams to indexes
teams = set([x[1] for x in raw_data] + [x[2] for x in raw_data])
# Enumerate the names in sorted order: iterating the set directly would give an
# order that can change between interpreter runs (string hashing is randomised),
# which would silently re-assign team indices after every kernel restart.
team_to_idx = {team: idx for idx, team in enumerate(sorted(teams))}
idx_to_team = {idx: team for team, idx in team_to_idx.items()}

print ("Number of teams: ", len(teams))

# Each raw_data row is laid out as:
#   [0] date, [1] home team, [2] away team, [3] home score, [4] away score,
#   [5] goal diff, [6] win/draw/loss index, [7:15] eight rating values, [15:17] expected goals
# features: home team index, away team index, then the eight values in row[7:15]
# labels:   home score, away score, goal diff, win/draw/loss index
feature_rows = [
    [team_to_idx[row[1]], team_to_idx[row[2]], *row[7:15]]
    for row in raw_data
]
label_rows = [row[3:7] for row in raw_data]

# Build each tensor in one call instead of one th.tensor() per match; the explicit
# reshape keeps the (n, 10) / (n, 4) shapes even when raw_data is empty.
features = th.tensor(feature_rows, dtype=th.float32).reshape(len(raw_data), 10)
labels = th.tensor(label_rows, dtype=th.float32).reshape(len(raw_data), 4)

# split the data into training, validation and test sets
# (chronological 70% / 15% / 15% split, since raw_data is sorted by date)
split_1 = int(0.7 * features.size(0))
split_2 = int(0.85 * features.size(0))

training_data = [features[:split_1], labels[:split_1]]
validation_data = [features[split_1:split_2], labels[split_1:split_2]]
test_data = [features[split_2:], labels[split_2:]]

print ("Training data size: ", training_data[0].size(0))
print ("Validation data size: ", validation_data[0].size(0))
print ("Test data size: ", test_data[0].size(0))




Number of teams:  907
Training data size:  34951
Validation data size:  7489
Test data size:  7490


In [None]:
# inception block
class InceptionBlock(nn.Module):
    """Run several 2-D convolutions of growing kernel size in parallel and merge them.

    Branch k (k = 1..num_kernel) is an ``nn.Conv2d(in_channels, out_channels)``
    with a (2k+1)x(2k+1) kernel and padding k, so with the default stride every
    branch keeps the input's spatial size.

    Args:
        in_channels: channel count of the input.
        out_channels: channel count produced by each convolution branch.
        num_kernel: number of parallel branches.
        init_weights: accepted but not used by this implementation.

    NOTE(review): ``forward`` averages over dim 1 after concatenating the branches
    on dim 1, so for a batched (N, C, H, W) input the result is (N, H, W) and
    ``out_channels`` does not appear in the output shape. Confirm this is intended.
    """

    def __init__(self, in_channels, out_channels, num_kernel=4, init_weights=True):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_kernel = num_kernel
        # one convolution per branch, created in order of increasing kernel size
        branches = [
            nn.Conv2d(in_channels, out_channels, kernel_size=2 * k + 1, padding=k)
            for k in range(1, num_kernel + 1)
        ]
        self.kernels = nn.ModuleList(branches)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply every branch to ``x``, concatenate on dim 1, average over dim 1, then ReLU."""
        branch_outputs = [self.kernels[idx](x) for idx in range(self.num_kernel)]
        merged = th.cat(branch_outputs, dim=1)
        averaged = merged.mean(dim=1)
        return self.relu(averaged)

In [None]:
# need an embedding layer for the teams before the feedforward network

class FootballPrediction(nn.Module):
    """Predict per-match targets from two team indices plus numeric features.

    Pipeline: embed the home and away team, concatenate both embeddings with the
    remaining numeric columns, optionally pass the result through a stack of
    ``InceptionBlock`` layers, then map it to ``num_labels`` outputs with a small
    feed-forward head.

    Args:
        num_teams: number of distinct teams (rows of the embedding table).
        num_features: number of numeric columns that follow the two team-index
            columns in the input, i.e. ``x.shape[1] - 2``.
            NOTE(review): this reading of ``num_features`` is an assumption;
            confirm against the intended call site.
        num_labels: number of values predicted per match.
        num_inception_blocks: how many ``InceptionBlock`` layers to apply
            (0 skips the convolutional stage entirely).
    """

    # size of each team embedding vector
    EMBEDDING_DIM = 10
    # hidden width of the head and channel count of each inception branch
    HIDDEN_DIM = 64

    def __init__(self, num_teams, num_features, num_labels, num_inception_blocks=1):
        super(FootballPrediction, self).__init__()
        self.team_embedding = nn.Embedding(num_teams, self.EMBEDDING_DIM)

        # Width of the vector built in forward(): the numeric columns plus TWO
        # embeddings (home and away), not one.
        combined_dim = num_features + 2 * self.EMBEDDING_DIM

        # inception blocks
        # Held in an nn.ModuleList so their parameters are registered with the
        # model (a plain Python list hides them from .parameters(), .to() and
        # state_dict()). Each block sees the feature vector as a 1-channel map,
        # hence in_channels=1.
        self.inception_block = nn.ModuleList(
            [InceptionBlock(1, self.HIDDEN_DIM) for _ in range(num_inception_blocks)]
        )

        # feed-forward head mapping the combined vector to the labels
        self.feedforward = nn.Sequential(
            nn.Linear(combined_dim, self.HIDDEN_DIM),
            nn.ReLU(),
            nn.Linear(self.HIDDEN_DIM, self.HIDDEN_DIM),
            nn.ReLU(),
            nn.Linear(self.HIDDEN_DIM, num_labels)
        )

    def forward(self, x):
        """Return predictions of shape (batch, num_labels).

        ``x`` is a 2-D tensor whose column 0 is the home-team index, column 1 the
        away-team index, and whose remaining ``num_features`` columns are numeric
        features.
        """
        # open question from the original author: should the berrar ratings be
        # computed here, or does the input already contain them?
        # replace the team indexes with the embeddings
        home_team = self.team_embedding(x[:, 0].long())
        away_team = self.team_embedding(x[:, 1].long())
        x = th.cat([x[:, 2:], home_team, away_team], dim=1)

        # pass through the inception blocks
        # InceptionBlock is built from nn.Conv2d, which needs a channel and two
        # spatial axes, and it averages its channel axis away again. So the
        # vector is presented as a (batch, 1, 1, width) map going in and
        # flattened back to (batch, width) coming out.
        # NOTE(review): treating the feature vector as a 1 x width map is one
        # workable layout, not a settled design decision; revisit once the
        # input shape is decided.
        batch_size, width = x.shape
        for block in self.inception_block:
            x = block(x.reshape(batch_size, 1, 1, width)).reshape(batch_size, width)

        return self.feedforward(x)
    

    def predict(self, x):
        """Alias for ``forward``; the expected input shape is the same."""
        return self.forward(x)
    