# Training and Testing Process of SNS CW

In [53]:
import numpy as np
import pandas as pd
import os
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import datasets, transforms
from model import *
import torch.optim as optim
import torch.utils.data as Data

## Hyperparameter Setting

In [54]:
# Learning rate given to the Adam optimizer inside train_new.
LR = 0.0001
# Mini-batch size used by the DataLoaders in train_new and test_new.
Batch_size = 64
# Value passed as hidden_dim when FootballMatchPredictor is constructed.
hid_dim = 32
# Number of training epochs run by train_new.
epochs = 50

## Training Data Preprocessing

In [55]:
# Load the training matches; the path is relative to the notebook's working directory.
dataframe = pd.read_csv("Datasets/train_set_en.csv")

In [56]:
# Render the training frame as the cell output.
# NOTE(review): consider dataframe.head() to keep the saved output small.
dataframe

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,...,ATGC_norm,HTGD_norm,ATGD_norm,DiffPts_norm,DiffFormPts_norm,HTP_norm,ATP_norm,FTR_OHE1,FTR_OHE2,FTR_OHE3
0,2009-08-15,Aston Villa,Wigan,0,2,0,0,1,A,11,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.0,0.0
1,2009-08-15,Blackburn,Man City,0,2,0,0,1,A,17,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.0,0.0
2,2009-08-15,Bolton,Sunderland,0,1,0,0,1,A,11,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.0,0.0
3,2009-08-15,Chelsea,Hull,2,1,2,1,1,D,26,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.0
4,2009-08-15,Everton,Arsenal,1,6,0,0,3,A,8,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2019-05-12,Leicester,Chelsea,0,0,1,0,0,D,9,...,0.481481,0.041667,0.347826,-0.338983,-0.076923,0.459459,0.639640,0.0,1.0,0.0
3796,2019-05-12,Liverpool,Wolves,2,0,2,1,0,H,13,...,0.543210,0.902778,0.043478,0.627119,0.384615,0.846847,0.513514,0.0,0.0,1.0
3797,2019-05-12,Man United,Cardiff,0,2,0,0,1,A,26,...,0.851852,0.180556,-0.536232,0.593220,0.153846,0.594595,0.279279,1.0,0.0,0.0
3798,2019-05-12,Southampton,Huddersfield,1,1,1,1,0,H,10,...,0.925926,-0.277778,-0.782609,0.389831,0.307692,0.342342,0.135135,0.0,1.0,0.0


In [57]:
# Names of the DataFrame columns used as model inputs. The list length is
# also used as input_dim when the model is built, so adding or removing a
# column here changes the network's input size.
feature_selected = ['HTGS_norm', 'ATGS_norm', 'HTGC_norm', 'ATGC_norm',
                        'HTP_norm', 'ATP_norm', 'HM1', 'AM1', 'HM2', 'AM2',
                        'HM3', 'AM3', 'HM4', 'AM4', 'HM5', 'AM5', 'DiffLP', 'HomeTeamRk', 'AwayTeamRk']



def create_dataset_new(data_in, features=None, label_columns=None):
    """Convert a match DataFrame into feature and label tensors.

    Args:
        data_in: DataFrame containing the feature columns and the one-hot
            encoded result columns.
        features: Column names to use as model inputs. When omitted, the
            notebook-level ``feature_selected`` list is used.
        label_columns: Names of the one-hot label columns, in class-index
            order. When omitted, the last three columns of ``data_in`` are
            used (the original positional behaviour).

    Returns:
        A tuple ``(inputs, labels)``: ``inputs`` is a float tensor of shape
        ``(n_rows, n_features)``; ``labels`` is a long tensor of shape
        ``(n_rows,)`` holding the argmax index of each one-hot row.
    """
    if features is None:
        features = feature_selected
    feature_values = data_in[list(features)].values

    if label_columns is None:
        labels_onehot = data_in.iloc[:, -3:].values
    else:
        labels_onehot = data_in[list(label_columns)].values

    # .values is already a 2-D array, so argmax can run on it directly; the
    # previous np.stack call was a no-op that raised on a zero-row frame.
    labels = np.argmax(labels_onehot, axis=1)
    return (
        torch.tensor(feature_values, dtype=torch.float),
        torch.tensor(labels, dtype=torch.long),
    )


# Build the training feature tensor and the class-index label tensor.
team_series_data, team_series_label = create_dataset_new(dataframe)

In [58]:
# Sanity check: one row per match, one column per selected feature,
# and exactly one label per row.
print(team_series_data.shape)
print(team_series_label.shape)

torch.Size([3800, 19])
torch.Size([3800])


In [59]:
# Inspect the feature tensor produced by create_dataset_new.
team_series_data

tensor([[  0.0000,   0.0000,   0.0000,  ...,  -5.0000,   6.0000,  11.0000],
        [  0.0000,   0.0000,   0.0000,  ...,   5.0000,  15.0000,  10.0000],
        [  0.0000,   0.0000,   0.0000,  ...,  -3.0000,  13.0000,  16.0000],
        ...,
        [  0.6132,   0.3137,   0.6500,  ..., -18.0000,   2.0000,  20.0000],
        [  0.4151,   0.2059,   0.8000,  ...,   1.0000,  17.0000,  16.0000],
        [  0.4811,   0.4706,   0.6875,  ...,   1.0000,  14.0000,  13.0000]])

In [60]:
# Inspect the label tensor (argmax index of the one-hot result columns).
team_series_label

tensor([0, 0, 0,  ..., 0, 1, 0])

## Training and Validation Process

In [61]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# FootballMatchPredictor is presumably provided by `from model import *` — confirm.
# input_dim follows the feature list; output_size=3 matches the three one-hot label columns.
model = FootballMatchPredictor(input_dim = len(feature_selected), hidden_dim = hid_dim, output_size=3)

In [62]:
def train_new(model, device, team_series_data, team_series_label,
              batch_size=None, lr=None, num_epochs=None,
              save_path="model_saved/trained.pth"):
    """Train ``model`` on an 80/20 train/validation split and save its weights.

    Per-epoch training and validation accuracy are printed. After the last
    epoch the model's ``state_dict`` is written to ``save_path``.

    Args:
        model: ``nn.Module`` to train; it is moved to ``device`` and updated
            in place.
        device: ``torch.device`` on which training runs.
        team_series_data: Feature tensor. It is reshaped to ``(N, 1, F)``
            where ``F`` is the size of its last dimension.
        team_series_label: Long tensor of ``N`` class indices.
        batch_size: Mini-batch size; defaults to the notebook-level
            ``Batch_size``.
        lr: Adam learning rate; defaults to the notebook-level ``LR``.
        num_epochs: Number of epochs; defaults to the notebook-level
            ``epochs``.
        save_path: Destination of the saved ``state_dict``. Missing parent
            directories are created.

    Returns:
        None.
    """
    # Resolve defaults lazily so explicit arguments never touch the globals.
    batch_size = Batch_size if batch_size is None else batch_size
    lr = LR if lr is None else lr
    num_epochs = epochs if num_epochs is None else num_epochs

    # Insert a length-1 middle axis: (N, F) -> (N, 1, F). Deriving N and F
    # from the tensor itself removes the hard-coded 3800 and keeps the
    # reshape inside torch instead of routing a tensor through numpy.
    team_series_data = team_series_data.reshape(-1, 1, team_series_data.shape[-1])
    full_dataset = Data.TensorDataset(team_series_data, team_series_label)

    print(team_series_data.shape)
    print(team_series_label.shape)

    train_size = int(0.8 * len(full_dataset))  # 80% for training
    validation_size = len(full_dataset) - train_size  # remainder for validation
    train_dataset, validation_dataset = random_split(full_dataset, [train_size, validation_size])

    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    validation_loader = DataLoader(dataset=validation_dataset, batch_size=batch_size, shuffle=False)

    criterion = nn.CrossEntropyLoss()

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        correct = 0
        total = 0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            predicted = output.argmax(dim=1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

        training_accuracy = 100. * correct / total if total else float("nan")
        print(f'Epoch: {epoch} Training Accuracy: {training_accuracy:.2f}%')

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for data, target in validation_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                predicted = output.argmax(dim=1)
                val_total += target.size(0)
                val_correct += (predicted == target).sum().item()

        # Guard against an empty validation split (very small datasets).
        validation_accuracy = 100. * val_correct / val_total if val_total else float("nan")
        print(f'Epoch: {epoch} Validation Accuracy: {validation_accuracy:.2f}%')

    # torch.save does not create directories, so make sure the target exists.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save(model.state_dict(), save_path)
    print('Training Complete.')

In [63]:
# Train the model in place; per-epoch accuracies are printed below and the
# final weights are saved by train_new.
train_new(model, device, team_series_data, team_series_label)

torch.Size([3800, 1, 19])
torch.Size([3800])
Epoch: 0 Training Accuracy: 24.87%


Epoch: 0 Validation Accuracy: 24.61%
Epoch: 1 Training Accuracy: 33.36%
Epoch: 1 Validation Accuracy: 45.26%
Epoch: 2 Training Accuracy: 45.26%
Epoch: 2 Validation Accuracy: 47.76%
Epoch: 3 Training Accuracy: 48.68%
Epoch: 3 Validation Accuracy: 50.39%
Epoch: 4 Training Accuracy: 50.69%
Epoch: 4 Validation Accuracy: 50.79%
Epoch: 5 Training Accuracy: 51.28%
Epoch: 5 Validation Accuracy: 51.71%
Epoch: 6 Training Accuracy: 51.64%
Epoch: 6 Validation Accuracy: 51.97%
Epoch: 7 Training Accuracy: 52.47%
Epoch: 7 Validation Accuracy: 52.37%
Epoch: 8 Training Accuracy: 52.76%
Epoch: 8 Validation Accuracy: 52.63%
Epoch: 9 Training Accuracy: 52.86%
Epoch: 9 Validation Accuracy: 52.37%
Epoch: 10 Training Accuracy: 51.97%
Epoch: 10 Validation Accuracy: 52.37%
Epoch: 11 Training Accuracy: 52.66%
Epoch: 11 Validation Accuracy: 52.11%
Epoch: 12 Training Accuracy: 52.24%
Epoch: 12 Validation Accuracy: 52.11%
Epoch: 13 Training Accuracy: 52.01%
Epoch: 13 Validation Accuracy: 52.37%
Epoch: 14 Training 

## Testing Process

In [64]:
def test_new(model, device, team_series_data, team_series_label):

    team_series_data = np.reshape(team_series_data, (1803,1,len(feature_selected)))
    # test_dataset = CSVDataset(csv_file_data=args.test_data, csv_file_label=args.test_label)
    # test_loader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=True)
    model.eval()
    test_dataset = Data.TensorDataset(team_series_data, team_series_label)

    # print(team_series_label.shape)
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=Batch_size,
        shuffle=False
    )


    criterion = nn.CrossEntropyLoss()  
    

    test_loss = 0
    correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            test_loss += loss.item()
            _, predicted = torch.max(output.data, 1)
            correct += (predicted == target).sum().item()
            # print(predicted)
    
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    print(f'Accuracy: {correct}/{len(test_loader.dataset)} ({accuracy:.0f}%)')

In [65]:
# Load the held-out test matches and convert them with the same feature
# selection and label encoding that were used for the training set.
dataframe_test = pd.read_csv("Datasets/test_set_en.csv")
team_series_data_test, team_series_label_test = create_dataset_new(dataframe_test)

In [66]:
# Render the test frame as the cell output.
# NOTE(review): consider dataframe_test.head() to keep the saved output small.
dataframe_test

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,...,ATGC_norm,HTGD_norm,ATGD_norm,DiffPts_norm,DiffFormPts_norm,HTP_norm,ATP_norm,FTR_OHE1,FTR_OHE2,FTR_OHE3
0,2019/8/9,Liverpool,Norwich,4,1,2,4,0,H,15,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0,0,1
1,2019/8/10,West Ham,Man City,0,5,0,0,1,A,5,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,1,0,0
2,2019/8/10,Bournemouth,Sheffield United,1,1,1,0,0,D,13,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0,1,0
3,2019/8/10,Burnley,Southampton,3,0,2,0,0,D,10,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0,0,1
4,2019/8/10,Crystal Palace,Everton,0,0,1,0,0,D,6,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1798,2024/3/13,Bournemouth,Luton,4,3,2,0,3,A,24,...,0.763889,-0.307692,-0.34,0.379310,0.666667,0.382716,0.246914,0,0,1
1799,2024/3/16,Luton,Nott'm Forest,1,1,1,0,1,A,10,...,0.694444,-0.461538,-0.32,-0.137931,-0.500000,0.246914,0.296296,0,1,0
1800,2024/3/16,Fulham,Tottenham,3,0,2,1,0,H,16,...,0.541667,-0.114286,1.00,-1.000000,-0.500000,0.416667,0.630952,0,0,1
1801,2024/3/16,Burnley,Brentford,2,1,2,1,0,H,17,...,0.722222,-1.000000,-0.60,-0.666667,0.000000,0.166667,0.309524,0,0,1


In [67]:
# Inspect the test feature tensor produced by create_dataset_new.
team_series_data_test

tensor([[  0.0000,   0.0000,   0.0000,  ..., -19.0000,   1.0000,  20.0000],
        [  0.0000,   0.0000,   0.0000,  ...,  14.0000,  16.0000,   2.0000],
        [  0.0000,   0.0000,   0.0000,  ...,   9.0000,  18.0000,   9.0000],
        ...,
        [  0.5882,   0.9365,   0.6667,  ...,   2.0000,  10.0000,   8.0000],
        [  0.3971,   0.6349,   0.9394,  ...,  11.0000,  20.0000,   9.0000],
        [  0.6618,   0.9365,   0.7424,  ...,   7.0000,  14.0000,   7.0000]])

In [68]:
# Evaluate the model object that train_new updated in place on the test set.
test_new(model, device, team_series_data_test, team_series_label_test)

Accuracy: 945/1803 (52%)
