## Notebook to Find the Best Parameters for the Model

In [10]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from io import StringIO
import numpy as np
import networkx as nx

In [15]:
df = pd.read_csv('final_2023.csv')
accuracy_list = []
teams = df['Home'].unique()
G = nx.DiGraph()
cumulative_results = pd.DataFrame()

In [28]:
df['cost']

0      0.30
1      0.12
2      1.05
3      1.43
4     -0.37
       ... 
375    0.55
376    0.14
377    1.54
378   -0.48
379   -0.14
Name: cost, Length: 380, dtype: float64

In [33]:
def project_winner(row, rankings):
    home_rank = rankings.get(row["Home"], float('inf'))
    away_rank = rankings.get(row["Away"], float('inf'))
    
    # Project a tie if ranks are within 3 spots
    if abs(home_rank - away_rank) <= 0:
        return "Draw"
    return "Home" if home_rank > away_rank else "Away"
def get_personalization(previous_weeks, variables):
    personalization = {}

    # Calculate the personalization values
    for index, row in previous_weeks.iterrows():
        for var in variables:
            if var.startswith("H_"):
                home_team = row['Home']
                personalization[home_team] = personalization.get(home_team, 0) + row[var]
            elif var.startswith("A_"):
                away_team = row['Away']
                personalization[away_team] = personalization.get(away_team, 0) + row[var]
    
    # Ensure the personalization vector does not sum to zero
    if sum(personalization.values()) == 0:
        # Set a default value if the sum is zero (e.g., 1e-6 for all nodes)
        total_nodes = set(previous_weeks['Home']).union(previous_weeks['Away'])
        for node in total_nodes:
            if node not in personalization:
                personalization[node] = 1e-6  # Small default value for missing teams
    
    # Normalize the personalization vector (so it sums to 1)
    total_sum = sum(personalization.values())
    if total_sum != 0:
        personalization = {key: value / total_sum for key, value in personalization.items()}
    else:
        # If sum is still zero, set equal values for all teams
        for node in personalization:
            personalization[node] = 1 / len(personalization)
    
    return personalization

def page_rank(G, personalization_vec):
    pagerank = nx.pagerank(G, alpha=0.95, personalization=personalization_vec, max_iter=100000, tol=1e-06, nstart=None, weight='weight', dangling=None)
    sorted_by_value = dict(sorted(pagerank.items(), key=lambda item: item[1], reverse=True))
    rankings = {}
    # Print in sorted order by value
    for i, (key, value) in enumerate(sorted_by_value.items(), start=1):
        rankings[key] = i
    
    return rankings

In [34]:
numeric_columns = df.select_dtypes(include=['number']).columns  # Select numeric columns

In [35]:
list(numeric_columns)

['Wk',
 'HxG',
 'AxG',
 'Match_ID',
 'Home_Score',
 'Away_Score',
 'H_Min',
 'H_Performance_Gls',
 'H_Performance_Ast',
 'H_Performance_PK',
 'H_Performance_PKatt',
 'H_Performance_Sh',
 'H_Performance_SoT',
 'H_Performance_CrdY',
 'H_Performance_CrdR',
 'H_Performance_Touches',
 'H_Performance_Tkl',
 'H_Performance_Int',
 'H_Performance_Blocks',
 'H_Expected_xG',
 'H_Expected_npxG',
 'H_Expected_xAG',
 'H_SCA_SCA',
 'H_SCA_GCA',
 'H_Passes_Cmp',
 'H_Passes_Att',
 'H_Passes_Cmp%',
 'H_Passes_PrgP',
 'H_Carries_Carries',
 'H_Carries_PrgC',
 'H_Take-Ons_Att',
 'H_Take-Ons_Succ',
 'H_Total_TotDist',
 'H_Total_PrgDist',
 'H_Short_Cmp',
 'H_Short_Att',
 'H_Short_Cmp%',
 'H_Medium_Cmp',
 'H_Medium_Att',
 'H_Medium_Cmp%',
 'H_Long_Cmp',
 'H_Long_Att',
 'H_Long_Cmp%',
 'H_Passing_xA',
 'H_Passing_KP',
 'H_Passing_1/3',
 'H_Passing_PPA',
 'H__CrsPA',
 'H_Pass Types_Live',
 'H_Pass Types_Dead',
 'H_Pass Types_FK',
 'H_Pass Types_TB',
 'H_Pass Types_Sw',
 'H_Pass Types_Crs',
 'H_Pass Types_TI',
 

In [None]:
from itertools import combinations

variables = list(numeric_columns)
best_accuracy = 0
best_combination = None
accuracy_list = []

# Iterate over combinations of variables
for r in  tqdm(range(1, len(variables) + 1), desc = 'Finding Optimal Parameters'):
    for combination in tqdm(combinations(variables, 2),  desc = 'Finding Optimal Parameters'):
        cumulative_results = pd.DataFrame()
        accuracy_list = []
        
        for week_num in range(4, 6):
            last_three_weeks = [week_num - 3, week_num - 2, week_num - 1]
            previous_weeks = df[df['Wk'].isin(last_three_weeks)]
            
            # Create edgelist
            edgelist = list(zip(previous_weeks['Home'], previous_weeks['Away'], previous_weeks['cost']))
            mod_edgelist = [
                (t[0], t[1], round(abs(t[2]), 2)) if t[2] < 0 else (t[1], t[0], round(t[2], 2)) 
                for t in edgelist
            ]

            # Add edges to the graph
            G = nx.DiGraph()
            for team1, team2, weight in mod_edgelist:
                G.add_edge(team1, team2, weight=weight)
            
            # Get personalization vector for the current combination
            p = get_personalization(previous_weeks, combination)

            # Compute PageRank
            rankings = page_rank(G, p)

            # Test week predictions and accuracy calculation
            test_week = df[df['Wk'] == week_num].copy()
            test_week["Projected Winner"] = test_week.apply(project_winner, axis=1, rankings=rankings)
            cumulative_results = pd.concat([cumulative_results, test_week], ignore_index=True)

            correct_predictions = (test_week["Projected Winner"] == test_week["Winner"]).sum()
            total_predictions = len(test_week)
            accuracy = (correct_predictions / total_predictions) * 100
            accuracy_list.append(accuracy)

        # Calculate average accuracy for the combination
        avg_accuracy = np.mean(accuracy_list)
        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_combination = combination
            print(best_accuracy)

# Print the best combination and its accuracy
print(f"Best combination: {best_combination}, Accuracy: {best_accuracy:.2f}%")


Finding Optimal Parameters:   0%|          | 0/189 [00:00<?, ?it/s]

30.0
35.0
40.0
45.0


Finding Optimal Parameters:   0%|          | 0/189 [01:00<?, ?it/s]


KeyboardInterrupt: 