## Notebook to Find the Best Parameters for the Model

In [2]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from io import StringIO
import numpy as np
import networkx as nx

In [3]:
df = pd.read_csv('final_2023.csv')
accuracy_list = []
teams = df['Home'].unique()
G = nx.DiGraph()
cumulative_results = pd.DataFrame()

In [4]:
column_renames = {
    'Home_Score': 'H_Score',
    'Away_Score': 'A_Score',
    'HxG' : 'H_xG',
    'AxG' : 'A_xG'
}

# Apply the renaming
df = df.rename(columns=column_renames)

In [5]:
df['Cost'] = df['H_xG'] - .9* df['A_xG']

In [6]:
numeric_columns = list(df.select_dtypes(include=['number']).columns)  # Select numeric columns
numeric_columns.remove('Wk')
numeric_columns.remove('Cost')
numeric_columns.remove('Match_ID')
numeric_columns.remove('H_Min')

In [22]:
df[numeric_columns] = df[numeric_columns].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
variable_col = [var[1:] for var in numeric_columns]
variable_col

['_xG',
 '_xG',
 '_Score',
 '_Score',
 '_Performance_Gls',
 '_Performance_Ast',
 '_Performance_PK',
 '_Performance_PKatt',
 '_Performance_Sh',
 '_Performance_SoT',
 '_Performance_CrdY',
 '_Performance_CrdR',
 '_Performance_Touches',
 '_Performance_Tkl',
 '_Performance_Int',
 '_Performance_Blocks',
 '_Expected_xG',
 '_Expected_npxG',
 '_Expected_xAG',
 '_SCA_SCA',
 '_SCA_GCA',
 '_Passes_Cmp',
 '_Passes_Att',
 '_Passes_Cmp%',
 '_Passes_PrgP',
 '_Carries_Carries',
 '_Carries_PrgC',
 '_Take-Ons_Att',
 '_Take-Ons_Succ',
 '_Total_TotDist',
 '_Total_PrgDist',
 '_Short_Cmp',
 '_Short_Att',
 '_Short_Cmp%',
 '_Medium_Cmp',
 '_Medium_Att',
 '_Medium_Cmp%',
 '_Long_Cmp',
 '_Long_Att',
 '_Long_Cmp%',
 '_Passing_xA',
 '_Passing_KP',
 '_Passing_1/3',
 '_Passing_PPA',
 '__CrsPA',
 '_Pass Types_Live',
 '_Pass Types_Dead',
 '_Pass Types_FK',
 '_Pass Types_TB',
 '_Pass Types_Sw',
 '_Pass Types_Crs',
 '_Pass Types_TI',
 '_Pass Types_CK',
 '_Corner Kicks_In',
 '_Corner Kicks_Out',
 '_Corner Kicks_Str',
 '_

In [46]:
from collections import Counter
counts = Counter(variable_col)
counts

# Keep only elements with more than one occurrence
filtered_list = [item for item in variable_col if counts[item] > 1]
len(filtered_list)
filtered_list = filtered_list[:50]

In [47]:
sorted(filtered_list)
variable_col = filtered_list

In [48]:
def project_winner(row, rankings):
    home_rank = rankings.get(row["Home"], float('inf'))
    away_rank = rankings.get(row["Away"], float('inf'))
    
    # Project a tie if ranks are within 3 spots
    if abs(home_rank - away_rank) <= 0:
        return "Draw"
    return "Home" if home_rank > away_rank else "Away"

def page_rank(G, personalization_vec):
    pagerank = nx.pagerank(G, alpha=0.95, personalization=personalization_vec, max_iter=100000, tol=1e-06, nstart=None, weight='weight', dangling=None)
    sorted_by_value = dict(sorted(pagerank.items(), key=lambda item: item[1], reverse=True))
    rankings = {}
    # Print in sorted order by value
    for i, (key, value) in enumerate(sorted_by_value.items(), start=1):
        rankings[key] = i
    
    return rankings

In [49]:
def get_personalization(previous_weeks, variables):
    personalization = {}

# Iterate over each row in the DataFrame
    for index, row in previous_weeks.iterrows():
        home_team = row['Home']
        away_team = row['Away']
        home_personal = 0
        away_personal = 0

        for i in variables:
            home_personal += row['H' + i]
            away_personal += row['A' + i]

            # Add to the existing value if the team is already in the dictionary
            personalization[home_team] = personalization.get(home_team, 0) + home_personal
            personalization[away_team] = personalization.get(away_team, 0) + away_personal 

    return personalization


In [51]:
from itertools import combinations

variables = list(variable_col)
best_accuracy = 0
best_combination = None
accuracy_list = []

# Iterate over combinations of variables
for r in  tqdm(range(1), desc = 'Finding Optimal Parameters'):
    for combination in tqdm(combinations(variables, 2),  desc = 'Finding Optimal Parameters'):
        cumulative_results = pd.DataFrame()
        accuracy_list = []
        
        for week_num in range(4, 10):
            last_three_weeks = [week_num - 3, week_num - 2, week_num - 1]
            previous_weeks = df[df['Wk'].isin(last_three_weeks)]
            
            # Create edgelist
            edgelist = list(zip(previous_weeks['Home'], previous_weeks['Away'], previous_weeks['Cost']))
            mod_edgelist = [
                (t[0], t[1], round(abs(t[2]), 2)) if t[2] < 0 else (t[1], t[0], round(t[2], 2)) 
                for t in edgelist
            ]

            # Add edges to the graph
            G = nx.DiGraph()
            for team1, team2, weight in mod_edgelist:
                G.add_edge(team1, team2, weight=weight)
            
            # Get personalization vector for the current combination
            p = get_personalization(previous_weeks, combination)

            # Compute PageRank
            rankings = page_rank(G, p)

            # Test week predictions and accuracy calculation
            test_week = df[df['Wk'] == week_num].copy()
            test_week["Projected Winner"] = test_week.apply(project_winner, axis=1, rankings=rankings)
            cumulative_results = pd.concat([cumulative_results, test_week], ignore_index=True)

            correct_predictions = (test_week["Projected Winner"] == test_week["Winner"]).sum()
            total_predictions = len(test_week)
            accuracy = (correct_predictions / total_predictions) * 100
            accuracy_list.append(accuracy)

        # Calculate average accuracy for the combination
        avg_accuracy = np.mean(accuracy_list)
        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_combination = combination
            print(best_accuracy)
            print(combination)

# Print the best combination and its accuracy
print(f"Best combination: {best_combination}, Accuracy: {best_accuracy:.2f}%")


Finding Optimal Parameters:   0%|          | 0/1 [00:00<?, ?it/s]

33.333333333333336
('_xG', '_xG')
35.0
('_xG', '_Score')


Finding Optimal Parameters: 6it [00:00,  7.19it/s]
Finding Optimal Parameters: 100%|██████████| 1/1 [00:00<00:00,  1.19it/s]

Best combination: ('_xG', '_Score'), Accuracy: 35.00%



