The code imports necessary libraries for data manipulation, machine learning, and evaluation. It reads a CSV file named "results.csv" into a pandas DataFrame, converts string columns to lowercase, and filters the data to include only matches from 2003 onwards.
It then creates a new column called 'match_outcome' based on the comparison of 'home_score' and 'away_score' columns, assigning 'home_win', 'away_win', or 'draw' accordingly. Finally, it prints the value counts (frequency) of the 'match_outcome' column, showing how many instances of each outcome exist in the filtered data.

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


data = pd.read_csv("results.csv")

data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
data['date'] = pd.to_datetime(data['date'], errors='coerce')
data = data[data['date'].dt.year >= 2009]

data['match_outcome'] = data.apply(
    lambda row: 'home_win' if row['home_score'] > row['away_score'] else ('away_win' if row['home_score'] < row['away_score'] else 'draw'),
    axis=1
)
print(data['match_outcome'].value_counts())

match_outcome
home_win    6926
away_win    4177
draw        3403
Name: count, dtype: int64


In [32]:
data = pd.get_dummies(data, columns=['home_team', 'away_team'], drop_first=True)
data.head(2)

Unnamed: 0,date,home_score,away_score,tournament,city,country,neutral,match_outcome,home_team_afghanistan,home_team_albania,...,away_team_western isles,away_team_western sahara,away_team_yemen,away_team_ynys môn,away_team_yorkshire,away_team_zambia,away_team_zanzibar,away_team_zimbabwe,away_team_åland,away_team_åland islands
32620,2009-01-01,4.0,0.0,cecafa cup,kampala,uganda,False,home_win,False,False,...,False,False,False,False,False,False,False,False,False,False
32621,2009-01-01,2.0,0.0,cecafa cup,kampala,uganda,True,home_win,False,False,...,False,False,False,False,False,False,False,False,False,False


The code creates two new DataFrames: X and y. X is created by dropping several columns ('date', 'home_score', 'away_score', 'match_outcome', 'tournament', 'city', 'country') from the original data DataFrame. y is created by assigning the 'match_outcome' column to it.
The 'match_outcome' column in y is then mapped to numerical values: 1 for 'home_win', 0 for 'draw', and -1 for 'away_win'.
The train_test_split function from scikit-learn is used to split the X and y DataFrames into training and test sets. The test_size parameter is set to 0.3, meaning 30% of the data will be used for testing, and the remaining 70% for training. The random_state parameter is set to 19 to ensure reproducibility.
A Random Forest Classifier model (rf_model) is instantiated with 100 estimators (decision trees) and a random state of 29.
The fit method is called on rf_model to train the model using the training data (X_train and y_train).
The trained model is used to make predictions on the test data (X_test) using the predict method, and the predictions are stored in y_pred.
Finally, the accuracy of the model is calculated by comparing the predicted values (y_pred) with the actual values (y_test) using the accuracy_score function from scikit-learn, and the result is printed.

In [33]:
X = data.drop(columns=['date', 'home_score', 'away_score', 'match_outcome', 'tournament', 'city', 'country'])
y = data['match_outcome']
y = y.map({'home_win': 1, 'draw': 0, 'away_win': -1})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=19)

rf_model = RandomForestClassifier(n_estimators=100, random_state=29)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5085018382352942


In [34]:
groups = {
    "Group A": ["germany", "hungary", "scotland", "switzerland"],
    "Group B": ["albania", "croatia", "italy", "spain"],
    "Group C": ["denmark", "england", "serbia", "slovenia"],
    "Group D": ["austria", "france", "netherlands", "poland"],
    "Group E": ["belgium", "romania", "slovakia", "ukraine"],
    "Group F": ["czech republic", "portugal", "turkey", "georgia"]
}

In [35]:
from itertools import combinations

def simulate_group_matches(group_teams, model, data_columns):
    points = {team: 0 for team in group_teams}

    for home_team, away_team in combinations(group_teams, 2):
        match_data = pd.DataFrame(columns=data_columns)

        for column in data_columns:
            if f'home_team_{home_team}' in column:
                match_data.at[0, column] = 1
            elif f'away_team_{away_team}' in column:
                match_data.at[0, column] = 1
            else:
                match_data.at[0, column] = 0

        prediction = model.predict(match_data)


        if prediction == 1:
            points[home_team] += 3
        elif prediction == -1:
            points[away_team] += 3
        else:
            points[home_team] += 1
            points[away_team] += 1

    ranked_teams = sorted(points.items(), key=lambda x: x[1], reverse=True)
    return ranked_teams

In [36]:
import pprint

group_rankings = {}
for group_name, group_teams in groups.items():
    ranked_teams = simulate_group_matches(group_teams, rf_model, X.columns)
    group_rankings[group_name] = ranked_teams

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(group_rankings)

{   'Group A': [   ('germany', 7),
                   ('switzerland', 4),
                   ('hungary', 3),
                   ('scotland', 3)],
    'Group B': [('croatia', 7), ('italy', 5), ('spain', 4), ('albania', 0)],
    'Group C': [('denmark', 7), ('england', 7), ('serbia', 3), ('slovenia', 0)],
    'Group D': [   ('france', 7),
                   ('netherlands', 6),
                   ('poland', 3),
                   ('austria', 1)],
    'Group E': [   ('belgium', 9),
                   ('slovakia', 4),
                   ('ukraine', 3),
                   ('romania', 1)],
    'Group F': [   ('turkey', 9),
                   ('portugal', 6),
                   ('czech republic', 3),
                   ('georgia', 0)]}


As per EURO2024 defined brackets

In [37]:
ro16_matches = [ # Just demo :)
    ('spain', 'poland'),
    ('germany', 'england'),
    ('turkey', 'hungary'),
    ('netherlands', 'slovakia'),
    ('belgium', 'croatia'),
    ('france', 'portugal'),
    ('denmark', 'czech republic'),
    ('switzerland', 'italy')
]

In [39]:
def simulate_knockout_match(home_team, away_team, model, data_columns):
    match_data = pd.DataFrame(columns=data_columns)

    for column in data_columns:
        if f'home_team_{home_team}' in column:
            match_data.at[0, column] = 1
        elif f'away_team_{away_team}' in column:
            match_data.at[0, column] = 1
        else:
            match_data.at[0, column] = 0

    prediction = model.predict(match_data)
    if prediction == 1:
        return home_team
    elif prediction == -1:
        return away_team
    else:
        return home_team if model.predict_proba(match_data)[0][1] > model.predict_proba(match_data)[0][2] else away_team


ro16_results = {}
print("RO16 Results:")
for home_team, away_team in ro16_matches:
    winner = simulate_knockout_match(home_team, away_team, rf_model, X.columns)
    ro16_results[f"{home_team} vs {away_team}"] = winner
    print(f"{home_team} vs {away_team} -> Winner: {winner}")

print("\nRO16 Results Dictionary:")
print(ro16_results)

RO16 Results:
spain vs poland -> Winner: spain
germany vs england -> Winner: germany
turkey vs hungary -> Winner: hungary
netherlands vs slovakia -> Winner: netherlands
belgium vs croatia -> Winner: croatia
france vs portugal -> Winner: france
denmark vs czech republic -> Winner: denmark
switzerland vs italy -> Winner: switzerland

RO16 Results Dictionary:
{'spain vs poland': 'spain', 'germany vs england': 'germany', 'turkey vs hungary': 'hungary', 'netherlands vs slovakia': 'netherlands', 'belgium vs croatia': 'croatia', 'france vs portugal': 'france', 'denmark vs czech republic': 'denmark', 'switzerland vs italy': 'switzerland'}


In [40]:
ro16_winners = list(ro16_results.values())
qf_matches = [
    (ro16_winners[3], ro16_winners[1]),
    (ro16_winners[5], ro16_winners[4]),
    (ro16_winners[6], ro16_winners[7]),
    (ro16_winners[2], ro16_winners[0])
]


qf_results = {}
print("\nQF Results:")
for home_team, away_team in qf_matches:
    winner = simulate_knockout_match(home_team, away_team, rf_model, X.columns)
    qf_results[f"{home_team} vs {away_team}"] = winner
    print(f"{home_team} vs {away_team} -> Winner: {winner}")


QF Results:
netherlands vs germany -> Winner: netherlands
france vs croatia -> Winner: france
denmark vs switzerland -> Winner: denmark
hungary vs spain -> Winner: spain


In [41]:
qf_winners = list(qf_results.values())
SF_matches = [
    (qf_winners[0], qf_winners[1]),
    (qf_winners[3], qf_winners[2])
]


SF_results = {}
print("\nSFResults:")
for home_team, away_team in SF_matches:
    winner = simulate_knockout_match(home_team, away_team, rf_model, X.columns)
    SF_results[f"{home_team} vs {away_team}"] = winner
    print(f"{home_team} vs {away_team} -> Winner: {winner}")


SFResults:
netherlands vs france -> Winner: france
spain vs denmark -> Winner: spain


In [42]:
SF_winners = list(SF_results.values())
finale_match = (SF_winners[0], SF_winners[1])

print("\nFinale Result:")
finale_winner = simulate_knockout_match(finale_match[0], finale_match[1], rf_model, X.columns)
print(f"{finale_match[0]} vs {finale_match[1]} -> Winner: {finale_winner}")

print("\nFinale Winner:")
print(finale_winner)


Finale Result:
france vs spain -> Winner: spain

Finale Winner:
spain
