<div style="text-align: center;" >
<h1 style="margin-top: 0.2em; margin-bottom: 0.1em;">SMDA-Project Simulation</h1>
<h4 style="margin-top: 0.7em; margin-bottom: 0.3em; font-style:italic">


</div>
<br>

## Import Libraries & Data

In [1]:
#Import libraries
import pandas as pd
import random

In [2]:
#Read the data from local .csv file
performance_scores = pd.read_csv("data/nations_performance.csv")
performance_scores

Unnamed: 0,Team,FIFA23_Score,GPT_Score,Gemini_Score,Llama3_Score
0,Germany,0.785059,0.85,0.8,0.85
1,England,0.864526,0.9,0.75,0.86
2,France,0.754616,0.95,0.85,0.85
3,Italy,0.759933,0.8,0.7,0.82
4,Portugal,0.437928,0.85,0.8,0.81
5,Spain,0.883061,0.9,0.8,0.85
6,Belgium,0.667914,0.75,0.8,0.82
7,Netherlands,0.72137,0.8,0.7,0.73
8,Croatia,0.553762,0.7,0.7,0.62
9,Denmark,0.56587,0.65,0.6,0.72


## Function to simulate the group-stage

The following function simulates the group stage of EURO24. It determines the outcome of each match by weighting the performance scores of the countries, which influences the number of goals scored. Just like in the real tournament, winners receive 3 points, both teams receive 1 point in the event of a draw, and losers receive no points. Additionally, the function tracks goals scored, goals conceded, and the resulting goal difference, which are saved in the resulting table.

In [3]:
def simulate_group_matches(score_type, df_teams):
    """
    Simulates group stage matches for a football tournament based on given team scores.

    Parameters:
    score_type (str): The column name in the DataFrame that contains the team scores.
    df_teams (pd.DataFrame): A DataFrame containing the team names and their scores.

    Returns:
    dict: A dictionary containing the results for each group. Each key is a group identifier (e.g., "A", "B"),
          and each value is a DataFrame with the match results for that group. The DataFrame includes columns for:
          - 'Team': The team name.
          - 'Points': The total points accumulated by the team.
          - 'Goals For': The total number of goals scored by the team.
          - 'Goals Against': The total number of goals conceded by the team.
          - 'Goal Difference': The difference between goals scored and goals conceded.
    """
    
    #Define the match schedule for each group
    groups = {
        "A": [["Germany", "Scotland"], ["Hungary", "Switzerland"], ["Germany", "Hungary"], ["Scotland", "Switzerland"], ["Germany", "Switzerland"], ["Hungary", "Scotland"]],
        "B": [["Spain", "Croatia"], ["Italy", "Albania"], ["Spain", "Italy"], ["Croatia", "Albania"], ["Spain", "Albania"], ["Italy", "Croatia"]],
        "C": [["Slovenia", "Denmark"], ["Serbia", "England"], ["Slovenia", "Serbia"], ["Denmark", "England"], ["Slovenia", "England"], ["Denmark", "Serbia"]],
        "D": [["Poland", "Netherlands"], ["Austria", "France"], ["Poland", "Austria"], ["Netherlands", "France"], ["Poland", "France"], ["Netherlands", "Austria"]],
        "E": [["Belgium", "Slovakia"], ["Romania", "Ukraine"], ["Belgium", "Romania"], ["Slovakia", "Ukraine"], ["Belgium", "Ukraine"], ["Slovakia", "Romania"]],
        "F": [["Türkiye", "Georgia"], ["Portugal", "Czech Republic"], ["Türkiye", "Portugal"], ["Georgia", "Czech Republic"], ["Georgia", "Portugal"], ["Türkiye", "Czech Republic"]]
    }
    
    #Convert team scores to a dictionary for easy access
    teams_scores = df_teams.set_index("Team")[score_type].to_dict()

    #Initialize a dictionary to store the results for each group
    group_results = {}
    
    #Loop through each group and simulate matches
    for group, matches in groups.items():
        #Initialize a dictionary to store match results
        results = {
            "Team": [],
            "Points": [],
            "Goals For": [],
            "Goals Against": [],
            "Goal Difference": []
        }

        #Identify unique teams in the group and initialize their scores
        group_teams = set([team for match in matches for team in match])
        for team in group_teams:
            results["Team"].append(team)
            results["Points"].append(0)
            results["Goals For"].append(0)
            results["Goals Against"].append(0)
            results["Goal Difference"].append(0)

        #Create a DataFrame to store the results
        df_results = pd.DataFrame(results)
        df_results.set_index("Team", inplace=True)

        #Loop through each match and simulate the outcome
        for match in matches:
            team1, team2 = match
            score1 = teams_scores[team1]
            score2 = teams_scores[team2]

            #Normalize scores to create weights for goal probabilities
            weight_team1 = 1 - score1
            weight_team2 = 1 - score2

            #Generate goal probabilities based on weights
            weights_team1 = [weight_team1, (1 - weight_team1) / 4, (1 - weight_team1) / 4, (1 - weight_team1) / 4, (1 - weight_team1) / 4]
            weights_team2 = [weight_team2, (1 - weight_team2) / 4, (1 - weight_team2) / 4, (1 - weight_team2) / 4, (1 - weight_team2) / 4]

            #Simulate goals for both teams
            goals_team1 = random.choices(range(5), k=1, weights=weights_team1)[0]
            goals_team2 = random.choices(range(5), k=1, weights=weights_team2)[0]

            #Update the results for both teams
            df_results.at[team1, "Goals For"] += goals_team1
            df_results.at[team1, "Goals Against"] += goals_team2
            df_results.at[team2, "Goals For"] += goals_team2
            df_results.at[team2, "Goals Against"] += goals_team1

            #Update points based on match outcome
            if goals_team1 > goals_team2:
                df_results.at[team1, "Points"] += 3
            elif goals_team1 < goals_team2:
                df_results.at[team2, "Points"] += 3
            else:
                df_results.at[team1, "Points"] += 1
                df_results.at[team2, "Points"] += 1

        #Calculate goal difference for each team
        df_results["Goal Difference"] = df_results["Goals For"] - df_results["Goals Against"]
        
        #Sort teams by points, goal difference, and goals for
        df_results.sort_values(by=["Points", "Goal Difference", "Goals For"], ascending=False, inplace=True)

        #Store the results for the current group
        group_results[group] = df_results

    #Return the results of all groups
    return group_results

## Function to simulate the final-stages

Similar to the group stage, the final round matches are simulated based on weighted performance scores. Special attention is given to the knockout round rules. For instance, if a match ends in a draw during regular time, extra time is played. If the draw persists, a penalty shoot-out is simulated. Unlike the group stage, no points are awarded. Instead, the winner advances to the next round of the tournament.

In [4]:
def simulate_final_matches(score_type, df_teams, stage):
    """
    Simulates the final stage matches of a football tournament, including knockout rounds, extra time, and penalties.

    Parameters:
    score_type (str): The column name in the DataFrame that contains the team scores.
    df_teams (pd.DataFrame): A DataFrame containing the team names and their scores.
    stage (dict): A dictionary containing the match schedule for the final stages. Each key is the round name (e.g., "Quarter-Finals"),
                  and each value is a list of matches. Each match is represented as a list with two team names.

    Returns:
    list: A list of dictionaries, each containing the results of a match. Each dictionary includes:
          - 'Match': A string representing the match (e.g., "Team1 vs Team2").
          - 'Goals Team 1': The number of goals scored by the first team.
          - 'Goals Team 2': The number of goals scored by the second team.
          - 'Winner': The name of the winning team.
    """
    
    #Convert team scores to a dictionary for easy access
    teams_scores = df_teams.set_index("Team")[score_type].to_dict()
    
    #Initialize a list to store the results of each match
    results = []
    
    #Loop through each round and simulate the matches
    for round_name, matches in stage.items():
        for match in matches:
            team1, team2 = match
            score1 = teams_scores[team1]
            score2 = teams_scores[team2]

            #Normalize scores to create weights for goal probabilities
            weight_team1 = 1 - score1
            weight_team2 = 1 - score2

            #Generate goal probabilities based on weights
            weights_team1 = [weight_team1, (1 - weight_team1) / 4, (1 - weight_team1) / 4, (1 - weight_team1) / 4, (1 - weight_team1) / 4]
            weights_team2 = [weight_team2, (1 - weight_team2) / 4, (1 - weight_team2) / 4, (1 - weight_team2) / 4, (1 - weight_team2) / 4]

            #Simulate goals for both teams
            goals_team1 = random.choices(range(5), k=1, weights=weights_team1)[0]
            goals_team2 = random.choices(range(5), k=1, weights=weights_team2)[0]

            #If the match ends in a draw, simulate extra time and possibly penalties
            if goals_team1 == goals_team2:
                #Simulate extra time goals
                extra_time_team1 = random.choices(range(3), k=1)[0]
                extra_time_team2 = random.choices(range(3), k=1)[0]
                goals_team1 += extra_time_team1
                goals_team2 += extra_time_team2

                #If still a draw after extra time, simulate penalties
                if goals_team1 == goals_team2:
                    penalties_team1 = random.choices(range(5, 11), k=1)[0]
                    penalties_team2 = random.choices(range(5, 11), k=1)[0]
                    while penalties_team1 == penalties_team2:
                        penalties_team1 += random.choice([0, 1])
                        penalties_team2 += random.choice([0, 1])
                    if penalties_team1 > penalties_team2:
                        winner = team1
                    else:
                        winner = team2
                else:
                    winner = team1 if goals_team1 > goals_team2 else team2
            else:
                winner = team1 if goals_team1 > goals_team2 else team2

            #Record the match result
            match_result = {
                "Match": f"{team1} vs {team2}",
                "Goals Team 1": goals_team1,
                "Goals Team 2": goals_team2,
                "Winner": winner
            }
            
            #Append the result to the results list
            results.append(match_result)

    #Return the final results of all matches
    return results

## Function to simulate the whole tournament

The function to simulate the entire tournament multiple times is composed of the previously described functions. It begins by simulating the group phase. To determine the matches for the round of 16, FIFA allocation rules are followed: in addition to the 1st and 2nd place finishers, the four best 3rd place finishers are included. For the final stages, winners of each match advance, simulating from the round of 16 to the final.

For each stage reached in the final round, a counter is incremented by +1 to track how often each team reaches each stage. These counts are then converted into percentages and presented as a table.

In [5]:
def simulate_tournament(score_type, df_teams, n):
    """
    Simulates a football tournament n times and calculates the percentage of times each team reaches various stages of the tournament.

    Parameters:
    score_type (str): The column name in the DataFrame that contains the team scores.
    df_teams (pd.DataFrame): A DataFrame containing the team names and their scores.
    n (int): The number of times the tournament should be simulated.

    Returns:
    pd.DataFrame: A DataFrame with the percentage of times each team reaches the Round of 16, Quarter Finals, Semi Finals, Final, and wins the tournament.
    """
    
    #List of countries participating in the tournament
    countries = ["Germany", "England", "France", "Italy", "Portugal", "Spain", "Belgium", "Netherlands", 
                 "Croatia", "Denmark", "Poland", "Austria", "Czech Republic", "Scotland", "Ukraine", 
                 "Hungary", "Romania", "Switzerland", "Slovenia", "Türkiye", "Serbia", "Georgia", 
                 "Slovakia", "Albania"]

    #Initialize the DataFrame with zeros for each stage of the tournament
    tournament_results = pd.DataFrame({
        "Country": countries,
        "Round of 16": [0] * len(countries),
        "Quarter Final": [0] * len(countries),
        "Semi Final": [0] * len(countries),
        "Final": [0] * len(countries),
        "Winner": [0] * len(countries)
    })

    #Simulate the tournament n times and update the tournament results
    for tournament in range(n):
        #Simulate the group stage matches and get the results
        group_results = simulate_group_matches(score_type, df_teams)

        #Determine the teams advancing to the Round of 16
        a1, a2 = group_results["A"].iloc[0].name, group_results["A"].iloc[1].name
        b1, b2 = group_results["B"].iloc[0].name, group_results["B"].iloc[1].name
        c1, c2 = group_results["C"].iloc[0].name, group_results["C"].iloc[1].name
        d1, d2 = group_results["D"].iloc[0].name, group_results["D"].iloc[1].name
        e1, e2 = group_results["E"].iloc[0].name, group_results["E"].iloc[1].name
        f1, f2 = group_results["F"].iloc[0].name, group_results["F"].iloc[1].name

        #Determine the best third-placed teams to advance
        third_placed_teams = []
        for group in group_results.values():
            third_placed_teams.append(group.iloc[2])

        #Sort the third-placed teams to get the top four
        third_placed_df = pd.DataFrame(third_placed_teams).sort_values(by=["Points", "Goal Difference"], ascending=[False, False])
        rd_1, rd_2, rd_3, rd_4 = third_placed_df.iloc[0].name, third_placed_df.iloc[1].name, third_placed_df.iloc[2].name, third_placed_df.iloc[3].name

        #Define the matches for the Round of 16
        round_of_16 = {"round_of_16": [[a1, c2], [b1, rd_3], [d2, e2], [f1, rd_4], [a2, b2], [c1, rd_2], [e1, rd_1], [d1, f2]]}
        round_of_16_results = simulate_final_matches(score_type, df_teams, round_of_16)

        #QUARTER FINALS
        round_of_16_winners = [result["Winner"] for result in round_of_16_results]
        quarter_finals = {"quarter_finals": [[round_of_16_winners[0], round_of_16_winners[1]], 
                                             [round_of_16_winners[2], round_of_16_winners[3]], 
                                             [round_of_16_winners[4], round_of_16_winners[5]], 
                                             [round_of_16_winners[6], round_of_16_winners[7]]]}
        quarter_finals_results = simulate_final_matches(score_type, df_teams, quarter_finals)

        #SEMI FINALS
        quarter_finals_winners = [result["Winner"] for result in quarter_finals_results]
        semi_finals = {"semi_final": [[quarter_finals_winners[0], quarter_finals_winners[1]], 
                                      [quarter_finals_winners[2], quarter_finals_winners[3]]]}
        semi_finals_results = simulate_final_matches(score_type, df_teams, semi_finals)

        #FINAL
        semi_finals_winners = [result["Winner"] for result in semi_finals_results]
        final = {"final": [[semi_finals_winners[0], semi_finals_winners[1]]]}
        final_results = simulate_final_matches(score_type, df_teams, final)

        #Update results for each stage
        for match in round_of_16["round_of_16"]:
            for country in match:
                tournament_results.loc[tournament_results["Country"] == country, "Round of 16"] += 1
        for result in round_of_16_results:
            tournament_results.loc[tournament_results["Country"] == result["Winner"], "Quarter Final"] += 1
        for result in quarter_finals_results:
            tournament_results.loc[tournament_results["Country"] == result["Winner"], "Semi Final"] += 1
        for result in semi_finals_results:
            tournament_results.loc[tournament_results["Country"] == result["Winner"], "Final"] += 1

        #Update the winner count
        tournament_results.loc[tournament_results["Country"] == final_results[0]["Winner"], "Winner"] += 1

    #Convert the counts to percentages
    for column in ["Round of 16", "Quarter Final", "Semi Final", "Final", "Winner"]:
        tournament_results[column] = (tournament_results[column] / n) * 100

    #Sort the results by the stages of the tournament in descending order
    tournament_results = tournament_results.sort_values(by=["Winner", "Final", "Semi Final", "Quarter Final", "Round of 16"], ascending=False)
    
    #Reset the index of the DataFrame
    tournament_results.reset_index(drop=True, inplace=True)
    
    #Return the results of the tournament
    return tournament_results


The number of simulations (10.000) was chosen because it offers a trade-off between accuracy of results, computer resources and time.

In [6]:
#Simulate the tournament with FIFA23_Score
fifa23_results = simulate_tournament(score_type = "FIFA23_Score", df_teams = performance_scores, n = 10000)

#Save results in local directory
fifa23_results.to_csv("data/fifa23_results.csv", index=False)

#Display results
fifa23_results

Unnamed: 0,Country,Round of 16,Quarter Final,Semi Final,Final,Winner
0,Spain,92.67,60.49,37.78,23.71,14.33
1,England,90.75,57.86,33.86,21.64,12.81
2,Germany,83.26,47.73,26.39,15.2,8.51
3,France,78.68,47.72,27.54,14.62,7.89
4,Italy,86.15,49.76,27.96,15.44,7.86
5,Czech Republic,87.17,49.4,27.07,13.53,7.11
6,Netherlands,76.25,43.94,24.12,12.21,6.25
7,Belgium,81.49,42.55,22.44,10.6,5.16
8,Ukraine,79.88,40.88,21.34,10.06,4.78
9,Hungary,68.8,33.01,15.0,7.04,3.14


In [7]:
#Simulate the tournament with GPT_Score
chatgpt_results = simulate_tournament(score_type = "GPT_Score", df_teams = performance_scores, n = 10000)

#Save results in local directory
chatgpt_results.to_csv("data/chatgpt_results.csv", index=False)

#Display results
chatgpt_results

In [None]:
#Simulate the tournament with Gemini_Score
gemini_results = simulate_tournament(score_type = "Gemini_Score", df_teams = performance_scores, n = 10000)

#Save results in local directory
gemini_results.to_csv("data/gemini_results.csv", index=False)

#Display results
gemini_results

Unnamed: 0,Country,Round of 16,Quarter Final,Semi Final,Final,Winner
0,France,84.03,53.53,32.22,18.9,11.3
1,Germany,87.3,50.66,28.91,16.95,9.63
2,Belgium,87.78,52.06,29.78,17.06,9.48
3,Portugal,87.46,50.95,29.56,16.56,9.33
4,Spain,81.51,49.12,28.6,15.72,8.61
5,England,82.19,46.23,25.17,13.7,7.2
6,Italy,73.97,40.14,21.28,11.06,5.49
7,Netherlands,72.44,40.02,21.32,11.13,5.46
8,Croatia,74.5,41.0,21.65,11.01,5.41
9,Serbia,69.09,33.13,15.57,7.66,3.41


In [None]:
#Simulate the tournament with Llama3_Score
llama3_results = simulate_tournament(score_type = "Llama3_Score", df_teams = performance_scores, n = 10000)

#Save results in local directory
llama3_results.to_csv("data/llama3_results.csv", index=False)

#Display results
llama3_results

Unnamed: 0,Country,Round of 16,Quarter Final,Semi Final,Final,Winner
0,England,89.15,54.3,31.56,18.41,10.24
1,France,85.99,52.16,30.98,17.17,9.98
2,Spain,85.93,52.95,29.74,17.08,9.73
3,Germany,87.58,49.84,29.17,16.58,9.15
4,Italy,83.25,49.2,27.73,15.48,8.59
5,Belgium,84.09,49.35,28.01,15.37,8.12
6,Portugal,84.29,48.84,27.75,14.7,8.09
7,Denmark,79.98,40.97,20.79,10.68,5.26
8,Netherlands,78.16,43.26,22.71,11.07,5.14
9,Serbia,74.05,36.02,16.47,7.83,3.71
