# Euro 2024 Analysis

In [37]:
import pandas as pd
import itertools

# Read every source table from the CSV files next to the notebook
euro2020_top_players = pd.read_csv('euro2020-top-players.csv')
all_matches = pd.read_csv('all_matches.csv')
national_teams_appearance = pd.read_csv('national_teams_appearance.csv')
participated_teams_stats = pd.read_csv('participated_teams_stats.csv')
top_goal_scorers = pd.read_csv('top_goal_scorers.csv')

# Trim stray leading/trailing whitespace so team names compare equal
# across tables
participated_teams_stats['Team'] = participated_teams_stats['Team'].str.strip()
for team_column in ('HomeTeamName', 'AwayTeamName'):
    all_matches[team_column] = all_matches[team_column].str.strip()

# Show the distinct names on each side to spot spelling mismatches by eye.
# For all_matches only the home column is listed.
stats_team_names = participated_teams_stats['Team'].unique()
match_home_team_names = all_matches['HomeTeamName'].unique()
print("Teams in participated_teams_stats:", stats_team_names)
print("Teams in all_matches:", match_home_team_names)

Teams in participated_teams_stats: ['Germany' 'France' 'Spain' 'Italy' 'Portugal' 'Netherlands'
 'Czech Republic' 'Russia' 'England' 'Croatia' 'Denmark' 'Belgium'
 'Sweden' 'Greece' 'Turkey' 'Wales' 'Poland' 'Switzerland' 'Serbia'
 'Iceland' 'Hungary' 'Republic of Ireland' 'Romania' 'Scotland' 'Norway'
 'Slovakia' 'Bulgaria' 'Albania' 'Northern Ireland' 'Ukraine' 'Slovenia'
 'Austria' 'Latvia']
Teams in all_matches: ['France' 'Czechoslovakia' 'Soviet Union' 'Spain' 'Denmark' 'Hungary'
 'Italy' 'Yugoslavia' 'England' 'Belgium' 'West Germany' 'Netherlands'
 'Greece' 'Romania' 'Portugal' 'Republic of Ireland' 'Sweden' 'CIS'
 'Scotland' 'Switzerland' 'Bulgaria' 'Germany' 'Czech Republic' 'Russia'
 'Turkey' 'Croatia' 'FR Yugoslavia' 'Slovenia' 'Norway' 'Latvia' 'Austria'
 'Poland' 'Ukraine' 'Albania' 'Wales' 'Slovakia' 'Northern Ireland'
 'Iceland']


In [38]:
# Group-stage draw: group label -> the four teams in that group
groups = {
    'Group A': ['Germany', 'Switzerland', 'Scotland', 'Hungary'],
    'Group B': ['Spain', 'Italy', 'Croatia', 'Albania'],
    'Group C': ['England', 'Slovenia', 'Serbia', 'Denmark'],
    'Group D': ['Netherlands', 'France', 'Austria', 'Poland'],
    'Group E': ['Belgium', 'Ukraine', 'Slovakia', 'Romania'],
    'Group F': ['Portugal', 'Czech Republic', 'Georgia', 'Turkey']
}

# Build the fixture list. Every ordered (home, away) pairing inside a group
# is listed three times in a row; because both orderings of a pair are
# generated, each pair of teams appears six times in total.
matches = []
for group_teams in groups.values():
    for home_team, away_team in itertools.permutations(group_teams, 2):
        matches.extend([home_team, away_team] for _ in range(3))

group_stage_matches = pd.DataFrame(
    matches, columns=['HomeTeamName', 'AwayTeamName'])

# Fixture teams (taken from the home column) that have no row in
# participated_teams_stats
fixture_team_names = set(group_stage_matches['HomeTeamName'].unique())
stats_team_name_set = set(participated_teams_stats['Team'].unique())
mismatched_teams = fixture_team_names - stats_team_name_set
print("Mismatched teams:", mismatched_teams)

Mismatched teams: {'Georgia'}


In [39]:
# Tournament-wide rates across every team in participated_teams_stats.
# These are the fallback for any team that has no usable history.
total_played = participated_teams_stats['Played'].sum()
default_avg_goals_for = participated_teams_stats['Goal_For'].sum() / total_played
default_avg_goals_against = participated_teams_stats['Goal_Against'].sum() / total_played
default_win_percentage = participated_teams_stats['Win'].sum() / total_played * 100

# Per-team rates, computed once instead of re-filtering the stats frame for
# every fixture row. Totals are summed per team first, so a team that spans
# several rows is handled the same way as before.
team_totals = participated_teams_stats.groupby('Team')[
    ['Goal_For', 'Goal_Against', 'Win', 'Played']].sum()
team_rates = pd.DataFrame({
    'Avg_Goals_For': team_totals['Goal_For'] / team_totals['Played'],
    'Avg_Goals_Against': team_totals['Goal_Against'] / team_totals['Played'],
    'Win_Percentage': team_totals['Win'] / team_totals['Played'] * 100,
})

# Fallback value for each statistic
fallback_rates = {
    'Avg_Goals_For': default_avg_goals_for,
    'Avg_Goals_Against': default_avg_goals_against,
    'Win_Percentage': default_win_percentage,
}

# Attach the three statistics for the home side, then for the away side.
# Any team missing from the stats table (e.g., Georgia) maps to NaN and then
# receives the tournament-wide default, so the fallback no longer depends on
# a hard-coded team name.
for side, team_column in (('Home', 'HomeTeamName'), ('Away', 'AwayTeamName')):
    for stat_name, fallback_value in fallback_rates.items():
        group_stage_matches[f'{stat_name}_{side}'] = (
            group_stage_matches[team_column]
            .map(team_rates[stat_name])
            .fillna(fallback_value)
        )

print(group_stage_matches.head())

  HomeTeamName AwayTeamName  Avg_Goals_For_Home  Avg_Goals_Against_Home  \
0      Germany  Switzerland            1.469388                0.979592   
1      Germany  Switzerland            1.469388                0.979592   
2      Germany  Switzerland            1.469388                0.979592   
3      Germany     Scotland            1.469388                0.979592   
4      Germany     Scotland            1.469388                0.979592   

   Win_Percentage_Home  Avg_Goals_For_Away  Avg_Goals_Against_Away  \
0            53.061224            0.615385                1.153846   
1            53.061224            0.615385                1.153846   
2            53.061224            0.615385                1.153846   
3            53.061224            0.666667                0.833333   
4            53.061224            0.666667                0.833333   

   Win_Percentage_Away  
0            15.384615  
1            15.384615  
2            15.384615  
3            33.333333  
4  

In [41]:
# Ensure the team names are consistent (stripping is idempotent)
for team_column in ('HomeTeamName', 'AwayTeamName'):
    all_matches[team_column] = all_matches[team_column].str.strip()

# Per-team rates derived from the aggregate statistics
participated_teams_stats['Avg_Goals_For'] = (
    participated_teams_stats['Goal_For'] / participated_teams_stats['Played'])
participated_teams_stats['Avg_Goals_Against'] = (
    participated_teams_stats['Goal_Against'] / participated_teams_stats['Played'])
participated_teams_stats['Win_Percentage'] = (
    participated_teams_stats['Win'] / participated_teams_stats['Played'] * 100)

stat_columns = ['Avg_Goals_For', 'Avg_Goals_Against', 'Win_Percentage']
team_rate_table = participated_teams_stats[['Team'] + stat_columns]

# Remove columns produced by an earlier run of this cell so that executing
# it again does not collide with its own output.
generated_columns = [f'{name}_{side}'
                     for side in ('Home', 'Away')
                     for name in ['Team'] + stat_columns]
all_matches = all_matches.drop(columns=generated_columns, errors='ignore')

# Merge the team stats with match data, once per side. The suffix is applied
# to the stats columns up front, so the resulting names do not depend on a
# name collision between two consecutive merges.
for side, team_column in (('Home', 'HomeTeamName'), ('Away', 'AwayTeamName')):
    all_matches = all_matches.merge(
        team_rate_table.add_suffix(f'_{side}'),
        left_on=team_column, right_on=f'Team_{side}', how='left')

# Select features for modeling
features = ['Avg_Goals_For_Home', 'Avg_Goals_Against_Home', 'Win_Percentage_Home',
            'Avg_Goals_For_Away', 'Avg_Goals_Against_Away', 'Win_Percentage_Away']

# Teams that appear in all_matches but not in participated_teams_stats
# (e.g. 'West Germany', 'Soviet Union') come out of the left merge as NaN,
# which LinearRegression cannot fit. Impute them with the tournament-wide
# averages, the same fallback used for teams without history in the
# group-stage fixtures. all_matches itself keeps the NaN values.
# NOTE(review): mapping predecessor names onto their successor's stats row
# may be more accurate - confirm how participated_teams_stats counts them.
played_total = participated_teams_stats['Played'].sum()
tournament_rates = {
    'Avg_Goals_For': participated_teams_stats['Goal_For'].sum() / played_total,
    'Avg_Goals_Against': participated_teams_stats['Goal_Against'].sum() / played_total,
    'Win_Percentage': participated_teams_stats['Win'].sum() / played_total * 100,
}
feature_fallbacks = {f'{stat_name}_{side}': rate
                     for side in ('Home', 'Away')
                     for stat_name, rate in tournament_rates.items()}
imputed_match_count = int(all_matches[features].isna().any(axis=1).sum())
X = all_matches[features].fillna(value=feature_fallbacks)
assert not X.isna().any().any(), "feature matrix still contains NaN"
print("Matches with imputed team stats:", imputed_match_count)

# Create target variables
y_home_goals = all_matches['HomeTeamGoals']
y_away_goals = all_matches['AwayTeamGoals']
y_outcome = all_matches.apply(
    lambda row: 'HomeWin' if row['HomeTeamGoals'] > row['AwayTeamGoals'] else (
        'AwayWin' if row['HomeTeamGoals'] < row['AwayTeamGoals'] else 'Draw'),
    axis=1)

# Ensure lengths match
print("Length of X:", len(X))
print("Length of y_home_goals:", len(y_home_goals))
print("Length of y_away_goals:", len(y_away_goals))
print("Length of y_outcome:", len(y_outcome))

Length of X: 286
Length of y_home_goals: 286
Length of y_away_goals: 286
Length of y_outcome: 286


#### Split the data

In [42]:
from sklearn.model_selection import train_test_split

# Split the features and all three targets in a single call. Every array
# passed to one train_test_split call is partitioned with the same row
# assignment, so X_train / X_test line up with each target's train / test
# part.
(X_train, X_test,
 y_train_home_goals, y_test_home_goals,
 y_train_away_goals, y_test_away_goals,
 y_train_outcome, y_test_outcome) = train_test_split(
    X, y_home_goals, y_away_goals, y_outcome,
    test_size=0.2, random_state=42)

#### Model Building

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# One linear regressor per side's goal count; fit() returns the fitted
# estimator, so construction and training are chained.
home_goals_model = LinearRegression().fit(X_train, y_train_home_goals)
away_goals_model = LinearRegression().fit(X_train, y_train_away_goals)

# Seeded random forest for the three-way match outcome
outcome_model = RandomForestClassifier(
    n_estimators=100, random_state=42).fit(X_train, y_train_outcome)

# Predictions on the held-out matches, in the same order as the models
y_pred_home_goals, y_pred_away_goals, y_pred_outcome = (
    fitted_model.predict(X_test)
    for fitted_model in (home_goals_model, away_goals_model, outcome_model)
)

# Only the outcome classifier is scored here
outcome_accuracy = accuracy_score(y_test_outcome, y_pred_outcome)
print("Outcome Prediction Accuracy:", outcome_accuracy)
outcome_report = classification_report(y_test_outcome, y_pred_outcome)
print("Outcome Prediction Report:\n", outcome_report)