# Euro 2024 Analysis

#### Loading Datasets

In [None]:
import itertools
import pandas as pd

# Load datasets: read each CSV in order and bind it to its own frame.
(
    euro2020_top_players,
    all_matches,
    national_teams_appearance,
    participated_teams_stats,
    top_goal_scorers,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'euro2020-top-players.csv',
        'all_matches.csv',
        'national_teams_appearance.csv',
        'participated_teams_stats.csv',
        'top_goal_scorers.csv',
    )
)

#### Ensure Teams Are Consistent

In [None]:
# Trim stray whitespace from every team-name column so that the names in the
# stats table and in the match table compare equal when joined.
for frame, name_column in (
    (participated_teams_stats, 'Team'),
    (all_matches, 'HomeTeamName'),
    (all_matches, 'AwayTeamName'),
):
    frame[name_column] = frame[name_column].str.strip()

#### Calculate average goals for and against, and win percentage

In [None]:
# Per-game rates for every team: goals scored, goals conceded, and the share
# of games won expressed as a percentage (0-100).
games_played = participated_teams_stats['Played']
participated_teams_stats['Avg_Goals_For'] = (
    participated_teams_stats['Goal_For'].div(games_played))
participated_teams_stats['Avg_Goals_Against'] = (
    participated_teams_stats['Goal_Against'].div(games_played))
participated_teams_stats['Win_Percentage'] = (
    participated_teams_stats['Win'].div(games_played).mul(100))

#### Merge the team stats with match data

In [None]:
# Attach each side's historical stats to every match.
# The stat columns are suffixed explicitly ('_Home' / '_Away') before each
# merge, so the resulting names do not depend on two merges happening to
# collide on the same column names. The left join keeps matches whose team
# has no stats row; their stat columns are NaN.
team_stat_columns = ['Team', 'Avg_Goals_For',
                     'Avg_Goals_Against', 'Win_Percentage']
for side in ('Home', 'Away'):
    side_stats = participated_teams_stats[team_stat_columns].add_suffix(
        '_' + side)
    all_matches = all_matches.merge(
        side_stats,
        left_on=side + 'TeamName',
        right_on='Team_' + side,
        how='left')

#### Modelling

In [None]:
# Model inputs: per-game scoring, conceding and win rate for each side.
features = ['Avg_Goals_For_Home', 'Avg_Goals_Against_Home', 'Win_Percentage_Home',
            'Avg_Goals_For_Away', 'Avg_Goals_Against_Away', 'Win_Percentage_Away']

# Create target variables
y_home_goals = all_matches['HomeTeamGoals']
y_away_goals = all_matches['AwayTeamGoals']
# Outcome label from the home side's point of view. Built column-wise:
# start from 'Draw' and overwrite the rows where one side scored more.
y_outcome = (
    pd.Series('Draw', index=all_matches.index)
    .mask(y_home_goals > y_away_goals, 'HomeWin')
    .mask(y_home_goals < y_away_goals, 'AwayWin')
)

# Handle missing values in X: matches involving a team with no stats row get
# the mean of each stat across all teams in the stats table.
default_avg_goals_for = participated_teams_stats['Avg_Goals_For'].mean()
default_avg_goals_against = participated_teams_stats['Avg_Goals_Against'].mean()
default_win_percentage = participated_teams_stats['Win_Percentage'].mean()

# fillna is used in its returning form: filling a column subset of
# all_matches with inplace=True is the chained-assignment pattern that pandas
# warns about (SettingWithCopyWarning).
X = all_matches[features].fillna({
    'Avg_Goals_For_Home': default_avg_goals_for,
    'Avg_Goals_Against_Home': default_avg_goals_against,
    'Win_Percentage_Home': default_win_percentage,
    'Avg_Goals_For_Away': default_avg_goals_for,
    'Avg_Goals_Against_Away': default_avg_goals_against,
    'Win_Percentage_Away': default_win_percentage
})

# Verify that there are no missing values
print("Missing values in X after filling:")
print(X.isna().sum())

#### Train The Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             mean_absolute_error)

# Split the features and all three targets in a single call so every target
# is guaranteed to share the same train/test rows (previously this relied on
# repeating the same random_state in three separate calls).
(X_train, X_test,
 y_train_home_goals, y_test_home_goals,
 y_train_away_goals, y_test_away_goals,
 y_train_outcome, y_test_outcome) = train_test_split(
    X, y_home_goals, y_away_goals, y_outcome,
    test_size=0.2, random_state=42)

# Initialize and train the models: one regressor per side's goal count and
# one classifier for the HomeWin / AwayWin / Draw label.
home_goals_model = LinearRegression()
home_goals_model.fit(X_train, y_train_home_goals)

away_goals_model = LinearRegression()
away_goals_model.fit(X_train, y_train_away_goals)

outcome_model = RandomForestClassifier(n_estimators=100, random_state=42)
outcome_model.fit(X_train, y_train_outcome)

# Predict and evaluate the models
y_pred_home_goals = home_goals_model.predict(X_test)
y_pred_away_goals = away_goals_model.predict(X_test)
y_pred_outcome = outcome_model.predict(X_test)

# Score the goal regressors as well; their predictions were previously
# computed but never evaluated.
print("Home Goals MAE:",
      mean_absolute_error(y_test_home_goals, y_pred_home_goals))
print("Away Goals MAE:",
      mean_absolute_error(y_test_away_goals, y_pred_away_goals))
print("Outcome Prediction Accuracy:",
      accuracy_score(y_test_outcome, y_pred_outcome))
print("Outcome Prediction Report:\n",
      classification_report(y_test_outcome, y_pred_outcome))

### Predicting Group Stage Matches

In [None]:
# Group-stage draw: group name -> the four teams in that group.
groups = {
    'Group A': ['Germany', 'Switzerland', 'Scotland', 'Hungary'],
    'Group B': ['Spain', 'Italy', 'Croatia', 'Albania'],
    'Group C': ['England', 'Slovenia', 'Serbia', 'Denmark'],
    'Group D': ['Netherlands', 'France', 'Austria', 'Poland'],
    'Group E': ['Belgium', 'Ukraine', 'Slovakia', 'Romania'],
    'Group F': ['Portugal', 'Czech Republic', 'Georgia', 'Turkey']
}

# Generate all possible matches for each group (each team plays every other
# team once). The first team of each pair is treated as the "home" side.
matches = [
    [home_team, away_team]
    for teams in groups.values()
    for home_team, away_team in itertools.combinations(teams, 2)
]

group_stage_matches = pd.DataFrame(
    matches, columns=['HomeTeamName', 'AwayTeamName'])

# Check for any mismatches between fixture names and the stats table.
# Both columns are checked: the last team listed in each group only ever
# appears as the away side, so a home-only check would never report it.
fixture_teams = set(group_stage_matches['HomeTeamName']).union(
    group_stage_matches['AwayTeamName'])
mismatched_teams = fixture_teams.difference(
    set(participated_teams_stats['Team'].unique()))
print("Mismatched teams:", mismatched_teams)

- Handle missing team data for Georgia: its features are filled with the mean of each stat across all teams in the stats table.

In [None]:
# Handle missing team data (e.g. Georgia): look every fixture team up in the
# stats table and fall back to the all-team averages when a team has no row.
# This covers any missing team, not only a hard-coded name, and avoids the
# IndexError the previous `.values[0]` lookup raised for unknown teams.
# drop_duplicates keeps the first row per team, matching that old lookup.
team_stats_lookup = participated_teams_stats.drop_duplicates(
    'Team').set_index('Team')
stat_defaults = {
    'Avg_Goals_For': default_avg_goals_for,
    'Avg_Goals_Against': default_avg_goals_against,
    'Win_Percentage': default_win_percentage,
}
for side in ('Home', 'Away'):
    side_team_names = group_stage_matches[side + 'TeamName']
    for stat_name, stat_default in stat_defaults.items():
        # Series.map leaves NaN for names absent from the lookup index.
        group_stage_matches[stat_name + '_' + side] = side_team_names.map(
            team_stats_lookup[stat_name]).fillna(stat_default)

#### Predict goals for group stage matches

In [None]:
# Predict goals for group stage matches
group_stage_features = group_stage_matches[features]

for goals_column, goals_model in (
    ('PredictedHomeGoals', home_goals_model),
    ('PredictedAwayGoals', away_goals_model),
):
    raw_goals = pd.Series(goals_model.predict(group_stage_features),
                          index=group_stage_matches.index)
    # Round to whole goals, then ensure no negative goals.
    group_stage_matches[goals_column] = (
        raw_goals.round().astype(int).clip(lower=0))

# Predict match outcomes (independently of the predicted goal counts)
group_stage_matches['PredictedResult'] = outcome_model.predict(
    group_stage_features)


#### Predicting The Scores of Every Match

In [None]:
# Show each fixture with its predicted score line and predicted result.
prediction_columns = [
    'HomeTeamName',
    'AwayTeamName',
    'PredictedHomeGoals',
    'PredictedAwayGoals',
    'PredictedResult',
]
print(group_stage_matches[prediction_columns])

### Improving Current Model

#### Feature Engineering - Recent Form Metrics

In [None]:
# Define a function to calculate recent form metrics
def calculate_recent_form(team_name, matches, n_matches=5):
    """Summarise a team's form over its last ``n_matches`` matches.

    The last ``n_matches`` rows of ``matches`` that involve ``team_name``
    (as home or away side) are used, so ``matches`` is assumed to be in
    chronological order -- TODO confirm against the source data.

    Args:
        team_name: Name to look for in 'HomeTeamName' / 'AwayTeamName'.
        matches: DataFrame with 'HomeTeamName', 'AwayTeamName',
            'HomeTeamGoals' and 'AwayTeamGoals' columns.
        n_matches: Maximum number of most recent matches to consider.

    Returns:
        Tuple ``(avg_goals_for, avg_goals_against, win_percentage)``.
        Averages are taken over the matches actually found, so a team with
        fewer than ``n_matches`` matches is not diluted by missing games.
        Returns ``(0.0, 0.0, 0.0)`` when no match is found.
    """
    involved = (matches['HomeTeamName'] == team_name) | (
        matches['AwayTeamName'] == team_name)
    recent_matches = matches[involved].tail(n_matches)

    played = len(recent_matches)
    if played == 0:
        # Nothing to average over (unknown team, or n_matches == 0).
        return 0.0, 0.0, 0.0

    # Re-express each match from the team's own point of view.
    at_home = recent_matches['HomeTeamName'] == team_name
    goals_for = recent_matches['HomeTeamGoals'].where(
        at_home, recent_matches['AwayTeamGoals'])
    goals_against = recent_matches['AwayTeamGoals'].where(
        at_home, recent_matches['HomeTeamGoals'])
    wins = int((goals_for > goals_against).sum())

    avg_goals_for = float(goals_for.sum()) / played
    avg_goals_against = float(goals_against.sum()) / played
    win_percentage = (wins / played) * 100

    return avg_goals_for, avg_goals_against, win_percentage


# Build one recent-form record per team in the stats table, then tabulate.
form_columns = ('Recent_Avg_Goals_For',
                'Recent_Avg_Goals_Against',
                'Recent_Win_Percentage')
recent_form_metrics = [
    {'Team': team,
     **dict(zip(form_columns, calculate_recent_form(team, all_matches)))}
    for team in participated_teams_stats['Team'].unique()
]

recent_form_metrics_df = pd.DataFrame(recent_form_metrics)
print(recent_form_metrics_df.head())

Was I Correct?