In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load datasets
df_train = pd.read_csv("EPL_assignment/epl_matches_train.csv")
df_test = pd.read_csv("EPL_assignment/epl_matches_test.csv")  # No merging/modifications allowed
df_teams = pd.read_csv("EPL_assignment/epl_teams.csv")
df_players = pd.read_csv("EPL_assignment/epl_players.csv")
df_shots = pd.read_csv("EPL_assignment/epl_potential_shots.csv")
df_goals = pd.read_csv("EPL_assignment/epl_goals.csv")

# Convert dates
df_teams["date"] = pd.to_datetime(df_teams["date"])
df_players["date"] = pd.to_datetime(df_players["date"])

# -------------------------
# Team Feature Engineering
# -------------------------

# Create target variable 'match_result'
def get_match_result(home_goals, away_goals):
    if home_goals > away_goals:
        return "Win"
    elif home_goals < away_goals:
        return "Lose"
    else:
        return "Draw"

df_train["match_result"] = df_train.apply(lambda x: get_match_result(x["home_team_goal"], x["away_team_goal"]), axis=1)

# Aggregate past team performance (averages for model training)

# Select only numeric columns for aggregation
numeric_columns = df_teams.select_dtypes(include='number').columns

# Group by team_id and aggregate on numeric columns
team_stats = df_teams[numeric_columns].groupby('team_id').mean().reset_index()

# Merge historical team performance with training data
df_train = df_train.merge(team_stats, left_on="home_team_id", right_on="team_id", suffixes=("", "_home"))
df_train = df_train.merge(team_stats, left_on="away_team_id", right_on="team_id", suffixes=("", "_away"))

# Drop unnecessary team_id columns
df_train.drop(columns=["team_id", "team_id_away"], inplace=True)

# -------------------------
# Player Feature Engineering
# -------------------------
player_stats = ["dribbling", "finishing", "short_passing", "ball_control", "acceleration", "stamina"]
df_players_avg = df_players.groupby("player_id")[player_stats].mean().reset_index()

def compute_team_player_avg(df, prefix):
    player_cols = [f"{prefix}_player_{i}" for i in range(1, 12)]
    return df[player_cols].apply(lambda x: df_players_avg[df_players_avg["player_id"].isin(x)].mean(), axis=1)[player_stats]

df_train[player_stats] = compute_team_player_avg(df_train, "home")
df_train[[f"{stat}_away" for stat in player_stats]] = compute_team_player_avg(df_train, "away")

# Drop player ID columns (since they are categorical)
df_train.drop(columns=[f"home_player_{i}" for i in range(1, 12)] + [f"away_player_{i}" for i in range(1, 12)], inplace=True)

# -------------------------
# Shots & Goals Data
# -------------------------
# Compute historical average shots per team
df_shots_agg = df_shots.groupby("team_id")["shot_number"].count().reset_index()
df_shots_agg.rename(columns={"shot_number": "avg_shots_per_game"}, inplace=True)

# Compute historical goals per team
df_goals_agg = df_goals.groupby("team_id")["goal_number"].count().reset_index()
df_goals_agg.rename(columns={"goal_number": "avg_goals_per_game"}, inplace=True)

# Merge shots & goals with team stats
team_performance = team_stats.merge(df_shots_agg, on="team_id", how="left")
team_performance = team_performance.merge(df_goals_agg, on="team_id", how="left")
team_performance.fillna(0, inplace=True)

# Merge past shots & goals into training data
df_train = df_train.merge(team_performance, left_on="home_team_id", right_on="team_id", suffixes=("", "_home"))
df_train = df_train.merge(team_performance, left_on="away_team_id", right_on="team_id", suffixes=("", "_away"))
df_train.drop(columns=["team_id", "team_id_away"], inplace=True)

# -------------------------
# Train Model
# -------------------------
label_encoder = LabelEncoder()
df_train["match_result"] = label_encoder.fit_transform(df_train["match_result"])

# This turns "Win" into 2, "Lose" into 1, and "Draw" into 0

print(df_train.head())
print(df_train.columns)
print(df_train["match_result"])

X = df_train.drop(columns=["match_result", "season", "date"])
y = df_train["match_result"]


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# -------------------------
# Predict on df_test (WITHOUT MERGING)
# -------------------------
X_test = df_test.copy()

print(X_test.head())

# Replace team_id with their past avg performance
X_test = X_test.merge(team_performance, left_on="home_team_id", right_on="team_id", how="left", suffixes=("", "_home"))
X_test = X_test.merge(team_performance, left_on="away_team_id", right_on="team_id", how="left", suffixes=("", "_away"))

X_test.drop(columns=["team_id", "team_id_away"], inplace=True)
X_test.fillna(0, inplace=True)  # Fill missing values with 0

# Predict match results
y_test_pred = clf.predict(df_test.drop(columns=["season", "date", "match_id"]))
df_test["predicted_match_result"] = label_encoder.inverse_transform(y_test_pred)

# Save results
df_test[["match_id", "home_team_id", "away_team_id", "predicted_match_result"]].to_csv("predicted_results.csv", index=False)

      season  stage                 date  match_id  home_team_id  \
0  2008/2009      1  2008-08-17 00:00:00     49337         10260   
1  2010/2011      1  2010-08-16 00:00:00     49586         10260   
2  2011/2012     13  2011-11-26 00:00:00     58063         10260   
3  2012/2013     19  2012-12-26 00:00:00     59763         10260   
4  2013/2014     15  2013-12-07 00:00:00     49022         10260   

   away_team_id  home_player_X1  home_player_X2  home_player_X3  \
0         10261               1               2               4   
1         10261               1               2               4   
2         10261               1               2               4   
3         10261               1               2               4   
4         10261               1               2               4   

   home_player_X4  ...  buildUpPlayDribbling_away  buildUpPlayPassing_away  \
0               6  ...                       39.0                55.333333   
1               6  ...          

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- away_player_1
- away_player_10
- away_player_11
- away_player_2
- away_player_3
- ...
Feature names seen at fit time, yet now missing:
- acceleration
- acceleration_away
- avg_goals_per_game
- avg_goals_per_game_away
- avg_shots_per_game
- ...
