In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load datasets
df_train = pd.read_csv("EPL_assignment/epl_matches_train.csv")
df_test = pd.read_csv("EPL_assignment/epl_matches_test.csv")
df_teams = pd.read_csv("EPL_assignment/epl_teams.csv")
df_players = pd.read_csv("EPL_assignment/epl_players.csv")
df_shots = pd.read_csv("EPL_assignment/epl_potential_shots.csv")
df_goals = pd.read_csv("EPL_assignment/epl_goals.csv")

# Ensure no missing values
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)
df_teams.dropna(inplace=True)
df_players.dropna(inplace=True)
df_shots.dropna(inplace=True)
df_goals.dropna(inplace=True)

# Create target variable 'match_result'
def get_match_result(home_goals, away_goals):
    if home_goals > away_goals:
        return "Win"
    elif home_goals < away_goals:
        return "Lose"
    else:
        return "Draw"

df_train["match_result"] = df_train.apply(lambda x: get_match_result(x["home_team_goal"], x["away_team_goal"]), axis=1)

# Convert date columns to datetime format
df_teams["date"] = pd.to_datetime(df_teams["date"])
df_players["date"] = pd.to_datetime(df_players["date"])

# -------------------------
# Merge Team & Player Data
# -------------------------
df_train = df_train.merge(df_teams, left_on="home_team_id", right_on="team_id", suffixes=("", "_home"))
df_train = df_train.merge(df_teams, left_on="away_team_id", right_on="team_id", suffixes=("", "_away"))
df_test = df_test.merge(df_teams, left_on="home_team_id", right_on="team_id", suffixes=("", "_home"))
df_test = df_test.merge(df_teams, left_on="away_team_id", right_on="team_id", suffixes=("", "_away"))

# Drop redundant team_id columns
df_train.drop(columns=["team_id", "team_id_away"], inplace=True)
df_test.drop(columns=["team_id", "team_id_away"], inplace=True)

# Merge Player Data - Compute Average Skill per Team
player_stats = ["dribbling", "finishing", "short_passing", "ball_control", "acceleration", "stamina"]
df_players_avg = df_players.groupby("player_id")[player_stats].mean().reset_index()

def get_team_avg(df, prefix):
    player_columns = [f"{prefix}_player_{i}" for i in range(1, 12)]
    team_avg = df[player_columns].apply(lambda x: df_players_avg[df_players_avg["player_id"].isin(x)].mean(), axis=1)
    return team_avg[player_stats]

df_train[player_stats] = get_team_avg(df_train, "home")
df_train[[f"{stat}_away" for stat in player_stats]] = get_team_avg(df_train, "away")
df_test[player_stats] = get_team_avg(df_test, "home")
df_test[[f"{stat}_away" for stat in player_stats]] = get_team_avg(df_test, "away")

# Drop player ID columns
df_train.drop(columns=[f"home_player_{i}" for i in range(1, 12)] + [f"away_player_{i}" for i in range(1, 12)], inplace=True)
df_test.drop(columns=[f"home_player_{i}" for i in range(1, 12)] + [f"away_player_{i}" for i in range(1, 12)], inplace=True)

# -------------------------
# Merge Shots Data
# -------------------------
df_shots["shot_type"] = df_shots["shot_type"].fillna("other")  # Fill missing shot types

# Count total shots per team
df_shots_agg = df_shots.groupby(["match_id", "team_id"])["shot_number"].count().reset_index()
df_shots_agg.rename(columns={"shot_number": "total_shots"}, inplace=True)

# Pivot shot types into separate columns
df_shots_types = df_shots.pivot_table(index=["match_id", "team_id"], columns="shot_type", aggfunc="size", fill_value=0).reset_index()

# Merge shots data into training and test sets
df_train = df_train.merge(df_shots_agg, left_on=["match_id", "home_team_id"], right_on=["match_id", "team_id"], how="left").rename(columns={"total_shots": "total_shots_home"})
df_train = df_train.merge(df_shots_agg, left_on=["match_id", "away_team_id"], right_on=["match_id", "team_id"], how="left").rename(columns={"total_shots": "total_shots_away"})

df_test = df_test.merge(df_shots_agg, left_on=["match_id", "home_team_id"], right_on=["match_id", "team_id"], how="left").rename(columns={"total_shots": "total_shots_home"})
df_test = df_test.merge(df_shots_agg, left_on=["match_id", "away_team_id"], right_on=["match_id", "team_id"], how="left").rename(columns={"total_shots": "total_shots_away"})

# Merge shot types data
df_train = df_train.merge(df_shots_types, left_on=["match_id", "home_team_id"], right_on=["match_id", "team_id"], how="left")
df_test = df_test.merge(df_shots_types, left_on=["match_id", "home_team_id"], right_on=["match_id", "team_id"], how="left")

# -------------------------
# Merge Goals Data
# -------------------------
df_goals_agg = df_goals.groupby(["match_id", "team_id"])["goal_number"].count().reset_index()
df_goals_agg.rename(columns={"goal_number": "total_goals"}, inplace=True)
df_goals_agg.rename(columns={"team_id": "goal_team_id"}, inplace=True)

# Merge goals data into training and test sets, ensuring no duplication of team_id
df_train = df_train.merge(df_goals_agg, left_on=["match_id", "home_team_id"], right_on=["match_id", "goal_team_id"], how="left").rename(columns={"total_goals": "total_goals_home"})
df_train = df_train.merge(df_goals_agg, left_on=["match_id", "away_team_id"], right_on=["match_id", "goal_team_id"], how="left").rename(columns={"total_goals": "total_goals_away"})

df_test = df_test.merge(df_goals_agg, left_on=["match_id", "home_team_id"], right_on=["match_id", "goal_team_id"], how="left").rename(columns={"total_goals": "total_goals_home"})
df_test = df_test.merge(df_goals_agg, left_on=["match_id", "away_team_id"], right_on=["match_id", "goal_team_id"], how="left").rename(columns={"total_goals": "total_goals_away"})

# -------------------------
# Compute Shot Efficiency
# -------------------------
df_train["goal_efficiency_home"] = df_train["total_goals_home"] / df_train["total_shots_home"]
df_train["goal_efficiency_away"] = df_train["total_goals_away"] / df_train["total_shots_away"]
df_test["goal_efficiency_home"] = df_test["total_goals_home"] / df_test["total_shots_home"]
df_test["goal_efficiency_away"] = df_test["total_goals_away"] / df_test["total_shots_away"]

# Fill NaNs with 0
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

# -------------------------
# Train Model & Predict
# -------------------------
label_encoder = LabelEncoder()
df_train["match_result"] = label_encoder.fit_transform(df_train["match_result"])

# Select only numeric columns (int64, float64)
df_train_numeric = df_train.select_dtypes(include=['number'])
df_test_numeric = df_test.select_dtypes(include=['number'])

# Now df_train_numeric and df_test_numeric contains only numeric columns

# Get the columns from df_test
df_test_columns = df_test_numeric.columns

# Remove the 'match_result' column from the list of df_test columns
df_test_columns = df_test_columns[df_test_columns != 'match_result']

# Filter df_train to only include columns that are also in df_test
df_train_filtered = df_train_numeric[df_test_columns]

# Now df_train_filtered will have only the columns that exist in both df_train and df_test

X = df_train_filtered.drop(columns=["match_result"])
y = df_train_filtered["match_result"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(df_test_numeric)

df_test_numeric["predicted_match_result"] = label_encoder.inverse_transform(y_test_pred)
df_test_numeric[["home_team_id", "away_team_id", "predicted_match_result"]].to_csv("predicted_results.csv", index=False)


KeyError: "['match_result'] not found in axis"

In [None]:
df_train_numeric

In [None]:
df_test_numeric