# Module 1: Sports
#### Group 7

### AI Declaration
We leveraged ChatGPT to help us with library methods and error metrics:
https://chatgpt.com/c/69723b60-befc-8328-9621-8985766b82b2

In [None]:
import pandas as pd
import numpy as np
import itertools

from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

### 1) Load and Pre-process Data


In [None]:
PLAYER_PATH = "data/player_data.csv"
STINT_PATH  = "data/stint_data.csv"

# Read the raw player table and the stint log from disk.
player_df = pd.read_csv(PLAYER_PATH)
stint_df = pd.read_csv(STINT_PATH)

# Canonicalise the headers of both frames (trim whitespace, lower-case) so the
# rest of the notebook can rely on a single spelling of every column name.
for frame in (player_df, stint_df):
    frame.columns = frame.columns.str.strip().str.lower()

# Team names are compared against string literals later on, so force them to
# str and trim any stray surrounding whitespace.
for team_col in ("h_team", "a_team"):
    stint_df[team_col] = stint_df[team_col].astype(str).str.strip()

# Ratings must be numeric; anything unparseable becomes NaN instead of raising.
player_df["rating"] = pd.to_numeric(player_df["rating"], errors="coerce")

# Lookup table: player name -> rating.
ratings_by_player = player_df.set_index("player")["rating"]
rating_map = ratings_by_player.to_dict()

### 2) Stints where Canada played

In [None]:
# Boolean masks marking the stints in which Canada is the home / away side.
is_can_home = stint_df["h_team"].eq("Canada")
is_can_away = stint_df["a_team"].eq("Canada")

# Keep every stint Canada took part in; copy so later column assignments do
# not write into a view of stint_df.
canada_df = stint_df.loc[is_can_home | is_can_away].copy()

### 3) Set y = Canada's goal differential per minute

In [None]:
# Goal margin from Canada's point of view: home minus away when Canada is the
# home team, away minus home otherwise.
canada_hosts = canada_df["h_team"].eq("Canada")
margin_if_home = canada_df["h_goals"] - canada_df["a_goals"]
margin_if_away = canada_df["a_goals"] - canada_df["h_goals"]
canada_df["can_goal_diff"] = np.where(
    canada_hosts, margin_if_home, margin_if_away
).astype(float)

# Stint length as a number (unparseable entries become NaN), then the
# regression target: Canada's goal differential per minute of the stint.
canada_df["minutes"] = pd.to_numeric(canada_df["minutes"], errors="coerce")
canada_df["y"] = canada_df["can_goal_diff"].div(canada_df["minutes"])

# Discard unusable rows: infinities (division by zero minutes) are turned into
# NaN, rows lacking y or minutes are dropped, and only positive-length stints
# are kept.
canada_df = (
    canada_df
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=["y", "minutes"])
)
canada_df = canada_df.loc[canada_df["minutes"].gt(0)].copy()

# Sample weights (stint minutes) and the target vector used by the model.
w = canada_df["minutes"].astype(float)
y = canada_df["y"].astype(float)

### 4) Plus/minus metric

In [None]:
home_cols = ["home1", "home2", "home3", "home4"]
away_cols = ["away1", "away2", "away3", "away4"]


def get_can_players(row):
    """Return the four players on Canada's side of the given stint row.

    Reads the home slots when ``row["h_team"]`` is "Canada" and the away
    slots otherwise.
    """
    canada_slots = home_cols if row["h_team"] == "Canada" else away_cols
    return [row[slot] for slot in canada_slots]


def get_opp_players(row):
    """Return the four players on the opposing side of the given stint row.

    Reads the away slots when ``row["h_team"]`` is "Canada" and the home
    slots otherwise.
    """
    opponent_slots = away_cols if row["h_team"] == "Canada" else home_cols
    return [row[slot] for slot in opponent_slots]

# Attach the Canadian and opposing player lists to every stint, in that order.
for lineup_col, extract_players in (
    ("can_players", get_can_players),
    ("opp_players", get_opp_players),
):
    canada_df[lineup_col] = canada_df.apply(extract_players, axis=1)

# Order-independent identifier for a Canadian lineup: its players as an
# alphabetically sorted tuple.
canada_df["can_lineup_key"] = canada_df["can_players"].map(
    lambda players: tuple(sorted(players))
)

In [None]:
# Collect every distinct player name that appears on either side of a stint.
canada_side = {p for lineup in canada_df["can_players"] for p in lineup}
opponent_side = {p for lineup in canada_df["opp_players"] for p in lineup}
all_players = sorted(canada_side | opponent_side)

# Regression design matrix: one row per stint, one column per player, all zero
# to begin with.
X = pd.DataFrame(0.0, index=canada_df.index, columns=all_players)

# Plus/minus encoding: within each stint, add 1 for every Canadian player on
# court and subtract 1 for every opposing player.
stint_lineups = zip(
    canada_df.index, canada_df["can_players"], canada_df["opp_players"]
)
for idx, can_lineup, opp_lineup in stint_lineups:
    for p in can_lineup:
        X.at[idx, p] += 1.0
    for p in opp_lineup:
        X.at[idx, p] -= 1.0

### 5) Ridge Regression
How did we decide upon a Ridge regression model?

We use ridge regression because player values are highly collinear due to repeated lineup combinations. L2 regularization stabilizes coefficient estimates, reduces overfitting, and produces robust player value estimates for optimizing lineups.

In [None]:
alphas = [0.1, 0.5, 1, 2, 5, 10, 20, 50, 100]  # regularisation strengths, weak to strong
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # shuffled 5-fold cross-validation


def _weighted_cv_rmse(alpha):
    """Return the mean minute-weighted RMSE of a ridge fit across the CV folds.

    For every fold a Ridge model with the given ``alpha`` is fitted on the
    training rows (weighted by stint minutes) and scored on the held-out rows
    with a minute-weighted root mean squared error.
    """
    fold_scores = []
    for train_idx, test_idx in kf.split(X):
        fold_model = Ridge(alpha=alpha, fit_intercept=True, random_state=42)
        fold_model.fit(
            X.iloc[train_idx], y.iloc[train_idx], sample_weight=w.iloc[train_idx]
        )
        fold_pred = fold_model.predict(X.iloc[test_idx])
        fold_mse = mean_squared_error(
            y.iloc[test_idx], fold_pred, sample_weight=w.iloc[test_idx]
        )
        fold_scores.append(fold_mse ** 0.5)
    return float(np.mean(fold_scores))


# Keep the alpha with the strictly lowest average CV error (first one wins ties).
best_alpha, best_rmse = None, float("inf")
for candidate in alphas:
    candidate_rmse = _weighted_cv_rmse(candidate)
    if candidate_rmse < best_rmse:
        best_alpha, best_rmse = candidate, candidate_rmse

# Refit on all Canadian stints with the selected alpha.
final_model = Ridge(alpha=best_alpha, fit_intercept=True, random_state=42)
final_model.fit(X, y, sample_weight=w)

print("Best alpha:", best_alpha)
print("CV RMSE:", best_rmse)
print("Intercept:", float(final_model.intercept_))

### 6) Canadian Players' Value

In [None]:
# Fitted ridge coefficient for every column (player) of the design matrix.
coef_series = pd.Series(final_model.coef_, index=X.columns, name="coef")

# Distinct players who appeared in at least one Canadian lineup, alphabetical.
canadian_players = sorted(
    set(itertools.chain.from_iterable(canada_df["can_players"]))
)

# 0/1 indicator matrix: did this Canadian player take part in this stint?
X_can = pd.DataFrame(0.0, index=canada_df.index, columns=canadian_players)
for idx, lineup in zip(canada_df.index, canada_df["can_players"]):
    for p in lineup:
        X_can.at[idx, p] = 1.0

# Minutes per player: scale each stint row by its length, then sum per column.
minutes_on_court = X_can.mul(w, axis=0)
player_minutes = minutes_on_court.sum(axis=0).rename("minutes")

# Rating looked up from the player table (NaN when the player is not listed).
player_ratings = pd.Series({p: rating_map.get(p, np.nan) for p in canadian_players})

player_value = pd.DataFrame({
    "coef": coef_series.reindex(canadian_players),
    "minutes": player_minutes.reindex(canadian_players),
    "rating": player_ratings,
})

# Rank by model coefficient, highest first; ties are broken by minutes played.
player_value = player_value.sort_values(
    by=["coef", "minutes"], ascending=[False, False]
)

print("\nTop Canadian players by model value (coef):")
print(player_value.head(50))

### 7) Best possible Canadian lineup

In [None]:
MIN_MINUTES = 0
MAX_RATING_SUM = 8  # a lineup is feasible only if its ratings sum to at most this
LINEUP_SIZE = 4     # number of players in a lineup

# Candidates: Canadian players with enough minutes and a known (non-NaN) rating.
eligible = player_value[player_value["minutes"] >= MIN_MINUTES].dropna(subset=["rating"]).copy()

print(f"\nEligible Canadian players (minutes >= {MIN_MINUTES}): {len(eligible)}")
print(eligible[["coef", "minutes", "rating"]].head(50))

eligible_players = list(eligible.index)

# Pull the per-player values into plain dicts once, so the combination loop
# does not perform a DataFrame .loc lookup for every player of every lineup.
rating_of = eligible["rating"].to_dict()
coef_of = eligible["coef"].to_dict()
minutes_of = eligible["minutes"].to_dict()
intercept = float(final_model.intercept_)

best_lineup = None
best_score = -np.inf
best_rating_sum = None

# Exhaustively score every LINEUP_SIZE-player combination of eligible players.
for lineup in itertools.combinations(eligible_players, LINEUP_SIZE):
    rating_sum = sum(rating_of[p] for p in lineup)  # combined rating of the lineup
    if rating_sum > MAX_RATING_SUM:
        continue  # violates the rating cap
    # Predicted goal differential per minute: player coefficients plus intercept.
    score = sum(coef_of[p] for p in lineup) + intercept
    if score > best_score:
        best_score = score
        best_lineup = lineup
        best_rating_sum = rating_sum

# Fail with an explicit message instead of an opaque TypeError further down
# when no combination satisfies the constraints.
if best_lineup is None:
    raise ValueError(
        f"No feasible lineup: {len(eligible_players)} eligible players, none of "
        f"the {LINEUP_SIZE}-player combinations has a rating sum <= {MAX_RATING_SUM}."
    )

print("\nBest feasible Canada lineup (4 players):")
print(best_lineup)
print("Sum of ratings:", best_rating_sum)
print("Predicted goal_diff_per_min:", best_score)


# Per-player breakdown of the selected lineup, highest coefficient first.
best_lineup_df = pd.DataFrame({
    "player": list(best_lineup),
    "rating": [rating_of[p] for p in best_lineup],
    "coef": [coef_of[p] for p in best_lineup],
    "minutes": [minutes_of[p] for p in best_lineup],
}).sort_values("coef", ascending=False)

print("\nLineup details:")
print(best_lineup_df)