In [24]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupKFold
import xgboost as xgb

# ───────────────────────────────────────────────────────────────
# 1. Feature engineering function (your version)
# ───────────────────────────────────────────────────────────────
def add_strong_features(df):
    df['run_diff'] = df['R'] - df['RA']
    df['run_diff_per_game'] = df['run_diff'] / df['G']
    
    exp = 1.86
    df['pyth_exp'] = np.where(
        df['R'] + df['RA'] > 0,
        df['R']**exp / (df['R']**exp + df['RA']**exp),
        0.5
    )
    df['pyth_wins'] = np.round(df['G'] * df['pyth_exp']).astype(int)
    
    df['HR_diff'] = df['HR'] - df['HRA']
    df['ERA_adj'] = df['ERA'] - df['mlb_rpg'] * 9

    df['pitch_dom_low'] = df['ERA'] / (df['mlb_rpg'] + 1)

     # Per-game rates (helps across eras)
    df['R_per_game']  = df['R'] / df['G']
    df['RA_per_game'] = df['RA'] / df['G']
    
    return df

# ───────────────────────────────────────────────────────────────
# 2. Load data
# ───────────────────────────────────────────────────────────────
BASE = "input/"
data_df = pd.read_csv(os.path.join(BASE, "data.csv"))
predict_df = pd.read_csv(os.path.join(BASE, "predict.csv"))

print(f"Data set shape: {data_df.shape}")
print(f"Predict set shape: {predict_df.shape}")
print("Average wins:", data_df['W'].mean())

# Apply feature engineering to BOTH datasets
data_df   = add_strong_features(data_df)
predict_df = add_strong_features(predict_df)

# ───────────────────────────────────────────────────────────────
# 3. Define features (your current list)
# ───────────────────────────────────────────────────────────────
features = [
    'R', 'H', 'SV', 'ERA', 'CG', 'SHO',
    'IPouts','RA',
     'pyth_wins', 'run_diff_per_game',
    

    

    
    
]

features = [f for f in features if f in data_df.columns and f in predict_df.columns]
print(f"Using {len(features)} features:", features)

X = data_df[features]
y = data_df['W']

print(data_df[['decade_1910', 'decade_1920', 'decade_1930']].head(10))

# ───────────────────────────────────────────────────────────────
# 4. GroupKFold by yearID (realistic validation)
# ───────────────────────────────────────────────────────────────
groups = data_df['yearID']
gkf = GroupKFold(n_splits=5)
fold_maes = []

for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    print(f"Fold {fold}:")
    
    X_train_fold = X.iloc[train_idx]
    y_train_fold = y.iloc[train_idx]
    X_test_fold  = X.iloc[test_idx]
    y_test_fold  = y.iloc[test_idx]
    
    # XGBoost does NOT need scaling — use raw features
    model = xgb.XGBRegressor(
        n_estimators           = 12000,
        learning_rate          = 0.002,
        max_depth              = 4,
        min_child_weight       = 10,
        subsample              = 0.8,
        colsample_bytree       = 0.7,
        reg_lambda             = 15.0,
        reg_alpha              = 4.0,
        gamma                  = 0.1,
        random_state           = 42,
        tree_method            = 'hist',
        eval_metric            = 'mae',
        early_stopping_rounds  = 200
    )
    


    
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_test_fold, y_test_fold)],
        verbose=False
    )
    
    test_preds = model.predict(X_test_fold)
    fold_mae = mean_absolute_error(y_test_fold, test_preds)
    fold_maes.append(fold_mae)
    
    print(f"  Test MAE = {fold_mae:.4f}")
    print(f"  Best iteration = {model.best_iteration}")

print("\nGroupKFold CV MAE scores:", fold_maes)
print(f"Mean CV MAE: {np.mean(fold_maes):.4f}")
print(f"Std CV MAE:  {np.std(fold_maes):.4f}")




Data set shape: (1812, 51)
Predict set shape: (453, 45)
Average wins: 79.26158940397352
Using 10 features: ['R', 'H', 'SV', 'ERA', 'CG', 'SHO', 'IPouts', 'RA', 'pyth_wins', 'run_diff_per_game']
   decade_1910  decade_1920  decade_1930
0        False        False         True
1        False        False        False
2        False        False        False
3        False        False         True
4        False        False        False
5        False        False         True
6         True        False        False
7        False        False        False
8        False        False        False
9        False        False        False
Fold 1:
  Test MAE = 2.9109
  Best iteration = 6802
Fold 2:
  Test MAE = 2.9162
  Best iteration = 8040
Fold 3:
  Test MAE = 3.0464
  Best iteration = 2759
Fold 4:
  Test MAE = 2.9374
  Best iteration = 5487
Fold 5:
  Test MAE = 3.0131
  Best iteration = 4305

GroupKFold CV MAE scores: [2.9109459866534224, 2.916227830854874, 3.0464319410270817, 2.937437

In [32]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import QuantileRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb


def add_strong_features(df):
    # Core run-related (usually the strongest predictors)
    
    df['run_diff'] = df['R'] - df['RA']
    
    df['run_diff_per_game'] = df['run_diff'] / df['G']
    
    # Pythagorean (modern exponent)
    exp = 1.86
    df['pyth_exp'] = np.where(
        df['R'] + df['RA'] > 0,
        df['R']**exp / (df['R']**exp + df['RA']**exp),
        0.5
    )
    df['pyth_wins'] = np.round(df['G'] * df['pyth_exp']).astype(int)
    
   
    """
    # Per-game rates (helps across eras)
    df['R_per_game']  = df['R'] / df['G']
    df['RA_per_game'] = df['RA'] / df['G']
    
    # Pitching quality
    df['IP'] = df['IPouts'] / 3.0 + 1e-6          # avoid division by zero
    df['SOA_per_game'] = df['SOA'] / df['IP']
    df['WHIP'] = (df['HA'] + df['BBA']) / df['IP']
    """
    # Other useful differentials & adjustments
    df['HR_diff'] = df['HR'] - df['HRA']
    df['ERA_adj'] = df['ERA'] - df['mlb_rpg'] * 9     # rough league adjustment
    
    # Late-game / bullpen strength
    # df['SV_rate'] = df['SV'] / (df['SV'] + df['BS'] + 1e-6)  # if you have blown saves
    """
    # Market/team quality proxy
    #df['attendance_per_game'] = df['attendance'] / df['G']
    """
    return df

# Load the pre-processed train and test datasets
#DATAPATH = "sctpdsai-m-3-ds-3-coaching-money-ball-analytics"
BASE = f"input/"
data_df = pd.read_csv(os.path.join(BASE, "data.csv"))
predict_df = pd.read_csv(os.path.join(BASE, "predict.csv"))

# Display basic information about the datasets
print(f"Data set shape: {data_df.shape}")
print(f"Predict set shape: {predict_df.shape}")


# 3. Define features (unchanged)
# ───────────────────────────────────────────────────────────────
features = [
'G', 'R', 'AB', 'H', 'HR', 'SO', 'SB', 'SV', 'ERA', 'CG', 'SHO',
'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP',
'mlb_rpg', 'pyth_wins', 'run_diff_per_game',
'era_7', 'era_8'
]

data_df   = add_strong_features(data_df)
predict_df = add_strong_features(predict_df)

# Filter features that exist in both datasets
available_features = [col for col in features if col in data_df.columns and col in predict_df.columns]
print(f"Number of available default features: {len(available_features)}")
print(available_features)

features = [f for f in features if f in data_df.columns and f in predict_df.columns]
print(f"Using {len(features)} features:", features)
X = data_df[features]
y = data_df['W']
# ───────────────────────────────────────────────────────────────
# 4. GroupKFold by yearID (the key change!)
# ───────────────────────────────────────────────────────────────
from sklearn.model_selection import GroupKFold
# Use yearID as the group → no leakage across years
groups = data_df['yearID']
gkf = GroupKFold(n_splits=5)
fold_results = []
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    print(f"Fold {fold}:")

    X_train_fold = X.iloc[train_idx]
    y_train_fold = y.iloc[train_idx]
    X_test_fold = X.iloc[test_idx]
    y_test_fold = y.iloc[test_idx]

# Scale inside each fold (correct for linear models)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_test_scaled = scaler.transform(X_test_fold)

    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train_fold)

# Use RidgeCV (your current best local performer)
    alphas = np.logspace(-4, 2, 30)
    ridge = RidgeCV(alphas=alphas, cv=3, scoring='neg_mean_absolute_error') # inner CV=3 to speed up
    ridge.fit(X_train_scaled, y_train_fold)

    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_test_fold, y_test_fold)],
        verbose=False
    )

# Predict
    xgb_test_preds = model.predict(X_test_fold)
   # train_preds = lr.predict(X_train_scaled)
    test_preds = ridge.predict(X_test_scaled)
    line_test_preds = lr.predict(X_test_scaled)

    ensemble_preds = (test_preds * 0.1 + xgb_test_preds* 0.3 + line_test_preds *0.6)

# Metrics
    #train_mae = mean_absolute_error(y_train_fold, train_preds)
    test_mae = mean_absolute_error(y_test_fold, ensemble_preds)
    test_line_mae = mean_absolute_error(y_test_fold, line_test_preds)
    test_ridge_mae = mean_absolute_error(y_test_fold, test_preds)
    test_xgb_mae = mean_absolute_error(y_test_fold, xgb_test_preds)
    fold_results.append({
        'fold': fold,
        #'train_mae': train_mae,
        'test_mae': test_mae,
        'test_line_mae': test_line_mae,
        'test_ridge_mae': test_ridge_mae,
        'test_xgb_mae': test_xgb_mae})

print(f" Test MAE = {test_mae:.4f}")
print(f" Test Line MAE = {test_line_mae:.4f}")
print(f" Test Ridge MAE = {test_ridge_mae:.4f}")
print(f" Test XGB MAE = {test_xgb_mae:.4f}")
# Summary
results_df = pd.DataFrame(fold_results)
print("\nGroupKFold results by fold:")
print(results_df)
print("\nAverage across 5 folds:")
print(results_df.mean(numeric_only=True))



Data set shape: (1812, 51)
Predict set shape: (453, 45)
Number of available default features: 24
['G', 'R', 'AB', 'H', 'HR', 'SO', 'SB', 'SV', 'ERA', 'CG', 'SHO', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP', 'mlb_rpg', 'pyth_wins', 'run_diff_per_game', 'era_7', 'era_8']
Using 24 features: ['G', 'R', 'AB', 'H', 'HR', 'SO', 'SB', 'SV', 'ERA', 'CG', 'SHO', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP', 'mlb_rpg', 'pyth_wins', 'run_diff_per_game', 'era_7', 'era_8']
Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 5:
 Test MAE = 2.8794
 Test Line MAE = 2.8745
 Test Ridge MAE = 2.8741
 Test XGB MAE = 3.0313

GroupKFold results by fold:
   fold  test_mae  test_line_mae  test_ridge_mae  test_xgb_mae
0     1  2.728905       2.729393        2.732792      2.936844
1     2  2.725462       2.735606        2.735811      2.912866
2     3  2.853815       2.880186        2.879473      3.076895
3     4  2.661901       2.664642        2.666928      2.961760
4     5  2.879402       2.874454        2.8