In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

def add_strong_features(df, pyth_exponent=1.83):
    """Add engineered run, pitching and differential columns to ``df``.

    The frame is modified in place (columns are assigned directly on ``df``)
    and the same object is returned, so both ``df = add_strong_features(df)``
    and a bare ``add_strong_features(df)`` leave ``df`` with the new columns.
    Calling it again recomputes the same columns from the raw ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'R', 'RA', 'G', 'IPouts', 'SOA', 'HA',
        'BBA', 'HR', 'HRA', 'ERA' and 'mlb_rpg'. The column 'attendance'
        is optional.
    pyth_exponent : float, default 1.83
        Exponent used in the Pythagorean expectation
        ``R**e / (R**e + RA**e)``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns added or overwritten:
        run_diff, run_diff_per_game, pyth_exp, pyth_wins, R_per_game,
        RA_per_game, IP, SOA_per_game, WHIP, HR_diff, ERA_adj and, only
        when 'attendance' is present, attendance_per_game.

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    There is no guard against ``G == 0`` or missing ``G``; such rows produce
    inf/NaN per-game values, and a NaN in ``G`` makes the integer cast of
    ``pyth_wins`` fail.
    """
    # Core run-related features
    df['run_diff'] = df['R'] - df['RA']
    df['run_diff_per_game'] = df['run_diff'] / df['G']

    # Pythagorean expectation; rows with no runs scored or allowed fall back
    # to 0.5 instead of the 0/0 result of the formula.
    runs_scored_pow = df['R'] ** pyth_exponent
    runs_allowed_pow = df['RA'] ** pyth_exponent
    df['pyth_exp'] = np.where(
        df['R'] + df['RA'] > 0,
        runs_scored_pow / (runs_scored_pow + runs_allowed_pow),
        0.5,
    )
    df['pyth_wins'] = np.round(df['G'] * df['pyth_exp']).astype(int)

    # Per-game scoring rates
    df['R_per_game'] = df['R'] / df['G']
    df['RA_per_game'] = df['RA'] / df['G']

    # Pitching quality. IP is outs / 3 plus a tiny epsilon so the divisions
    # below never hit exactly zero; the epsilon is stored in the IP column.
    df['IP'] = df['IPouts'] / 3.0 + 1e-6
    # Despite the name, this is strikeouts per inning pitched (divided by IP,
    # not by G). The name is kept because other cells refer to it.
    df['SOA_per_game'] = df['SOA'] / df['IP']
    df['WHIP'] = (df['HA'] + df['BBA']) / df['IP']

    # Other differentials & adjustments
    df['HR_diff'] = df['HR'] - df['HRA']
    # NOTE(review): the factor 9 is kept from the original; if mlb_rpg is
    # already a per-game rate, ERA - mlb_rpg may be what was intended — confirm.
    df['ERA_adj'] = df['ERA'] - df['mlb_rpg'] * 9

    # Late-game / bullpen strength
    # df['SV_rate'] = df['SV'] / (df['SV'] + df['BS'] + 1e-6)  # if you have blown saves

    # 'attendance' is not guaranteed to be in every frame; skip the derived
    # column rather than raising KeyError when it is absent.
    if 'attendance' in df.columns:
        df['attendance_per_game'] = df['attendance'] / df['G']

    return df

# Read data.csv and predict.csv from the local input folder
#DATAPATH = "sctpdsai-m-3-ds-3-coaching-money-ball-analytics"
BASE = f"input/"
data_csv = os.path.join(BASE, "data.csv")
predict_csv = os.path.join(BASE, "predict.csv")
data_df = pd.read_csv(data_csv)
predict_df = pd.read_csv(predict_csv)

# Print (rows, columns) of each frame as a quick sanity check
print(f"Data set shape: {data_df.shape}")
print(f"Predict set shape: {predict_df.shape}")



Data set shape: (1812, 51)
Predict set shape: (453, 45)


In [None]:
# Candidate model inputs, built by concatenating one chunk per group.
# The concatenation order is the order in which columns are selected later.
default_features = (
    # Basic Statistics
    ['G', 'R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF']
    + ['RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA']
    + ['E', 'DP', 'FP', 'attendance', 'BPF', 'PPF']
    # Derived Features
    + ['R_per_game', 'RA_per_game', 'mlb_rpg']
    # Era Indicators
    + ['era_1', 'era_2', 'era_3', 'era_4', 'era_5', 'era_6', 'era_7', 'era_8']
    # Decade Indicators
    + ['decade_1910', 'decade_1920', 'decade_1930', 'decade_1940']
    + ['decade_1950', 'decade_1960', 'decade_1970', 'decade_1980']
    + ['decade_1990', 'decade_2000', 'decade_2010']
)

# Shortlist of engineered and raw columns, several of which are produced by
# add_strong_features.
strong_features = [
    # run-based
    'run_diff', 'run_diff_per_game', 'pyth_wins', 'pyth_exp',
    'R_per_game', 'RA_per_game',
    # pitching
    'ERA', 'WHIP', 'SOA_per_game',
    'HR_diff',  # added — often stronger than separate HR/HRA
    'HRA', 'SV', 'SHO',
    # league / era context
    'mlb_rpg', 'ERA_adj', 'era_8',
    'attendance_per_game',
    # Optional high-value extras (add if your CV likes them):
    # 'FP', 'DP', 'E', 'BB', 'SO'
]

# Engineer the derived columns before filtering. 'R_per_game' and
# 'RA_per_game' are listed in default_features but only exist once
# add_strong_features has run, so filtering first silently dropped them.
data_df = add_strong_features(data_df)
predict_df = add_strong_features(predict_df)

print("Strong features ready:", strong_features)

# Keep only the default features present in both frames, so the model is
# trained and scored on the same columns.
available_features = [
    col for col in default_features
    if col in data_df.columns and col in predict_df.columns
]
print(f"Number of available default features: {len(available_features)}")
print(available_features)

# Target is the 'W' column; inputs are the filtered default features.
# Note: strong_features is not used to build X here.
y = data_df['W']
X = data_df[available_features]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible (adjust test_size / random_state as needed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Number of available default features: 44
['G', 'R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO', 'SB', 'RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP', 'mlb_rpg', 'era_1', 'era_2', 'era_3', 'era_4', 'era_5', 'era_6', 'era_7', 'era_8', 'decade_1910', 'decade_1920', 'decade_1930', 'decade_1940', 'decade_1950', 'decade_1960', 'decade_1970', 'decade_1980', 'decade_1990', 'decade_2000', 'decade_2010']


In [None]:

# Scale features
# Columns whose names start with these prefixes are treated as one-hot
# indicators and left unscaled; every other column is standardized.
indicator_prefixes = ('era_', 'decade_')
one_hot_cols = list(filter(lambda name: name.startswith(indicator_prefixes), X_train.columns))
other_cols = [name for name in X_train.columns if not name.startswith(indicator_prefixes)]

# Work on copies so the unscaled splits stay available
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit on the training split only, then reuse the fitted scaler on the test split
scaler = StandardScaler()
X_train_scaled[other_cols] = scaler.fit_transform(X_train[other_cols])
X_test_scaled[other_cols] = scaler.transform(X_test[other_cols])

In [None]:
# Build Linear Regression Model
# fit() returns the estimator itself, so construction and fitting are chained
lr = LinearRegression().fit(X_train_scaled, y_train)

# Predictions on the training split and on the held-out split
lr_train_preds = lr.predict(X_train_scaled)
lr_test_preds = lr.predict(X_test_scaled)

In [None]:
# Evaluate Linear Regression
lr_train_mae = mean_absolute_error(y_train, lr_train_preds)
lr_test_mae = mean_absolute_error(y_test, lr_test_preds)
lr_test_rmse = np.sqrt(mean_squared_error(y_test, lr_test_preds))
lr_test_r2 = r2_score(y_test, lr_test_preds)

# One print call, one metric per output line
print(
    f"Linear Regression Performance:",
    f"  Training MAE: {lr_train_mae:.4f}",
    f"  Test MAE: {lr_test_mae:.4f}",
    f"  Test RMSE: {lr_test_rmse:.4f}",
    f"  Test R²: {lr_test_r2:.4f}",
    sep="\n",
)

# Feature importance from Linear Regression: coefficients ranked by absolute
# value. The model was fit on X_train_scaled, so coefficients of the scaled
# columns are per standard deviation of the feature.
coef_table = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': lr.coef_})
feature_importance = coef_table.sort_values('Coefficient', key=abs, ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

In [None]:
# Visualize actual vs predicted values
fig_fit, ax_fit = plt.subplots(figsize=(10, 6))
ax_fit.scatter(y_test, lr_test_preds, alpha=0.5)
# Dashed red diagonal spanning the observed range: points on it are exact predictions
win_range = [y_test.min(), y_test.max()]
ax_fit.plot(win_range, win_range, 'r--')
ax_fit.set_xlabel('Actual Wins')
ax_fit.set_ylabel('Predicted Wins')
ax_fit.set_title('Linear Regression: Actual vs Predicted Wins')
ax_fit.grid(True, alpha=0.3)

# Add residual plot (actual minus predicted, against the prediction)
residuals = y_test - lr_test_preds
fig_res, ax_res = plt.subplots(figsize=(10, 6))
ax_res.scatter(lr_test_preds, residuals, alpha=0.5)
ax_res.axhline(y=0, color='r', linestyle='--')
ax_res.set_xlabel('Predicted Wins')
ax_res.set_ylabel('Residuals')
ax_res.set_title('Linear Regression: Residual Plot')
ax_res.grid(True, alpha=0.3)
plt.show()

In [None]:
# Prepare Submission

# Select the model's input columns and apply the scaler that was fitted on
# the training split (no refitting on the prediction rows)
predict_scaled = predict_df[available_features].copy()
predict_scaled[other_cols] = scaler.transform(predict_scaled[other_cols])

# Predict wins for the new dataset and round to whole numbers
predict_preds = lr.predict(predict_scaled)
rounded_wins = np.round(predict_preds).astype(int)

# Build submission in the same format as submission.csv: one row per ID
submission_df = pd.DataFrame({'ID': predict_df['ID'], 'W': rounded_wins})

submission_path = 'submission_predict.csv'
submission_df.to_csv(submission_path, index=False)
print(f"Kaggle submission saved to {submission_path}")