In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_score, KFold
from patsy import dmatrix
from sklearn.preprocessing import SplineTransformer

In [13]:
glogs_final = pd.read_csv('https://raw.githubusercontent.com/tmarchok1/DS440_project/refs/heads/Travis/glogs_final.csv')
glogs_final

Unnamed: 0,date,year,week,day_of_week,v_name,h_name,day_night,park_id,temp,precip,capacity,prev_year_wins,made_playoffs,won_division,InstagramFollowers,CityPopulation,PayrollStd,attendance
0,2005-04-03,2005,1,Sun,BOS,NYY,1,Old Yankee Stadium,51.7,0.587,56937.0,101,1,1,3900000,19940274,2.830838,54818.0
1,2005-04-04,2005,1,Mon,OAK,BAL,0,Camden Yards,60.7,0.000,45971.0,78,0,0,746000,2859024,1.025050,48271.0
2,2005-04-04,2005,1,Mon,CLE,CHW,0,US Cellular Field,56.2,0.000,40615.0,83,0,0,664000,9408576,1.034087,38141.0
3,2005-04-04,2005,1,Mon,KC,DET,0,Comerica Park,59.0,0.000,41083.0,72,0,0,1000000,4400587,0.948452,44105.0
4,2005-04-04,2005,1,Mon,MIN,SEA,0,Safeco Field,50.1,0.130,47943.0,63,0,0,927000,4145494,1.180556,46249.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29030,2016-10-02,2016,24,Sun,MIL,COL,0,Coors Field,78.8,0.000,50144.0,68,0,0,552000,3052498,0.811287,27762.0
29031,2016-10-02,2016,24,Sun,NYM,PHI,0,Citizens Bank Park,70.5,0.039,43651.0,63,0,0,1300000,6330422,1.098502,36935.0
29032,2016-10-02,2016,24,Sun,LAD,SF,0,AT&T Park,66.0,0.000,41915.0,84,0,0,1600000,4648486,1.374663,41445.0
29033,2016-10-02,2016,24,Sun,PIT,STL,0,Busch Stadium III,69.3,0.028,45494.0,100,1,1,1100000,2811927,0.993265,44615.0


In [4]:
# Create dummy model using only h_name variable
homedf = glogs_final.groupby('h_name')['attendance'].mean().sort_values().to_frame()
dummydf = pd.merge(glogs_final, homedf, on='h_name', how='left')
dummydf

# Evaluation metrics for dummy model
mse = mean_squared_error(dummydf['attendance_x'], dummydf['attendance_y'])
rmse = mse**(1/2)
r2 = r2_score(dummydf['attendance_x'], dummydf['attendance_y'])

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R-squared Score (R²): {r2:.2f}')

Mean Squared Error (MSE): 58894063.76
RMSE: 7674.25
R-squared Score (R²): 0.46


In [5]:
# Spline transform week into new encoded features
weeks = glogs_final[['week']]

# Fit sklearn spline transformer
spline = SplineTransformer(n_knots=5, degree=3, include_bias=False)
week_splined = spline.fit_transform(weeks)


# Create dataframe with transformed features
week_splined = pd.DataFrame(week_splined, columns=[f'week_spl{i}' for i in range(week_splined.shape[1])])
week_splined

Unnamed: 0,week_spl0,week_spl1,week_spl2,week_spl3,week_spl4,week_spl5
0,0.166667,0.666667,0.166667,0.0,0.000000,0.000000
1,0.166667,0.666667,0.166667,0.0,0.000000,0.000000
2,0.166667,0.666667,0.166667,0.0,0.000000,0.000000
3,0.166667,0.666667,0.166667,0.0,0.000000,0.000000
4,0.166667,0.666667,0.166667,0.0,0.000000,0.000000
...,...,...,...,...,...,...
29030,0.000000,0.000000,0.000000,0.0,0.166667,0.666667
29031,0.000000,0.000000,0.000000,0.0,0.166667,0.666667
29032,0.000000,0.000000,0.000000,0.0,0.166667,0.666667
29033,0.000000,0.000000,0.000000,0.0,0.166667,0.666667


In [14]:
# Create interaction variable between week and team
# Teams with average attendances greater than 36000 showed less variablility week to week AND less of a final week bump
avg_att = glogs_final.groupby('h_name')['attendance'].mean()
high_low = (avg_att > 36000).astype(int).map({1: 'high', 0: 'low'}).to_dict()


glogs_final['att_high_low'] = glogs_final['h_name'].map(high_low)
glogs_final['week_group'] = 'week_' + glogs_final['week'].astype(str) + '_' + glogs_final['att_high_low']
glogs_final = glogs_final.drop(columns=['att_high_low'])
glogs_final.head()

Unnamed: 0,date,year,week,day_of_week,v_name,h_name,day_night,park_id,temp,precip,capacity,prev_year_wins,made_playoffs,won_division,InstagramFollowers,CityPopulation,PayrollStd,attendance,week_group
0,2005-04-03,2005,1,Sun,BOS,NYY,1,Old Yankee Stadium,51.7,0.587,56937.0,101,1,1,3900000,19940274,2.830838,54818.0,week_1_high
1,2005-04-04,2005,1,Mon,OAK,BAL,0,Camden Yards,60.7,0.0,45971.0,78,0,0,746000,2859024,1.02505,48271.0,week_1_low
2,2005-04-04,2005,1,Mon,CLE,CHW,0,US Cellular Field,56.2,0.0,40615.0,83,0,0,664000,9408576,1.034087,38141.0,week_1_low
3,2005-04-04,2005,1,Mon,KC,DET,0,Comerica Park,59.0,0.0,41083.0,72,0,0,1000000,4400587,0.948452,44105.0,week_1_low
4,2005-04-04,2005,1,Mon,MIN,SEA,0,Safeco Field,50.1,0.13,47943.0,63,0,0,927000,4145494,1.180556,46249.0,week_1_low


In [15]:
# Data preprocessing

# Merge with splined week features
# glogs_final = pd.concat([glogs_final, week_splined], axis=1)
# glogs_final

# Drop columns
glogs_final = glogs_final.drop(columns=['date', 'temp'])

# Binary encode 'day_of_week' and 'precip'
# glogs_final['day_of_week'] = glogs_final['day_of_week'].map({'Mon':0, 'Tue':0, 'Wed':0, 'Thu':0, 'Fri':1, 'Sat':1, 'Sun':1}).astype('int')
glogs_final['precip'] = glogs_final['precip'].map(lambda x: 1 if x != 0 else 0)


# Apply OneHotEncoder to 'v_name', 'h_name', 'park_id' columns
encoder = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(sparse_output=False), ['week_group', 'v_name', 'h_name', 'park_id', 'day_of_week'])],
    
    # Keep other columns
    remainder='passthrough'
)

df_encoded = encoder.fit_transform(glogs_final)

# Rename columns
feature_names = encoder.get_feature_names_out()
feature_names = [name.replace("remainder__", "") for name in feature_names]

# Convert the result back to a DataFrame (optional)
df_encoded = pd.DataFrame(df_encoded, columns=feature_names)
df_encoded

Unnamed: 0,cat__week_group_week_10_high,cat__week_group_week_10_low,cat__week_group_week_11_high,cat__week_group_week_11_low,cat__week_group_week_12_high,cat__week_group_week_12_low,cat__week_group_week_13_high,cat__week_group_week_13_low,cat__week_group_week_14_high,cat__week_group_week_14_low,...,day_night,precip,capacity,prev_year_wins,made_playoffs,won_division,InstagramFollowers,CityPopulation,PayrollStd,attendance
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,56937.0,101.0,1.0,1.0,3900000.0,19940274.0,2.830838,54818.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,45971.0,78.0,0.0,0.0,746000.0,2859024.0,1.025050,48271.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,40615.0,83.0,0.0,0.0,664000.0,9408576.0,1.034087,38141.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,41083.0,72.0,0.0,0.0,1000000.0,4400587.0,0.948452,44105.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,47943.0,63.0,0.0,0.0,927000.0,4145494.0,1.180556,46249.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29030,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,50144.0,68.0,0.0,0.0,552000.0,3052498.0,0.811287,27762.0
29031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,43651.0,63.0,0.0,0.0,1300000.0,6330422.0,1.098502,36935.0
29032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,41915.0,84.0,0.0,0.0,1600000.0,4648486.0,1.374663,41445.0
29033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,45494.0,100.0,1.0,1.0,1100000.0,2811927.0,0.993265,44615.0


In [None]:
# Build random forest model - 9673 cases from 2005-2008
modeldata = df_encoded[:9673]

# Define features (X) and target variable (y)
X = modeldata.drop(columns=['attendance'])
y = modeldata['attendance']

# Split data into training / validation / test (80% train, 10% val, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
#X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


# Initialize the Random Forest regressor
model = RandomForestRegressor(n_estimators=300, random_state=42, max_depth=None, min_samples_leaf=2, min_samples_split=5)
# Train the model
model.fit(X_train, y_train)

# Predict on the val set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error (MSE) and R-squared score (R²)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R-squared Score (R²): {r2:.2f}')

Mean Squared Error (MSE): 23618561.00
RMSE: 4859.89
R-squared Score (R²): 0.81


In [None]:
# Random Forest hyperparameter tuning
# Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
# Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}


In [16]:
# Build random forest model - 21776 cases from 2005-2013
modeldata = df_encoded[:21776]

# Define features (X) and target variable (y)
X = modeldata.drop(columns=['attendance'])
y = modeldata['attendance']

# Split data into training / test (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=29)

# Initialize the Random Forest regressor
model = RandomForestRegressor(n_estimators=300, random_state=42, max_depth=30, min_samples_leaf=2, min_samples_split=5)


# Fit the model
model.fit(X_train, y_train)

# Predict on the training set
y_train_pred = model.predict(X_train)

# Calculate training Mean Squared Error (MSE) and R-squared score (R²)
mse = mean_squared_error(y_train, y_train_pred)
rmse = mse**(1/2)
r2 = r2_score(y_train, y_train_pred)

print(f'Training Mean Squared Error (MSE): {mse:.2f}')
print(f'Training RMSE: {rmse:.2f}')
print(f'Training R-squared Score (R²): {r2:.2f}')



# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error (MSE) and R-squared score (R²)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R-squared Score (R²): {r2:.2f}')

Training Mean Squared Error (MSE): 7201802.81
Training RMSE: 2683.62
Training R-squared Score (R²): 0.94
Mean Squared Error (MSE): 21492612.98
RMSE: 4636.01
R-squared Score (R²): 0.81


In [6]:
# Build Lasso regression model
modeldata = df_encoded[:21776]

X = modeldata.drop(columns=["attendance"])
y = modeldata["attendance"]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Lasso Regression
lasso = Lasso(alpha=0.1)  # Adjust alpha to control regularization strength
lasso.fit(X_scaled, y)

# Extract feature importance
feature_significance = pd.DataFrame({
    "Variable": X.columns,
    "Lasso_coef": lasso.coef_
})

# Rank features by absolute coefficient value
feature_significance["Abs_Coefficient"] = np.abs(feature_significance["Lasso_coef"])
feature_significance = feature_significance.sort_values(by="Abs_Coefficient", ascending=False)

# Display ranked features
feature_significance[["Variable", "Lasso_coef"]].tail(50)

  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Variable,Lasso_coef
34,cat__v_name_HOU,-147.24034
0,cat__week_1,146.357099
12,cat__week_13,137.102897
86,cat__park_id_Busch Stadium II,-134.452418
17,cat__week_18,129.979135
39,cat__v_name_MIL,-126.819271
6,cat__week_7,-121.79983
85,cat__park_id_Angel Stadium,121.20183
24,cat__v_name_ARI,-116.325067
32,cat__v_name_COL,-115.483586


In [None]:
# Cross validation for Lasso model
lasso_cv = LassoCV(cv=5).fit(X_scaled, y)
print(f"Optimal alpha: {lasso_cv.alpha_}")

Optimal alpha: 31.640845472570195


In [None]:
# Build Lasso for making predictions
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train Lasso Regression model
lasso = Lasso(alpha=31.640845472570195)
lasso.fit(X_train, y_train)

# Predict on test data
y_pred = lasso.predict(X_test)



lasso_predictions = pd.DataFrame({'Actual':y_test, 'Predictions':y_pred})
lasso_predictions.head()

Unnamed: 0,Actual,Predictions
9352,21107.0,21002.361363
20640,36590.0,37749.284679
10429,30262.0,27141.231195
21621,30636.0,29003.284402
1503,28971.0,29834.321874


In [11]:
# Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f'RMSE: {rmse:.2f}')
print(f"R² Score: {r2:.4f}")

Mean Squared Error (MSE): 36430498.7093
RMSE: 6035.77
R² Score: 0.6745


In [66]:
# Convert predictions and actuals to DataFrames with matching indices
train_results = pd.DataFrame({
    'actual': y_train,
    'predicted': y_train_pred
}, index=y_train.index)

test_results = pd.DataFrame({
    'actual': y_test,
    'predicted': y_pred
}, index=y_test.index)


# Combine results
results_df = pd.concat([train_results, test_results]).sort_index().reset_index()
results_df

Unnamed: 0,index,actual,predicted
0,0,54818.0,53305.445771
1,1,48271.0,38851.046433
2,2,38141.0,28430.869302
3,3,44105.0,17843.517213
4,4,46249.0,35862.823685
...,...,...,...
21771,21771,28315.0,25541.019100
21772,21772,41891.0,32999.632505
21773,21773,41495.0,41662.148729
21774,21774,44808.0,44432.662060


In [82]:
# Error analysis
modeldata = df_encoded[:21776].reset_index()

errordata = pd.merge(pd.merge(modeldata, glogs_final[['year', 'day_of_week', 'week', 'h_name', 'v_name', 'park_id']].reset_index(), how='left', on='index'), results_df, how='left', on='index').drop(columns=['index', 'attendance'])

errordata['residual'] = errordata['actual'] - errordata['predicted']
errordata['absresidual'] = np.sqrt(np.square(errordata['residual']))
errordata

Unnamed: 0,cat__week_1,cat__week_2,cat__week_3,cat__week_4,cat__week_5,cat__week_6,cat__week_7,cat__week_8,cat__week_9,cat__week_10,...,year_y,day_of_week_y,week,h_name,v_name,park_id,actual,predicted,residual,absresidual
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005,Sun,1,NYY,BOS,Old Yankee Stadium,54818.0,53305.445771,1512.554229,1512.554229
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005,Mon,1,BAL,OAK,Camden Yards,48271.0,38851.046433,9419.953567,9419.953567
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005,Mon,1,CHW,CLE,US Cellular Field,38141.0,28430.869302,9710.130698,9710.130698
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005,Mon,1,DET,KC,Comerica Park,44105.0,17843.517213,26261.482787,26261.482787
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005,Mon,1,SEA,MIN,Safeco Field,46249.0,35862.823685,10386.176315,10386.176315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013,Sun,24,MIA,DET,Marlins Park,28315.0,25541.019100,2773.980900,2773.980900
21772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013,Sun,24,NYM,MIL,Citi Field,41891.0,32999.632505,8891.367495,8891.367495
21773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013,Sun,24,SF,SD,AT&T Park,41495.0,41662.148729,-167.148729,167.148729
21774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013,Sun,24,STL,CHC,Busch Stadium III,44808.0,44432.662060,375.337940,375.337940
