In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_score, KFold
from patsy import dmatrix
from sklearn.preprocessing import SplineTransformer
import xgboost as xgb

In [83]:
glogs_final = pd.read_csv('https://raw.githubusercontent.com/tmarchok1/DS440_project/refs/heads/Travis/glogs_final.csv')
glogs_final.head()

Unnamed: 0,date,year,week,day_of_week,v_name,h_name,day_night,park_id,temp,precip,capacity,prev_year_wins,made_playoffs,won_division,InstagramFollowers,CityPopulation,PayrollStd,attendance
0,2005-04-03,2005,1,Sun,BOS,NYY,1,Old Yankee Stadium,51.7,0.587,56937.0,101,1,1,3900000,19940274,2.830838,54818.0
1,2005-04-04,2005,1,Mon,OAK,BAL,0,Camden Yards,60.7,0.0,45971.0,78,0,0,746000,2859024,1.02505,48271.0
2,2005-04-04,2005,1,Mon,CLE,CHW,0,US Cellular Field,56.2,0.0,40615.0,83,0,0,664000,9408576,1.034087,38141.0
3,2005-04-04,2005,1,Mon,KC,DET,0,Comerica Park,59.0,0.0,41083.0,72,0,0,1000000,4400587,0.948452,44105.0
4,2005-04-04,2005,1,Mon,MIN,SEA,0,Safeco Field,50.1,0.13,47943.0,63,0,0,927000,4145494,1.180556,46249.0


In [3]:
# Create dummy model using only h_name variable
homedf = glogs_final.iloc[:23229].groupby('h_name')['attendance'].mean().sort_values().to_frame()
dummydf = pd.merge(glogs_final, homedf, on='h_name', how='left')
dummydf

# Evaluation metrics for dummy model
mse = mean_squared_error(dummydf['attendance_x'].iloc[23229:], dummydf['attendance_y'].iloc[23229:])
rmse = mse**(1/2)
r2 = r2_score(dummydf['attendance_x'].iloc[23229:], dummydf['attendance_y'].iloc[23229:])

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R-squared Score (R²): {r2:.2f}')

Mean Squared Error (MSE): 68362331.54
RMSE: 8268.15
R-squared Score (R²): 0.26


In [5]:
# Spline transform week into new encoded features
weeks = glogs_final[['week']]

# Fit sklearn spline transformer
spline = SplineTransformer(n_knots=5, degree=3, include_bias=False)
week_splined = spline.fit_transform(weeks)


# Create dataframe with transformed features
week_splined = pd.DataFrame(week_splined, columns=[f'week_spl{i}' for i in range(week_splined.shape[1])])
week_splined

Unnamed: 0,week_spl0,week_spl1,week_spl2,week_spl3,week_spl4,week_spl5
0,0.166667,0.666667,0.166667,0.0,0.000000,0.000000
1,0.166667,0.666667,0.166667,0.0,0.000000,0.000000
2,0.166667,0.666667,0.166667,0.0,0.000000,0.000000
3,0.166667,0.666667,0.166667,0.0,0.000000,0.000000
4,0.166667,0.666667,0.166667,0.0,0.000000,0.000000
...,...,...,...,...,...,...
29030,0.000000,0.000000,0.000000,0.0,0.166667,0.666667
29031,0.000000,0.000000,0.000000,0.0,0.166667,0.666667
29032,0.000000,0.000000,0.000000,0.0,0.166667,0.666667
29033,0.000000,0.000000,0.000000,0.0,0.166667,0.666667


In [14]:
# Create interaction variable between week and team
# Teams with average attendances greater than 36000 showed less variablility week to week AND less of a final week bump
avg_att = glogs_final.groupby('h_name')['attendance'].mean()
high_low = (avg_att > 36000).astype(int).map({1: 'high', 0: 'low'}).to_dict()


glogs_final['att_high_low'] = glogs_final['h_name'].map(high_low)
glogs_final['week_group'] = 'week_' + glogs_final['week'].astype(str) + '_' + glogs_final['att_high_low']
glogs_final = glogs_final.drop(columns=['att_high_low'])
glogs_final.head()

Unnamed: 0,date,year,week,day_of_week,v_name,h_name,day_night,park_id,temp,precip,capacity,prev_year_wins,made_playoffs,won_division,InstagramFollowers,CityPopulation,PayrollStd,attendance,week_group
0,2005-04-03,2005,1,Sun,BOS,NYY,1,Old Yankee Stadium,51.7,0.587,56937.0,101,1,1,3900000,19940274,2.830838,54818.0,week_1_high
1,2005-04-04,2005,1,Mon,OAK,BAL,0,Camden Yards,60.7,0.0,45971.0,78,0,0,746000,2859024,1.02505,48271.0,week_1_low
2,2005-04-04,2005,1,Mon,CLE,CHW,0,US Cellular Field,56.2,0.0,40615.0,83,0,0,664000,9408576,1.034087,38141.0,week_1_low
3,2005-04-04,2005,1,Mon,KC,DET,0,Comerica Park,59.0,0.0,41083.0,72,0,0,1000000,4400587,0.948452,44105.0,week_1_low
4,2005-04-04,2005,1,Mon,MIN,SEA,0,Safeco Field,50.1,0.13,47943.0,63,0,0,927000,4145494,1.180556,46249.0,week_1_low


In [84]:
# Data preprocessing

# Merge with splined week features
# glogs_final = pd.concat([glogs_final, week_splined], axis=1)
# glogs_final

# Drop columns
glogs_final = glogs_final.drop(columns=['date', 'temp'])

# Binary encode 'day_of_week' and 'precip'
# glogs_final['day_of_week'] = glogs_final['day_of_week'].map({'Mon':0, 'Tue':0, 'Wed':0, 'Thu':0, 'Fri':1, 'Sat':1, 'Sun':1}).astype('int')
glogs_final['precip'] = glogs_final['precip'].map(lambda x: 1 if x != 0 else 0)


# Apply OneHotEncoder to 'v_name', 'h_name', 'park_id' columns
encoder = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(sparse_output=False), ['week', 'h_name', 'v_name', 'park_id', 'day_of_week'])],
    
    # Keep other columns
    remainder='passthrough'
)

df_encoded = encoder.fit_transform(glogs_final)

# Rename columns
feature_names = encoder.get_feature_names_out()
feature_names = [name.replace("remainder__", "") for name in feature_names]

# Convert the result back to a DataFrame
df_encoded = pd.DataFrame(df_encoded, columns=feature_names)
df_encoded

Unnamed: 0,cat__week_1,cat__week_2,cat__week_3,cat__week_4,cat__week_5,cat__week_6,cat__week_7,cat__week_8,cat__week_9,cat__week_10,...,day_night,precip,capacity,prev_year_wins,made_playoffs,won_division,InstagramFollowers,CityPopulation,PayrollStd,attendance
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,56937.0,101.0,1.0,1.0,3900000.0,19940274.0,2.830838,54818.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,45971.0,78.0,0.0,0.0,746000.0,2859024.0,1.025050,48271.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,40615.0,83.0,0.0,0.0,664000.0,9408576.0,1.034087,38141.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,41083.0,72.0,0.0,0.0,1000000.0,4400587.0,0.948452,44105.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,47943.0,63.0,0.0,0.0,927000.0,4145494.0,1.180556,46249.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29030,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,50144.0,68.0,0.0,0.0,552000.0,3052498.0,0.811287,27762.0
29031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,43651.0,63.0,0.0,0.0,1300000.0,6330422.0,1.098502,36935.0
29032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,41915.0,84.0,0.0,0.0,1600000.0,4648486.0,1.374663,41445.0
29033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,45494.0,100.0,1.0,1.0,1100000.0,2811927.0,0.993265,44615.0


In [52]:
# Build general linear regression model
modeldata = df_encoded[:21776]
X = modeldata.drop(columns=['attendance'])
y = modeldata['attendance']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
n_train = int(len(X) * 0.8)

X_train = X.iloc[:n_train]
y_train = y.iloc[:n_train]

X_test = X.iloc[n_train:]
y_test = y.iloc[n_train:]

# Initialize the model
model = LinearRegression()
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)

# Training metrics
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f'Training Mean Squared Error (MSE): {mse_train:.2f}')
print(f'Training RMSE: {rmse_train:.2f}')
print(f'Training R²: {r2_train:.2f}')

# Testing metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R-squared Score (R²): {r2:.2f}')

Training Mean Squared Error (MSE): 39651151.30
Training RMSE: 6296.92
Training R²: 0.67
Mean Squared Error (MSE): 38355358.12
RMSE: 6193.17
R-squared Score (R²): 0.60


In [None]:
# Build random forest model - 9673 cases from 2005-2008
modeldata = df_encoded[:9673]

# Define features (X) and target variable (y)
X = modeldata.drop(columns=['attendance'])
y = modeldata['attendance']

# Split data into training / validation / test (80% train, 10% val, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)



# Initialize the Random Forest regressor
model = RandomForestRegressor(n_estimators=300, random_state=42, max_depth=None, min_samples_leaf=2, min_samples_split=5)
# Train the model
model.fit(X_train, y_train)

# Predict on the val set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error (MSE) and R-squared score (R²)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R-squared Score (R²): {r2:.2f}')

Mean Squared Error (MSE): 24124121.03
RMSE: 4911.63
R-squared Score (R²): 0.81


In [69]:
# Build Lasso regression model
modeldata = df_encoded[:21776]

X = modeldata.drop(columns=["attendance"])
y = modeldata["attendance"]


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Lasso Model
model = Lasso(alpha=0.1)
model.fit(X_scaled, y)

# Split train/test
#X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
n_train = int(len(X) * 0.8)

X_train = X_scaled[:n_train]
y_train = y[:n_train]

X_test = X_scaled[n_train:]
y_test = y[n_train:]

# Make predictions
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)

# Training metrics
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f'Training Mean Squared Error (MSE): {mse_train:.2f}')
print(f'Training RMSE: {rmse_train:.2f}')
print(f'Training R-Squared: {r2_train:.2f}')

# Testing metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
r2 = r2_score(y_test, y_pred)

print(f'Test Mean Squared Error (MSE): {mse:.2f}')
print(f'Test RMSE: {rmse:.2f}')
print(f'Test R-squared: {r2:.2f}')

Training Mean Squared Error (MSE): 40071023.57
Training RMSE: 6330.17
Training R-Squared: 0.66
Test Mean Squared Error (MSE): 34750186.23
Test RMSE: 5894.93
Test R-squared: 0.64


  model = cd_fast.enet_coordinate_descent(


In [66]:
# Extract feature importance
feature_significance = pd.DataFrame({
    "Variable": X.columns,
    "Lasso_coef": model.coef_
})

# Rank features by absolute coefficient value
feature_significance["Abs_Coefficient"] = np.abs(feature_significance["Lasso_coef"])
feature_significance = feature_significance.sort_values(by="Abs_Coefficient", ascending=False)

# Display ranked features
feature_significance[["Variable", "Lasso_coef"]]

Unnamed: 0,Variable,Lasso_coef
70,PayrollStd,4511.191011
69,CityPopulation,2427.142268
56,cat__day_of_week_Sat,2132.900191
49,cat__h_name_STL,1697.922512
42,cat__h_name_NYY,-1430.995661
...,...,...
18,cat__week_19,-69.475741
24,cat__h_name_ARI,-64.429597
36,cat__h_name_LAA,60.728365
45,cat__h_name_PIT,26.579830


In [67]:
# Cross validation for Lasso model
lasso_cv = LassoCV(cv=5).fit(X_scaled, y)
print(f"Optimal alpha: {lasso_cv.alpha_}")

Optimal alpha: 35.35478233506005


In [34]:
# Build Lasso for making predictions
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train Lasso Regression model
lasso = Lasso(alpha=32.97198833400474)
lasso.fit(X_train, y_train)

# Predict on test data
y_pred = lasso.predict(X_test)



lasso_predictions = pd.DataFrame({'Actual':y_test, 'Predictions':y_pred})
lasso_predictions.head()

Unnamed: 0,Actual,Predictions
9352,21107.0,20367.281487
20640,36590.0,38890.471494
10429,30262.0,26952.728371
21621,30636.0,28391.087001
1503,28971.0,30616.394524


In [None]:
# Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f'RMSE: {rmse:.2f}')
print(f"R² Score: {r2:.2f}")

Mean Squared Error (MSE): 35640762.8194
RMSE: 5969.99
R² Score: 0.6816


In [66]:
# Convert predictions and actuals to DataFrames with matching indices
train_results = pd.DataFrame({
    'actual': y_train,
    'predicted': y_train_pred
}, index=y_train.index)

test_results = pd.DataFrame({
    'actual': y_test,
    'predicted': y_pred
}, index=y_test.index)


# Combine results
results_df = pd.concat([train_results, test_results]).sort_index().reset_index()
results_df

Unnamed: 0,index,actual,predicted
0,0,54818.0,53305.445771
1,1,48271.0,38851.046433
2,2,38141.0,28430.869302
3,3,44105.0,17843.517213
4,4,46249.0,35862.823685
...,...,...,...
21771,21771,28315.0,25541.019100
21772,21772,41891.0,32999.632505
21773,21773,41495.0,41662.148729
21774,21774,44808.0,44432.662060


In [137]:
# Build random forest model - 21776 cases from 2005-2013
modeldata = df_encoded[:21776]

# Define features (X) and target variable (y)
X = modeldata.drop(columns=['attendance'])
y = modeldata['attendance']

# Split data into training / test (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=29)
# n_train = int(len(X) * 0.8)

# X_train = X.iloc[:n_train]
# y_train = y.iloc[:n_train]

# X_test = X.iloc[n_train:]
# y_test = y.iloc[n_train:]

# Initialize the Random Forest regressor
model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10, min_samples_leaf=3, min_samples_split=10)


# Fit the model
model.fit(X_train, y_train)

# Predict on the training set
y_train_pred = model.predict(X_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Training metrics
mse = mean_squared_error(y_train, y_train_pred)
rmse = mse**(1/2)
r2 = r2_score(y_train, y_train_pred)

print(f'Training Mean Squared Error (MSE): {mse:.2f}')
print(f'Training RMSE: {rmse:.2f}')
print(f'Training R-squared Score (R²): {r2:.2f}')


# Test metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(1/2)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R-squared Score (R²): {r2:.2f}')

Training Mean Squared Error (MSE): 26882570.40
Training RMSE: 5184.84
Training R-squared Score (R²): 0.76
Mean Squared Error (MSE): 30996346.65
RMSE: 5567.44
R-squared Score (R²): 0.73


In [None]:
# Random Forest hyperparameter tuning
# Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
# Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}


In [132]:
# xgboost model - 21776 cases from 2005-2013
modeldata = df_encoded[:21776]

# Define features (X) and target variable (y)
X = modeldata.drop(columns=['attendance'])
y = modeldata['attendance']

# Split data into training / test (80% train, 20% test)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=29)
n_train = int(len(X) * 0.8)

X_train = X.iloc[:n_train]
y_train = y.iloc[:n_train]

X_test = X.iloc[n_train:]
y_test = y.iloc[n_train:]

In [133]:
# Initialize the XGBoost regressor
xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=9,
    learning_rate=0.1,
    subsample=1,
    colsample_bytree=0.8,
    min_child_weight=4,
    
    random_state=42
)

In [134]:
# Fit the model
xgb_model.fit(X_train, y_train)

# Predict on training set
y_train_pred = xgb_model.predict(X_train)

# Predict on test set
y_pred = xgb_model.predict(X_test)

In [135]:
# Training metrics
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
r2_train = r2_score(y_train, y_train_pred)

print(f'Training Mean Squared Error (MSE): {mse_train:.2f}')
print(f'Training RMSE: {rmse_train:.2f}')
print(f'Training R²: {r2_train:.2f}')

# Testing metrics
mse_test = mean_squared_error(y_test, y_pred)
rmse_test = mse_test ** 0.5
r2_test = r2_score(y_test, y_pred)

print(f'Test Mean Squared Error (MSE): {mse_test:.2f}')
print(f'Test RMSE: {rmse_test:.2f}')
print(f'Test R²: {r2_test:.2f}')

Training Mean Squared Error (MSE): 5467382.06
Training RMSE: 2338.24
Training R²: 0.95
Test Mean Squared Error (MSE): 30659199.54
Test RMSE: 5537.07
Test R²: 0.68


In [70]:
modeldata

Unnamed: 0,cat__week_1,cat__week_2,cat__week_3,cat__week_4,cat__week_5,cat__week_6,cat__week_7,cat__week_8,cat__week_9,cat__week_10,...,year,day_night,precip,capacity,prev_year_wins,made_playoffs,won_division,InstagramFollowers,CityPopulation,attendance
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005.0,1.0,1.0,56937.0,101.0,1.0,1.0,3900000.0,19940274.0,54818.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005.0,0.0,0.0,45971.0,78.0,0.0,0.0,746000.0,2859024.0,48271.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005.0,0.0,0.0,40615.0,83.0,0.0,0.0,664000.0,9408576.0,38141.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005.0,0.0,0.0,41083.0,72.0,0.0,0.0,1000000.0,4400587.0,44105.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005.0,0.0,1.0,47943.0,63.0,0.0,0.0,927000.0,4145494.0,46249.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013.0,0.0,1.0,36742.0,69.0,0.0,0.0,783000.0,6457988.0,28315.0
21772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013.0,0.0,0.0,41922.0,74.0,0.0,0.0,1700000.0,19940274.0,41891.0
21773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013.0,0.0,0.0,41915.0,94.0,1.0,1.0,1600000.0,4648486.0,41495.0
21774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013.0,0.0,1.0,45494.0,88.0,1.0,0.0,1100000.0,2811927.0,44808.0


In [82]:
# Error analysis
modeldata = df_encoded[:21776].reset_index()

errordata = pd.merge(pd.merge(modeldata, glogs_final[['year', 'day_of_week', 'week', 'h_name', 'v_name', 'park_id']].reset_index(), how='left', on='index'), results_df, how='left', on='index').drop(columns=['index', 'attendance'])

errordata['residual'] = errordata['actual'] - errordata['predicted']
errordata['absresidual'] = np.sqrt(np.square(errordata['residual']))
errordata

Unnamed: 0,cat__week_1,cat__week_2,cat__week_3,cat__week_4,cat__week_5,cat__week_6,cat__week_7,cat__week_8,cat__week_9,cat__week_10,...,year_y,day_of_week_y,week,h_name,v_name,park_id,actual,predicted,residual,absresidual
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005,Sun,1,NYY,BOS,Old Yankee Stadium,54818.0,53305.445771,1512.554229,1512.554229
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005,Mon,1,BAL,OAK,Camden Yards,48271.0,38851.046433,9419.953567,9419.953567
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005,Mon,1,CHW,CLE,US Cellular Field,38141.0,28430.869302,9710.130698,9710.130698
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005,Mon,1,DET,KC,Comerica Park,44105.0,17843.517213,26261.482787,26261.482787
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2005,Mon,1,SEA,MIN,Safeco Field,46249.0,35862.823685,10386.176315,10386.176315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013,Sun,24,MIA,DET,Marlins Park,28315.0,25541.019100,2773.980900,2773.980900
21772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013,Sun,24,NYM,MIL,Citi Field,41891.0,32999.632505,8891.367495,8891.367495
21773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013,Sun,24,SF,SD,AT&T Park,41495.0,41662.148729,-167.148729,167.148729
21774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2013,Sun,24,STL,CHC,Busch Stadium III,44808.0,44432.662060,375.337940,375.337940


In [None]:
errordata.to_csv("errordata_final.csv", index=False)