In [1]:
import keyword
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as smf
# from tensorflow import keras

In [2]:
# Single random_state reused by the sampling and cross-validation cells below
# so their results are repeatable.
seed = 13

## Single Layer Network on the Hitters Data

In [3]:
# Load the raw table; the path is relative to the notebook's working directory.
df_hit = pd.read_csv('../data/hitters.csv')
# Keep only rows without missing values; the cells below work from this copy
# and leave `df_hit` untouched.
df_gitter = df_hit.dropna()

In [4]:
# Preview the first two rows that survived dropna().
df_gitter.head(2)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A


In [5]:
# (rows, columns) of the cleaned frame.
df_gitter.shape

(263, 20)

In [6]:
# Hold out a seeded 33% sample of rows as the test set; every remaining row
# goes to the training set.
test_id = df_gitter.sample(frac=0.33, random_state=seed).index
# Boolean mask over df_gitter's rows: True where the row was NOT sampled.
train_id_bool = ~df_gitter.index.isin(test_id)
gitter_train = df_gitter.loc[train_id_bool, :]
gitter_test = df_gitter.loc[test_id, :]

### Linear Regression

In [7]:
# https://stackoverflow.com/questions/35518477/statsmodels-short-way-of-writing-formula
def formula_from_cols(df: pd.DataFrame, y: str) -> str:
    """Build a formula string ``y ~ x1 + x2 + ...`` from a frame's columns.

    Every column of ``df`` except ``y`` becomes a predictor term, in column
    order.

    Args:
        df: Frame whose column names (expected to be strings) supply the terms.
        y: Name of the response column; it is excluded from the right-hand side.

    Returns:
        The formula string. Names that are plain identifiers are used verbatim,
        so ``['Salary', 'AtBat', 'Hits']`` with ``y='Salary'`` gives
        ``'Salary ~ AtBat + Hits'``. Names that are not valid identifiers, or
        that are Python reserved words, are wrapped as ``Q('name')`` so the
        formula stays parseable. If there are no predictor columns the
        right-hand side is ``1`` (intercept only).
    """
    def as_term(name: str) -> str:
        # A bare name is only safe in a formula when it is an ordinary
        # identifier; anything else (spaces, punctuation, reserved words)
        # is passed through the Q() quoting helper of patsy-style formulas.
        if name.isidentifier() and not keyword.iskeyword(name):
            return name
        return f'Q({name!r})'

    predictors = [as_term(col) for col in df.columns if col != y]
    # Avoid emitting a dangling "y ~ " when y is the only column.
    rhs = ' + '.join(predictors) if predictors else '1'
    return f'{as_term(y)} ~ {rhs}'

In [8]:
# Formula with Salary as the response and every other column as a predictor.
gitter_formula = formula_from_cols(df_gitter, 'Salary')
print(gitter_formula)

Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + League + Division + PutOuts + Assists + Errors + NewLeague


In [9]:
# Ordinary least squares fitted on the training split, then scored by mean
# absolute error on the held-out split.
l_fit = smf.ols(gitter_formula, data=gitter_train).fit()
l_pred = l_fit.predict(gitter_test).to_numpy()
lm_gitter_mae = np.abs(l_pred - gitter_test['Salary'].to_numpy()).mean()
print(lm_gitter_mae)

242.99282793560906


### Lasso Regression

In [10]:
def get_x_y_gitter(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a frame into dummy-encoded features and the ``Salary`` target.

    Args:
        df: Frame containing a ``Salary`` column plus predictor columns.

    Returns:
        ``(x, y)`` where ``x`` is ``df`` without ``Salary`` and with every
        object-dtype column replaced by indicator columns (the first level of
        each is dropped), and ``y`` is the ``Salary`` column.

    Raises:
        KeyError: If ``df`` has no ``Salary`` column.
    """
    # Detect the categorical columns on the frame that was passed in, not on
    # a notebook-level variable, so the function works for any input frame
    # and does not depend on kernel state.
    object_cols = list(df.select_dtypes('object').columns)
    encoded = pd.get_dummies(df, columns=object_cols, drop_first=True)
    x = encoded.drop('Salary', axis=1)
    y = df['Salary']
    return x, y

In [11]:
# Build the feature matrix and target for each split.
# NOTE(review): indicator columns are derived independently per split, so a
# category level absent from one split would leave train and test with
# different columns - confirm both splits contain every level.
gitter_train_x, gitter_train_y = get_x_y_gitter(gitter_train)
gitter_test_x, gitter_test_y = get_x_y_gitter(gitter_test)

In [12]:
def scale_to_df(df: pd.DataFrame, 
                std_scale: StandardScaler) -> pd.DataFrame:
    """Run ``df`` through an already-fitted scaler and return a DataFrame.

    Args:
        df: Feature frame to transform.
        std_scale: Scaler whose ``transform`` is applied to ``df``; it is not
            fitted here.

    Returns:
        A new frame holding the transformed values under ``df``'s column
        names. No index is passed to the constructor, so when ``transform``
        returns a bare array the rows are not labelled with ``df``'s index.
    """
    scaled_values = std_scale.transform(df)
    column_names = list(df.columns)
    return pd.DataFrame(scaled_values, columns=column_names)

In [13]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so the test data does not influence the scaling.
std_scale = StandardScaler()
# Binding the return value to `_` keeps the cell from echoing the fitted scaler.
_ = std_scale.fit(gitter_train_x)
gitter_train_x_scaled = scale_to_df(gitter_train_x, std_scale)
gitter_test_x_scaled = scale_to_df(gitter_test_x, std_scale)

In [14]:
# Lasso with its penalty strength selected by 5-fold cross-validation on the
# scaled training features; seeded for repeatability.
cv_fit = LassoCV(cv=5, random_state=seed).fit(gitter_train_x_scaled, 
                                              gitter_train_y)

In [15]:
# Predict on the scaled test features and report the mean absolute error
# against the test targets as the cell's output.
c_pred = cv_fit.predict(gitter_test_x_scaled)
np.mean(np.abs(gitter_test_y - c_pred))

223.45749081018576

### Neural Network