In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.linear_model import RidgeCV,Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import watermark
plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
# Record the environment this notebook was run in: load the watermark IPython
# extension, then print the author, last-updated date, Python/IPython versions,
# the versions of the imported packages and the watermark version itself.
%load_ext watermark
%watermark -n -u -v -iv -w -a sushmit86@gmail.com

Author: sushmit86@gmail.com

Last updated: Thu Apr 08 2021

Python implementation: CPython
Python version       : 3.8.2
IPython version      : 7.22.0

numpy     : 1.19.2
watermark : 2.1.0
pandas    : 1.2.3
seaborn   : 0.11.1
matplotlib: 3.3.4

Watermark: 2.1.0



## Reading Data

In [7]:
# Load the raw Hitters table; this frame is left untouched for reference.
df_hitters = pd.read_csv('Data/Hitters.csv')

# Per column: does it contain at least one missing value?
display(df_hitters.isna().any())

# Modelling frame: a separate object holding only the complete rows.
df_hitters_to_model = df_hitters.dropna()
display(df_hitters_to_model.shape)

AtBat        False
Hits         False
HmRun        False
Runs         False
RBI          False
Walks        False
Years        False
CAtBat       False
CHits        False
CHmRun       False
CRuns        False
CRBI         False
CWalks       False
League       False
Division     False
PutOuts      False
Assists      False
Errors       False
Salary        True
NewLeague    False
dtype: bool

(263, 20)

In [15]:
# One-hot encode the categorical columns. drop_first=True drops the first level
# of each categorical, which leaves a single indicator column per two-level
# feature (League_N, Division_W, NewLeague_N in the preview below).
df_hitters_to_model = df_hitters_to_model.pipe(pd.get_dummies, drop_first=True)
df_hitters_to_model.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary,League_N,Division_W,NewLeague_N
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0,1,1,1
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0,0,1,0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0,1,0,1
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5,1,0,1
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0,0,1,0


## Splitting Train/Test and Standardizing Features

In [50]:
# Hold out half of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(df_hitters_to_model, test_size=0.5, random_state=1)

# Every column except the target (Salary) is used as a predictor.
feature_columns = [column for column in train.columns if column != 'Salary']
X_train, y_train = train.loc[:, feature_columns], train.Salary
X_test, y_test = test.loc[:, feature_columns], test.Salary

# Learn the centering/scaling statistics from the training rows only, then apply
# the same transform to the test rows.
scaler = StandardScaler()

# Carry the original row index over to the standardized frames. Rebuilding a
# DataFrame from the scaler's output without `index=` resets it to 0..n-1, which
# leaves X_*_std misaligned with y_* for any index-based pandas operation
# (joins, concat, boolean masks built from y, ...).
X_train_std = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_std = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

## Fitting RidgeCV

In [51]:
# Candidate penalties: 100 values evenly spaced in the exponent, from 10**10 down to 10**-2.
lambda_grid = np.power(10, np.linspace(10, -2, 100))

# Choose the penalty by 5-fold cross-validated R^2 on the standardized training data.
model = RidgeCV(alphas=lambda_grid, cv=5, scoring='r2').fit(X_train_std, y_train)

# NOTE: `best_aplha` (sic) is read by the later cell that refits a plain Ridge,
# so the spelling is kept as-is.
best_aplha = model.alpha_

print('Best Model coef', model.coef_)
print('Best Alpha', best_aplha)
print('Best Score', model.best_score_)

Best Model coef [  3.91135935  36.09635974   1.73668011  19.61165904  32.21913156
  43.97240993   8.49644681  17.99283943  32.5450565   41.51292542
  33.27193592  41.57133374  25.53583306  75.76136575  -2.47595326
  -0.79266655   8.2142983  -41.60836804   5.21195537]
Best Alpha 100.0
Best Score 0.3984544571169031


In [52]:
# Refit a plain Ridge at the penalty selected by cross-validation, then report
# R^2 on the training half and on the held-out half.
best_model = Ridge(alpha=best_aplha, random_state=1).fit(X_train_std, y_train)

for label, features, target in (
    ('R2 Train', X_train_std, y_train),
    ('R2 Test', X_test_std, y_test),
):
    print(label, r2_score(target, best_model.predict(features)))

R2 Train 0.5005517473001011
R2 Test 0.42375938999439855


In [66]:
# Search the same penalty grid with the generic grid-search API: 5-fold CV,
# scored by R^2, one Ridge fit per (alpha, fold) pair.
ridge_model = Ridge(random_state=1)
hyper_parameter = dict(alpha=lambda_grid)

gridsearch = GridSearchCV(
    estimator=ridge_model,
    param_grid=hyper_parameter,
    scoring='r2',
    cv=5,
    verbose=1,
)
gridsearch.fit(X_train_std, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


GridSearchCV(cv=5, estimator=Ridge(random_state=1),
             param_grid={'alpha': array([1.00000000e+10, 7.56463328e+09, 5.72236766e+09, 4.32876128e+09,
       3.27454916e+09, 2.47707636e+09, 1.87381742e+09, 1.41747416e+09,
       1.07226722e+09, 8.11130831e+08, 6.13590727e+08, 4.64158883e+08,
       3.51119173e+08, 2.65608778e+08, 2.00923300e+08, 1.51991108e+08,
       1.14975700e+08, 8.69749003e+07, 6.5...
       6.13590727e+00, 4.64158883e+00, 3.51119173e+00, 2.65608778e+00,
       2.00923300e+00, 1.51991108e+00, 1.14975700e+00, 8.69749003e-01,
       6.57933225e-01, 4.97702356e-01, 3.76493581e-01, 2.84803587e-01,
       2.15443469e-01, 1.62975083e-01, 1.23284674e-01, 9.32603347e-02,
       7.05480231e-02, 5.33669923e-02, 4.03701726e-02, 3.05385551e-02,
       2.31012970e-02, 1.74752840e-02, 1.32194115e-02, 1.00000000e-02])},
             scoring='r2', verbose=1)

In [71]:
# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True, not overridden when `gridsearch` was built), so
# best_estimator_ is already fitted. Calling .fit() on it again would only repeat
# that work and mutate the estimator owned by `gridsearch`.
best_model = gridsearch.best_estimator_

# R^2 on the training half and on the held-out half.
print('R2 Train', r2_score(y_train, best_model.predict(X_train_std)))
print('R2 Test', r2_score(y_test, best_model.predict(X_test_std)))

R2 Train 0.5005517473001011
R2 Test 0.42375938999439855
