#### Cross Validation

The **cross_val_score** function takes a model and a training set (along with a value for K and a chosen scoring metric) and performs all of this for us automatically!

This allows K-fold cross validation to be performed on any model.

In [13]:
import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt 
import seaborn as sns 

In [14]:
# Load the advertising dataset (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv("../DATA/Advertising.csv")
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [15]:
# Target vector: the 'sales' column.
y = df['sales']
# Feature matrix: every remaining column.
X = df.drop(columns='sales')

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 30% of the rows as the final test split; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
# Create the scaler that will standardize the feature columns.
scaler = StandardScaler()

In [20]:
# Fit the scaler on the training split only, so nothing is learned from X_test.
scaler.fit(X_train)

In [21]:
# Overwrite X_train with its scaled version.
# NOTE: this cell is not idempotent — re-running it would scale the already-scaled data again.
X_train = scaler.transform(X_train)

In [22]:
# Apply the scaler fitted on the training split to the test split (no refitting).
# NOTE: like the X_train cell, re-running this would scale the data a second time.
X_test = scaler.transform(X_test)

In [25]:
from sklearn.linear_model import Ridge
# First candidate: ridge regression with regularization strength alpha=100.
model = Ridge(alpha=100)

In [26]:
from sklearn.model_selection import cross_val_score

In [28]:
# 5-fold cross validation on the training split only; X_test / y_test are not touched here.
# With scoring='neg_mean_squared_error' each fold's score is the negated MSE.
# NOTE(review): X_train was already scaled by a scaler fitted on the whole training
# split, so every validation fold contributed to the scaling statistics. Wrapping
# StandardScaler and the model in a sklearn Pipeline would refit the scaler per fold —
# confirm whether that matters for this demo.
scores = cross_val_score(model,X_train,y_train,scoring='neg_mean_squared_error',cv=5)

In [29]:
# One negated-MSE value per fold (closer to 0 is better).
scores

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

In [30]:
# Average the per-fold scores and drop the sign to report a positive mean squared error.
abs(scores.mean())

8.215396464543607

Try again with a different alpha value.

In [31]:
# Second candidate: a smaller penalty (alpha=1), evaluated with the same 5-fold setup
# on the training split so the two alphas can be compared.
model = Ridge(alpha=1)
scores = cross_val_score(model,X_train,y_train,scoring='neg_mean_squared_error',cv=5)

In [32]:
# Mean cross-validated MSE (sign flipped back to positive) for the current model.
abs(scores.mean())

3.344839296530695

In [33]:
# Fit the current `model` on the entire training split
# (cross_val_score does not fit the estimator object passed to it).
model.fit(X_train,y_train)

In [34]:
# Predict on the held-out test split, which played no part in cross validation.
y_final_test_pred = model.predict(X_test)

In [35]:
from sklearn.metrics import mean_squared_error
# Final hold-out evaluation: MSE between the true test targets and the predictions.
mean_squared_error(y_test,y_final_test_pred)

2.3190215794287514

## cross_validate 
#### Using the cross_validate function

The **cross_validate** function lets us view multiple performance metrics from cross validation on a model, and also reports how long fitting and scoring took for each fold.

In [37]:
## Rebuild the features and the target from the original frame

y = df['sales']
X = df.drop(columns='sales')

# Hold out 30% of the rows for the final test; fixed seed for reproducibility

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# Standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the scaling from the training split and apply it in a single step
X_train = scaler.fit_transform(X_train)
# The test split is only transformed, never fitted on
X_test = scaler.transform(X_test)


In [38]:
from sklearn.model_selection import cross_validate

In [39]:
# Start again from the alpha=100 ridge model (Ridge is imported in an earlier cell).
model = Ridge(alpha=100)

In [41]:
# 10-fold cross validation on the training split, scoring every fold with two
# metrics at once (negated MSE and negated MAE).
# NOTE(review): X_train was scaled with a scaler fitted on the whole training split
# before the folds are made, so validation folds influenced the scaling statistics;
# a Pipeline(StandardScaler, Ridge) would avoid that — confirm whether it matters here.
scores = cross_validate(
    model,
    X_train,
    y_train,
    scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
    cv=10,
)

In [42]:
# The result is a dict of per-fold arrays: fit_time, score_time and one
# test_<metric> entry for each requested scorer.
scores

{'fit_time': array([0.00093794, 0.00054789, 0.0005331 , 0.00063109, 0.00062585,
        0.00046206, 0.00044394, 0.000458  , 0.00078106, 0.00141215]),
 'score_time': array([0.00069404, 0.00040698, 0.00045991, 0.00041604, 0.00039601,
        0.00036478, 0.00037503, 0.00038791, 0.00046611, 0.00053096]),
 'test_neg_mean_squared_error': array([ -6.06067062, -10.62703078,  -3.99342608,  -5.00949402,
         -9.14179955, -13.08625636,  -3.83940454,  -9.05878567,
         -9.05545685,  -5.77888211]),
 'test_neg_mean_absolute_error': array([-1.8102116 , -2.54195751, -1.46959386, -1.86276886, -2.52069737,
        -2.45999491, -1.45197069, -2.37739501, -2.44334397, -1.89979708])}

In [43]:
# Turn the dict of per-fold arrays into a DataFrame (one row per fold).
# Note: this rebinds `scores` from the dict to the DataFrame.
scores = pd.DataFrame(scores)

In [44]:
# Per-fold timings and negated error metrics for the alpha=100 model.
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.000938,0.000694,-6.060671,-1.810212
1,0.000548,0.000407,-10.627031,-2.541958
2,0.000533,0.00046,-3.993426,-1.469594
3,0.000631,0.000416,-5.009494,-1.862769
4,0.000626,0.000396,-9.1418,-2.520697
5,0.000462,0.000365,-13.086256,-2.459995
6,0.000444,0.000375,-3.839405,-1.451971
7,0.000458,0.000388,-9.058786,-2.377395
8,0.000781,0.000466,-9.055457,-2.443344
9,0.001412,0.000531,-5.778882,-1.899797


In [45]:
# Column-wise mean over the folds; the error metrics are negated, so values closer to 0 are better.
scores.mean()

fit_time                        0.000683
score_time                      0.000450
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

In [46]:
# Switch to the smaller penalty (alpha=1) for comparison against alpha=100.
model = Ridge(alpha=1)

In [47]:
# Same 10-fold, two-metric evaluation as before, collected straight into a
# DataFrame with one row per fold.
scores = pd.DataFrame(
    cross_validate(
        model,
        X_train,
        y_train,
        scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
        cv=10,
    )
)

In [48]:
# Per-fold timings and negated error metrics for the alpha=1 model.
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.001035,0.000586,-2.962508,-1.457174
1,0.000946,0.003545,-3.057378,-1.555308
2,0.008366,0.005539,-2.17374,-1.23877
3,0.007034,0.020898,-0.833034,-0.768938
4,0.000542,0.000331,-3.464018,-1.434489
5,0.000491,0.000522,-8.232647,-1.494316
6,0.000434,0.000345,-1.905864,-1.081362
7,0.000386,0.000289,-2.765048,-1.250011
8,0.000461,0.000648,-4.989505,-1.580971
9,0.000479,0.002427,-2.846438,-1.223326


In [49]:
# Column-wise mean over the folds, to compare against the alpha=100 averages.
scores.mean()

fit_time                        0.002017
score_time                      0.003513
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

In [50]:
# Fit the current `model` on the entire training split before the final test evaluation.
model.fit(X_train,y_train)

In [51]:
# Predict on the held-out test split, which was not used during cross validation.
y_final_pred = model.predict(X_test)

In [52]:
# Final hold-out MSE (mean_squared_error is imported in an earlier cell).
mean_squared_error(y_test,y_final_pred)

2.3190215794287514