In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics
%matplotlib inline

**When we don't have the validation dataset**

In [2]:
df = pd.read_csv('/content/drive/MyDrive/IST347/Week_6/Advertising.csv')
X = df.drop('sales',axis=1)
y = df.sales

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state =101)
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train) # it is same as scaler.fit(X_train) first, then do the scaler.transform(X_test)
scaled_X_test = scaler.transform(X_test);

Test with first alpha

In [4]:
model = Ridge(alpha=100)
model.fit(scaled_X_train, y_train)

Ridge(alpha=100, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [5]:
y_pred = model.predict(scaled_X_test)

In [6]:
metrics.mean_squared_error(y_test, y_pred)

7.34177578903413

Because of the high error, test with a different alpha

In [7]:
model2 = Ridge(alpha=1)

In [8]:
model2.fit(scaled_X_train,y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [9]:
y_pred2 = model2.predict(scaled_X_test)

In [10]:
metrics.mean_squared_error(y_test, y_pred2)

2.319021579428752

In this case, when we see the model1 creates a 7.34 error rate, we felt it's too high. Then we changed the alpha to 1 to try if there's a lower error. Therefore, some people would argue that you create a data leakage because you test the test dataset to get the lowest error. That's why we need to create a validation dataset before we apply the model to test dataset.

**Create the validation dataset before test**

In [11]:
df2 = pd.read_csv('/content/drive/MyDrive/IST347/Week_6/Advertising.csv')
X2 = df2.drop('sales',axis=1)
y2 = df2.sales

To create a validation dataset, we only need to do train_test_split twice

In [12]:
X_train_v, X_rem, y_train_v, y_rem = train_test_split(X2, y2,test_size = 0.3, random_state =101) #break dataset to 70-30 (train & remain)

In [13]:
X_valid, X_test_v, y_valid, y_test_v= train_test_split(X_rem, y_rem,test_size = 0.5, random_state =101) # break remain 50-50

In [14]:
#check the len of different dataset
# percentage should be [1, 0.7, 0.15,0.15]
L = [len(X2), len(X_train_v), len(X_valid), len(X_test_v)]
print(L)

[200, 140, 30, 30]


In [15]:
# create a scaler function
scaler_v = StandardScaler()

In [16]:
# scale three training dataset
scaled_X_train_v = scaler_v.fit_transform(X_train_v) #only use scale from X_train, then transform all datasets
scaled_X_valid = scaler_v.transform(X_valid)
scaled_X_test_v = scaler_v.transform(X_test_v)

Fit training set with first alpha

In [17]:
model_v1 = Ridge(alpha=100)
model_v1.fit(scaled_X_train_v, y_train_v)

Ridge(alpha=100, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

Predict on **Validation** dataset

In [18]:
y_pred_valid_v1 = model_v1.predict(scaled_X_valid)
metrics.mean_squared_error(y_valid,y_pred_valid_v1)

7.320101458823871

Fit training set with second alpha

In [19]:
model_v2 = Ridge(alpha=1)
model_v2.fit(scaled_X_train_v, y_train_v)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

Predict on **Validation** dataset

In [20]:
y_pred_valid_v2 = model_v2.predict(scaled_X_valid)
metrics.mean_squared_error(y_valid,y_pred_valid_v2)

2.383783075056986

In this case, let's assume we are happy with the result from the validation dataset. Then, we can apply this model to the test dataset, which never being seen.

**Apply the prediction model on the test dataset**

In [21]:
y_pred_final = model_v2.predict(scaled_X_test_v)
metrics.mean_squared_error(y_test_v,y_pred_final)

2.2542600838005176

Now, we can say this prediction is good because no one see the test data until we do it in line 23! 

**K-Fold Cross validation**

To fairly train on the training dataset, not only some part of it by random selection, we should to a cross validation

K-Fold model 1, alpha = 100

In [22]:
model_KF_1 = Ridge(alpha=100)

In [23]:
score_KF_1 = cross_val_score(model_KF_1, scaled_X_train_v, y_train_v, cv = 5, scoring = 'neg_mean_squared_error' )

In [24]:
abs(score_KF_1.mean())

8.215396464543606

K-Fold model 2, alpha = 100

In [25]:
model_KF_2 = Ridge(alpha=1)

In [26]:
score_KF_2 = cross_val_score(model_KF_2, scaled_X_train_v, y_train_v, cv = 5, scoring = 'neg_mean_squared_error' )

In [27]:
abs(score_KF_2.mean()) 

3.344839296530695

If you are happy with the model cross validation, fit it to train dataset


In [28]:
#Because KF model never fit the dataset before. Fit dataset to the final model, then predict on test dataset
model_KF_2.fit(scaled_X_train_v, y_train_v)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [29]:
y_pred_final_2 = model_KF_2.predict(scaled_X_test_v)
metrics.mean_squared_error(y_test_v, y_pred_final_2)

2.2542600838005176

**Cross Validation**

In [30]:
model_CV = Ridge(alpha = 100)

In [31]:
scores_KV = cross_validate(model_CV, scaled_X_train_v, y_train_v,  scoring=['neg_mean_squared_error','neg_mean_absolute_error'])

In [32]:
scores_KV

{'fit_time': array([0.00206518, 0.00177288, 0.00082994, 0.00196195, 0.00162578]),
 'score_time': array([0.00430202, 0.00140119, 0.00061393, 0.00201726, 0.00060964]),
 'test_neg_mean_absolute_error': array([-2.31243044, -1.74653361, -2.56211701, -2.01873159, -2.27951906]),
 'test_neg_mean_squared_error': array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
         -8.38562723])}

In [33]:
score = pd.DataFrame(data= scores_KV)
score

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.002065,0.004302,-9.32553,-2.31243
1,0.001773,0.001401,-4.944962,-1.746534
2,0.00083,0.000614,-11.396652,-2.562117
3,0.001962,0.002017,-7.024211,-2.018732
4,0.001626,0.00061,-8.385627,-2.279519
