### Split: Train, Validation, Test Sets

In [2]:
# Read Back
import pickle

with open('datasetIPPD.pickle', 'rb') as handle:
    data = pickle.load(handle)

In [3]:
data.keys()

dict_keys(['X_train', 'y_train', 'X_test', 'y_test'])

In [160]:
X_train = data['X_train']
y_train = data['y_train']

X_test = data['X_test']
y_test = data['y_test']

In [None]:
LassoCVModel('datasetIPPD.pickle')

Dataset size read: train 256 and test 65 
Search Space (# of alphas) : 20 
model.fit(X_train, y_train)...




In [170]:
def LassoCVModel(filename):
    #open file and get the dictionary
    import pickle
    from sklearn.linear_model import LassoCV
    from sklearn.metrics import mean_squared_error 

    with open(filename, 'rb') as handle:
        data = pickle.load(handle)

    #extract X_train, y_train, X_test, t_test
    X_train = data['X_train']
    y_train = data['y_train']
    X_test = data['X_test']
    y_test = data['y_test']
    print("Dataset size read: train %d and test %d " %(len(y_train), len(y_test)))
    
    #Normalize
    from sklearn import preprocessing
    X_train = preprocessing.normalize(X_train, norm='l1')
    X_test  = preprocessing.normalize(X_test,  norm='l1')
    
    alphas = np.logspace(-25, -.5, 20)
    print("Search Space (# of alphas) : %d " % len(alphas))
    model = LassoCV(alphas=alphas, random_state=0, eps=.001, n_alphas=100, fit_intercept=True, 
                   normalize=False, precompute='auto', max_iter=1000000, 
                   tol=0.0001, copy_X=True, cv=None, verbose=False, n_jobs=-1, 
                   positive=False, selection='random')

    print("model.fit(X_train, y_train)...")
    model.fit(X_train, y_train)
    
    pos = np.where(alphas == model.alpha_)
    idx = pos[0][0]
    mse_optimal = model.mse_path_[idx]
    mean_rmse_cv   = np.sqrt(mse_optimal).mean()

    print("mse_optimal %s \n" % mse_optimal)

    # Model's performance based CV (compare this with others for model selection)
    print("Mean of RMSE on %d fold CV: %f" % (len(mse_optimal), mean_rmse_cv))

    #Reporting Score on Test Set
    y_predict = model.predict(X_test)
    reporting_testscore = np.sqrt(mean_squared_error(y_test, y_predict))
    
    return {filename: {'mean_rmse_cv': mean_rmse_cv, 'reporting_testscore':reporting_testscore }}



# Normalize X_train and X_test

In [67]:
from sklearn import preprocessing

X_train = preprocessing.normalize(X_train, norm='l1')
X_test  = preprocessing.normalize(X_test,  norm='l1')

<h1> Model 1 : LassoCV </h1>

In [128]:
from __future__ import print_function
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

alphas = np.logspace(-25, -.5, 20)

Automatically created module for IPython interactive environment


In [97]:
alphas

array([  1.00000000e-20,   1.06246783e-19,   1.12883789e-18,
         1.19935395e-17,   1.27427499e-16,   1.35387618e-15,
         1.43844989e-14,   1.52830673e-13,   1.62377674e-12,
         1.72521055e-11,   1.83298071e-10,   1.94748304e-09,
         2.06913808e-08,   2.19839265e-07,   2.33572147e-06,
         2.48162892e-05,   2.63665090e-04,   2.80135676e-03,
         2.97635144e-02,   3.16227766e-01])

In [129]:
model = LassoCV(alphas=alphas, random_state=0, eps=.001, n_alphas=100, fit_intercept=True, 
                   normalize=False, precompute='auto', max_iter=1000000, 
                   tol=0.0001, copy_X=True, cv=None, verbose=False, n_jobs=-1, 
                   positive=False, selection='random')

In [130]:
%time model.fit(X_train, y_train)



CPU times: user 3min 58s, sys: 58.3 ms, total: 3min 58s
Wall time: 1min 37s


LassoCV(alphas=array([  1.00000e-25,   1.94748e-24,   3.79269e-23,   7.38620e-22,
         1.43845e-20,   2.80136e-19,   5.45559e-18,   1.06247e-16,
         2.06914e-15,   4.02961e-14,   7.84760e-13,   1.52831e-11,
         2.97635e-10,   5.79639e-09,   1.12884e-07,   2.19839e-06,
         4.28133e-05,   8.33782e-04,   1.62378e-02,   3.16228e-01]),
    copy_X=True, cv=None, eps=0.001, fit_intercept=True, max_iter=1000000,
    n_alphas=100, n_jobs=-1, normalize=False, positive=False,
    precompute='auto', random_state=0, selection='random', tol=0.0001,
    verbose=False)

In [131]:
print("[alpha: {0:0.20f}, score: {1:.5f}".
          format(model.alpha_, model.score(X_test, y_test)))

[alpha: 0.00000000000000206914, score: 0.70210


In [132]:
# MSE for each alpha 
# You can extract mean MSE of the *optimal combination from mse_path_
# This mse should be compared with other models, for model selection !

In [133]:
# Chosen Alpha
model.alpha_

2.0691380811147901e-15

In [134]:
#position of alpha that was picked (optimal)
pos = np.where(alphas == model.alpha_)
pos

(array([8]),)

In [135]:
#extract the index
idx = pos[0][0]
idx

8

In [146]:
# Get MSE of this index corresponding to optimal alpha

# mse_path_ : array, shape (n_alphas, n_folds)

mse_optimal = model.mse_path_[idx]

print("mse_optimal %s \n" % mse_optimal)

# Lasso's performance based on 3 fold CV (compare this with others for model selection)
print("Mean of RMSE on %d fold CV: %f" % (len(mse_optimal), np.sqrt(mse_optimal).mean()))

mse_optimal [ 361578.91094663  305504.14989661  309913.57045708] 

Mean of RMSE on 3 fold CV: 570.245811


################################################################################
################################################################################
############################  R E P O R T I N G #####################################
################################################################################
################################################################################

In [147]:
from sklearn.metrics import mean_squared_error 

y_predict = model.predict(X_test) #Reduce X to the selected features and then predict using the underlying estimator.
rmse_model = np.sqrt(mean_squared_error(y_test, y_predict))
print("(for reporting only) Test Error once optimal model is picked by CV : %f" % rmse_model)

(for reporting only) Test Error once optimal model is picked by CV : 478.491177
