In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import sklearn as sk
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.pipeline import Pipeline

import warnings

In [2]:
energyFrame = pd.read_csv("UCI_data.csv")
energy = energyFrame.values

In [3]:
# split the data
X = energy[:, 1:(energy.shape[1]-1)]
y = energy[:, -1]

In [4]:
def search_results(bestModel):
    best = bestModel.best_index_
    results = bestModel.cv_results_

    train_scores = [results['split0_train_score'][best],
                    results['split1_train_score'][best],
                    results['split2_train_score'][best],
                    results['split3_train_score'][best],
                    results['split3_train_score'][best]]

    test_scores = [results['split0_test_score'][best],
                   results['split1_test_score'][best],
                   results['split2_test_score'][best],
                   results['split3_test_score'][best],
                   results['split3_test_score'][best]]


    print("BEST ESTIMATOR: {}".format(results['params'][best]))
    print()
    for i in range(0,5,1):
        print("Fold {}: (Train MSE: {:.4f}, test MSE: {:.4f})".format(i+1,
                                                                      train_scores[i],
                                                                      test_scores[i]))
    print()
    print("Mean train score: {}".format(results['mean_train_score'][best]))
    print("Std train score: {}".format(results['std_train_score'][best]))
    print()
    print("Mean test score: {}".format(results['mean_test_score'][best]))
    print("Std test score: {}".format(results['std_test_score'][best]))
    print()

In [5]:
def cv_results(scores):
    print("OVERALL CROSS-VALIDATED SCORE\n")
    for i,score in enumerate(list(scores)):
        print("Fold {} cross-val score: {:.4f}".format(i+1,score))
    print()
    print("Mean cross-val. score: {}".format(np.mean(scores)))

In [6]:
# Parameter search values

polyParam = list(range(1,4)) # polynomials of degree 1,2,3,4
alphaParam = [0.01, 0.1, 0.5, 1, 5, 10] # regularisation parameters

In [7]:
# Nested cross-validation to find best parameters for Lasso regression
warnings.simplefilter("ignore")

inner_cv = KFold(n_splits=5, shuffle=True, random_state=1234)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1234)

polyLassoModel = Pipeline([("poly", PolynomialFeatures(include_bias=True)),
                           ("scaler", MinMaxScaler()),
                           ("lasso", linear_model.Lasso())])

searchParam = dict(poly__degree=polyParam,
                   lasso__alpha=alphaParam)

bestLassoModel = RandomizedSearchCV(estimator=polyLassoModel,
                                    param_distributions=searchParam,
                                    scoring="neg_mean_squared_error",
                                    cv=inner_cv,
                                    refit=True,
                                    return_train_score=True,
                                    n_iter=10, n_jobs=-1, verbose=2)

# Get search results
bestLassoModel.fit(X,y)
search_results(bestLassoModel)

lasso_cv_scores = cross_val_score(bestLassoModel, X, y, cv=outer_cv)
cv_results(lasso_cv_scores)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.7min finished


BEST ESTIMATOR: {'poly__degree': 2, 'lasso__alpha': 0.01}

Fold 1: (Train MSE: -8711.2749, test MSE: -8917.1478)
Fold 2: (Train MSE: -8877.9468, test MSE: -8160.6759)
Fold 3: (Train MSE: -8695.9843, test MSE: -8940.3905)
Fold 4: (Train MSE: -8747.0708, test MSE: -8719.7419)
Fold 5: (Train MSE: -8747.0708, test MSE: -8719.7419)

Mean train score: -8725.92969312208
Std train score: 90.76870829671266

Mean test score: -8817.526522206412
Std test score: 387.09639793606215

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.5min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  7.7min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.6min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   56.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   59.5s finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  8.1min finished


OVERALL CROSS-VALIDATED SCORE

Fold 1 cross-val score: -9243.3044
Fold 2 cross-val score: -7913.2424
Fold 3 cross-val score: -8940.3905
Fold 4 cross-val score: -8719.7419
Fold 5 cross-val score: -9054.8228

Mean cross-val. score: -8774.300401542676
