In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import sklearn as sk
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.pipeline import Pipeline

import warnings

In [2]:
energyFrame = pd.read_csv("UCI_data.csv")
energy = energyFrame.values

In [3]:
# split the data
X = energy[:, 1:(energy.shape[1]-1)]
y = energy[:, -1]

In [4]:
def search_results(bestModel):
    best = bestModel.best_index_
    results = bestModel.cv_results_

    train_scores = [results['split0_train_score'][best],
                    results['split1_train_score'][best],
                    results['split2_train_score'][best],
                    results['split3_train_score'][best],
                    results['split3_train_score'][best]]

    test_scores = [results['split0_test_score'][best],
                   results['split1_test_score'][best],
                   results['split2_test_score'][best],
                   results['split3_test_score'][best],
                   results['split3_test_score'][best]]


    print("BEST ESTIMATOR: {}".format(results['params'][best]))
    print()
    for i in range(0,5,1):
        print("Fold {}: (Train MSE: {:.4f}, test MSE: {:.4f})".format(i+1,
                                                                      train_scores[i],
                                                                      test_scores[i]))
    print()
    print("Mean train score: {}".format(results['mean_train_score'][best]))
    print("Std train score: {}".format(results['std_train_score'][best]))
    print()
    print("Mean test score: {}".format(results['mean_test_score'][best]))
    print("Std test score: {}".format(results['std_test_score'][best]))
    print()

In [5]:
def cv_results(scores):
    print("OVERALL CROSS-VALIDATED SCORE\n")
    for i,score in enumerate(list(scores)):
        print("Fold {} cross-val score: {:.4f}".format(i+1,score))
    print()
    print("Mean cross-val. score: {}".format(np.mean(scores)))

In [8]:
# Parameter search values

polyParam = list(range(1,4)) # polynomials of degree 1,2,3,4
alphaParam = [0.01, 0.1, 0.5, 1, 5, 10] # regularisation parameters

In [9]:
# Nested cross-validation to find best parameters for Lasso regression
warnings.simplefilter("ignore")

inner_cv = KFold(n_splits=5, shuffle=True, random_state=1234)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1234)

polyLassoModel = Pipeline([("poly", PolynomialFeatures(include_bias=True)),
                           ("scaler", MinMaxScaler()),
                           ("lasso", linear_model.Lasso())])

searchParam = dict(poly__degree=polyParam,
                   lasso__alpha=alphaParam)

bestLassoModel = RandomizedSearchCV(estimator=polyLassoModel,
                                    param_distributions=searchParam,
                                    scoring="neg_mean_squared_error",
                                    cv=inner_cv,
                                    refit=True,
                                    return_train_score=True,
                                    n_iter=10, n_jobs=-1, verbose=10)

# Get search results
bestLassoModel.fit(X,y)
search_results(bestLassoModel)

lasso_cv_scores = cross_val_score(bestLassoModel, X, y, cv=outer_cv)
cv_results(lasso_cv_scores)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  5.2min finished


BEST ESTIMATOR: {'poly__degree': 3, 'lasso__alpha': 0.01}

Fold 1: (Train MSE: -8370.9526, test MSE: -8595.5709)
Fold 2: (Train MSE: -8518.0900, test MSE: -7913.2424)
Fold 3: (Train MSE: -8322.8588, test MSE: -8699.4224)
Fold 4: (Train MSE: -8397.4305, test MSE: -8455.3829)
Fold 5: (Train MSE: -8397.4305, test MSE: -8455.3829)

Mean train score: -8373.612213020535
Std train score: 86.28124365038413

Mean test score: -8543.68826438694
Std test score: 372.4146459874614

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.3min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.5min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.8min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  5.2min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  5.6min finished


OVERALL CROSS-VALIDATED SCORE

Fold 1 cross-val score: -8595.5709
Fold 2 cross-val score: -8513.5626
Fold 3 cross-val score: -8699.4224
Fold 4 cross-val score: -8455.3829
Fold 5 cross-val score: -9054.8228

Mean cross-val. score: -8663.752304536669
