In [1]:
# nested cross-validation of ZCA-whitened elastic-net and linear regression pipelines on the states.csv dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.linear_model import ElasticNetCV
import itertools
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn import linear_model
import numpy as np
from numpy import arange
from sklearn.model_selection import RepeatedKFold
import sys
import os
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    warnings.filterwarnings('ignore')
    
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error 
import matplotlib.pyplot as plt
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from random import sample
import random
# Draw one value from 0..9 to use as the shared random_state of the KFold
# splitters and estimators below. The draw itself is unseeded, so the value
# changes between kernel restarts; it is printed so a run can be reproduced
# by hand. The candidates are numpy integers, exactly as before.
(random_st,) = random.sample(list(np.arange(10)), 1)
print(random_st)

9


In [2]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

# ------------------------------------
# file: zca.py
# date: Thu May 21 15:47 2015
# author:
# Maarten Versteegh
# github.com/mwv
# maartenversteegh AT gmail DOT com
#
# Licensed under GPLv3
# ------------------------------------
"""zca: ZCA whitening with a sklearn-like interface

"""

from __future__ import division

import numpy as np
from scipy import linalg

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import check_array, as_float_array

class ZCA(BaseEstimator, TransformerMixin):
    """ZCA whitening transformer with a scikit-learn style fit/transform API.

    ``fit`` estimates the per-feature mean and the sample covariance of the
    centred data, takes the SVD of that covariance (``U, S``) and stores
    ``U diag(1/sqrt(S)) U^T`` as the whitening matrix and
    ``U diag(sqrt(S)) U^T`` as the de-whitening matrix.

    Parameters
    ----------
    regularization : float, default=1e-6
        Lower bound applied to the singular values of the covariance before
        the square root is taken, which keeps ``1/sqrt(S)`` finite.
    copy : bool, default=False
        Default ``copy`` flag handed to the scikit-learn input-validation
        helpers; ``transform`` and ``inverse_transform`` can override it per
        call.

    Attributes
    ----------
    mean_ : ndarray of shape (n_features,)
        Per-feature mean of the training data.
    whiten_ : ndarray of shape (n_features, n_features)
        Whitening matrix applied by ``transform``.
    dewhiten_ : ndarray of shape (n_features, n_features)
        De-whitening matrix applied by ``inverse_transform``.
    """

    def __init__(self, regularization=1e-6, copy=False):
        self.regularization = regularization
        self.copy = copy

    def fit(self, X, y=None):
        """Compute the mean, whitening and dewhitening matrices.

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to compute the mean, whitening and dewhitening
            matrices.
        y : ignored
            Present only so the estimator can be used inside a Pipeline.

        Returns
        -------
        self : ZCA
            The fitted transformer.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or has fewer than two samples (the covariance
            below divides by ``n_samples - 1``).
        TypeError
            If ``X`` is a sparse matrix.
        """
        # accept_sparse=False is the documented way to reject sparse input;
        # None is not among the values check_array documents. Requiring two
        # samples turns the 0/0 covariance of a single-row input into a
        # clear validation error instead of a failure inside the SVD.
        X = check_array(X, accept_sparse=False, copy=self.copy,
                        ensure_2d=True, ensure_min_samples=2)
        X = as_float_array(X, copy=self.copy)
        self.mean_ = X.mean(axis=0)
        X_ = X - self.mean_
        # Unbiased sample covariance of the centred data.
        cov = np.dot(X_.T, X_) / (X_.shape[0]-1)
        U, S, _ = linalg.svd(cov)
        # Clip from below before the square root so the inverse stays finite.
        s = np.sqrt(S.clip(self.regularization))
        s_inv = np.diag(1./s)
        s = np.diag(s)
        self.whiten_ = np.dot(np.dot(U, s_inv), U.T)
        self.dewhiten_ = np.dot(np.dot(U, s), U.T)
        return self

    def transform(self, X, y=None, copy=None):
        """Perform ZCA whitening

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data to whiten along the features axis.
        y : ignored
        copy : bool or None, default=None
            Per-call override of the ``copy`` flag passed to input
            conversion; ``None`` falls back to ``self.copy``.

        Returns
        -------
        ndarray with shape [n_samples, n_features]
            ``(X - mean_)`` multiplied by the transposed whitening matrix.
        """
        check_is_fitted(self, 'mean_')
        # Honour the per-call argument instead of silently ignoring it.
        copy = self.copy if copy is None else copy
        X = as_float_array(X, copy=copy)
        return np.dot(X - self.mean_, self.whiten_.T)

    def inverse_transform(self, X, copy=None):
        """Undo the ZCA transform and rotate back to the original
        representation

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data to rotate back.
        copy : bool or None, default=None
            Per-call override of the ``copy`` flag passed to input
            conversion; ``None`` falls back to ``self.copy``.

        Returns
        -------
        ndarray with shape [n_samples, n_features]
            ``X`` multiplied by the de-whitening matrix, plus ``mean_``.
        """
        check_is_fitted(self, 'mean_')
        # Honour the per-call argument instead of silently ignoring it.
        copy = self.copy if copy is None else copy
        X = as_float_array(X, copy=copy)
        return np.dot(X, self.dewhiten_) + self.mean_


In [3]:
def read_data(source="https://raw.githubusercontent.com/thistleknot/Python-Stock/master/data/raw/states.csv"):
    """Load the states table and index it by its 'States' column.

    Parameters
    ----------
    source : str or path-like or file-like, optional
        Anything ``pandas.read_csv`` accepts. Defaults to the raw GitHub URL
        of ``states.csv``, so calling ``read_data()`` with no argument
        behaves as before; pass a local path to work offline or from a
        cached copy.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with 'States' as the index.

    Raises
    ------
    KeyError
        If the CSV has no 'States' column.
    """
    return pd.read_csv(source).set_index('States')

In [4]:
# Column of the states table that is modelled. Despite the variable's name,
# this column is what the dataset cell selects as the target `y`; every
# other column becomes a predictor in `X`.
independent = 'Poverty'
# Number of folds for the outer (evaluation) and inner (ElasticNetCV tuning)
# KFold splitters.
outer_k, inner_k = 10, 10

In [5]:
# create dataset
# Load the table once and derive both frames from it: every read_data()
# call reads the CSV from its URL again, so the previous three calls meant
# three downloads.
states = read_data()
# Target: kept as a one-column DataFrame selected by the `independent` name.
y = states[[independent]]
# Predictors: every remaining column, in the order Index.difference returns
# them (sorted where possible, per pandas' default).
X = states[states.columns.difference([independent]).values]


In [6]:
#elastic
# Nested cross-validation of a ZCA-whitened ElasticNetCV pipeline: the inner
# splitter is used by ElasticNetCV to tune itself on each outer training
# fold, and the outer splitter provides the held-out scores.

# inner splitter, handed to ElasticNetCV
cv_inner = KFold(n_splits=inner_k, shuffle=True, random_state=random_st)

# pipeline steps; the first step keeps its historical name 'standardize'
# although it applies ZCA whitening.
# fit_intercept is documented as a bool: the previous integer 1 is rejected
# by scikit-learn releases that validate estimator parameters.
estimators_ENetCV = [
    ('standardize', ZCA()),
    ('ElasticNetCV', ElasticNetCV(cv=cv_inner, random_state=random_st, fit_intercept=True)),
]
search_en = Pipeline(estimators_ENetCV)

# outer splitter, used for the reported scores
cv_outer = KFold(n_splits=outer_k, shuffle=True, random_state=random_st)

# execute the nested cross-validation (one score per outer fold)
scores = cross_val_score(search_en, X, y, scoring='neg_mean_squared_error', cv=cv_outer, n_jobs=-1)

# report performance
# NOTE(review): the label says "Accuracy", but the numbers are the mean and
# standard deviation of the negated mean squared error requested above
# (closer to zero is better). The text is left unchanged so existing
# recorded output still matches.
elastic_score = 'Accuracy: %.3f (%.3f)' % (mean(scores), std(scores))
print(elastic_score)

Accuracy: -2.479 (2.447)


In [7]:
#linear
# Same nested cross-validation as the elastic-net cell, but with the
# penalty switched off (a single alpha of 0 and l1_ratio of 0), which makes
# the model an unpenalised linear fit.
# NOTE(review): scikit-learn advises against alpha=0 in its coordinate
# descent models and recommends LinearRegression for this case; confirm
# before swapping the estimator, as the fitted numbers may shift slightly.

# inner splitter, handed to ElasticNetCV (only one alpha to "choose" from)
cv_inner = KFold(n_splits=inner_k, shuffle=True, random_state=random_st)

# pipeline steps: ZCA whitening (step named 'standardize') then the model
estimators_linear = [
    ('standardize', ZCA()),
    ('Linear', ElasticNetCV(cv=cv_inner, alphas=[0], l1_ratio=0, fit_intercept=True)),
]
search = Pipeline(estimators_linear)

# outer splitter, used for the reported scores
cv_outer = KFold(n_splits=outer_k, shuffle=True, random_state=random_st)

# execute the nested cross-validation (one score per outer fold)
scores = cross_val_score(search, X, y, scoring='neg_mean_squared_error', cv=cv_outer, n_jobs=-1)

# report performance: mean and standard deviation of the negated MSE scores
# (the "Accuracy" label is historical)
Linear_score = 'Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std())
print(Linear_score)

Accuracy: -2.472 (2.399)
