# Modeling: try PLS Regression

## Import packages

In [7]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression, PLSSVD

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn import preprocessing, svm
from sklearn.metrics import r2_score, accuracy_score, mean_absolute_error

#Use to ignore convergence warnings
import warnings
from sklearn.exceptions import DataConversionWarning
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import FitFailedWarning
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import __version__ as sklearn_version
import datetime

import joblib

# Silence these scikit-learn warning categories for the rest of the session.
for warning_cls in (DataConversionWarning, ConvergenceWarning, FitFailedWarning):
    warnings.filterwarnings(action='ignore', category=warning_cls)


# Optional display tweaks, left disabled:
# pd.set_option('display.max_columns', None)
# pd.reset_option('max_rows')
# np.set_printoptions(threshold=sys.maxsize)

# Plot styling: dark matplotlib theme with thin, semi-transparent grid lines,
# followed by seaborn's 'ticks' style at 'talk' scale.
# NOTE(review): sns.set() runs last and applies its own rcParams, so it may
# override parts of the matplotlib settings above it - confirm the intended order.
plt.style.use('dark_background')
plt.rcParams["grid.linewidth"] = 0.5
plt.rcParams["grid.alpha"] = 0.5
sns.set(style='ticks', context='talk')

## Load X and y data sets

In [8]:
# Read the full feature/target tables and the previously saved train/test splits.

# Full tables (the "alt" variant).
X, y = (
    pd.read_csv('../../data/train_test/X_alt'),
    pd.read_csv('../../data/train_test/y_alt'),
)

# Saved splits (presumably 74% train / 26% test, judging by the file names).
# NOTE(review): the next cell reassigns all four of these names via
# train_test_split, so the frames loaded here are not what the models see.
X_train, X_test = (
    pd.read_csv('../../data/train_test/X_train_74_26'),
    pd.read_csv('../../data/train_test/X_test_74_26'),
)

# NOTE(review): the y_test file name uses '+' where its siblings use '_' -
# confirm this matches the file on disk.
y_train, y_test = (
    pd.read_csv('../../data/train_test/y_train_alt_74_26'),
    pd.read_csv('../../data/train_test/y_test_alt+74_26'),
)

In [9]:
# Re-split the full tables 74/26 with a fixed seed.
# This reassigns the four split frames loaded from CSV in the previous cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.26,
    random_state=42,
)

In [10]:
# Base PLS regression estimator, constructed with its default hyperparameters.
# It is passed to GridSearchCV in the cells below, which vary those settings.
# (A Pipeline wrapper was sketched here earlier but is not used.)

pls = PLSRegression()

In [11]:
# Show the estimator's hyperparameters and their current (default) values;
# these are the names that can be used as keys in a grid-search param_grid.
pls.get_params()


{'copy': True, 'max_iter': 500, 'n_components': 2, 'scale': True, 'tol': 1e-06}

In [12]:
# First search space: 1 through 59 components, with and without scaling.
# max_iter is not searched here and keeps the estimator's default.
param_grid = dict(
    n_components=np.arange(1, 60),
    scale=[True, False],
)

In [13]:
# Exhaustive 5-fold cross-validated search over param_grid, fitted on the
# training split. The fit call stays last so the cell displays the fitted search.
pls_grid = GridSearchCV(estimator=pls, param_grid=param_grid, cv=5)
pls_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=PLSRegression(),
             param_grid={'n_components': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59]),
                         'scale': [True, False]})

In [15]:
# Winning parameter combination and its mean cross-validated score.
(pls_grid.best_params_, pls_grid.best_score_)

({'n_components': 1, 'scale': True}, 0.1282974683034633)

From this baseline estimate of the cross-validated score on the training set, intuition says that PLS will do about as well as PCR, which is to say not very well.


In [17]:
# Second search space: the same component counts, scaling fixed on, plus a sweep
# over the iteration cap.
#
# The candidate list is named `max_iter_values` rather than `range` so that the
# built-in `range` is not shadowed for the rest of the kernel session.
#
# Candidates are 20 log-spaced values from 2**1 to 2**10, truncated to ints.
# NOTE: truncation makes the first two candidates both 2, so that value is
# evaluated twice by the grid search.
max_iter_values = [int(n) for n in np.logspace(start=1, stop=10, num=20, base=2)]
param_grid = {
    'n_components':np.arange(1, 60),
    'scale':[True],
    'max_iter':max_iter_values
}
print(max_iter_values)

[2, 2, 3, 5, 7, 10, 14, 19, 27, 38, 53, 74, 102, 142, 198, 275, 382, 531, 737, 1024]


In [19]:
# Repeat the 5-fold grid search with the current param_grid (the one that also
# sweeps max_iter), fitted on the training split.
pls_grid2 = GridSearchCV(estimator=pls, param_grid=param_grid, cv=5)
pls_grid2.fit(X_train, y_train)

In [None]:
# Winning parameter combination and its mean cross-validated score for the second search.
(pls_grid2.best_params_, pls_grid2.best_score_)