# Polynomial Regression
Outline:
1. Data Import
2. Modeling
3. RS, TPOT
4. BO, GS
5. Postmodeling & others

# Data & Library Import

In [1]:
# Import libraries
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time

In [2]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Modeling related libraries
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [3]:
# Import Training dataset  
X_train_dev = pd.read_csv('Dataset for metamodeling/X_train_dev.csv', header=0).copy()
X_val = pd.read_csv('Dataset for metamodeling/X_val.csv', header=0).copy()
y_train_dev = pd.read_csv('Dataset for metamodeling/y_train_dev.csv', header=0).copy()
y_val = pd.read_csv('Dataset for metamodeling/y_val.csv', header=0).copy()

In [4]:
y_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8320 entries, 0 to 8319
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   $volume_{oo.15min}$   8320 non-null   float64
 1   $volume_{oo.60min}$   8320 non-null   float64
 2   $volume_{oo.total}$   8320 non-null   float64
 3   $height_{damage.ih}$  8320 non-null   float64
 4   $length_{damage.ih}$  8320 non-null   float64
 5   $height_{damage.oh}$  8320 non-null   float64
 6   $length_{damage.oh}$  8320 non-null   float64
dtypes: float64(7)
memory usage: 455.1 KB


In [5]:
# Initial modeling
pr_1 = make_pipeline(PolynomialFeatures(degree=1),
                      LinearRegression())
pr_1.fit(X_train_dev,y_train_dev)
y_pred = pr_1.predict(X_val)

# Metrics
from sklearn.metrics import r2_score, mean_squared_error

r2_pr = r2_score(y_val, y_pred)
rmse_pr = mean_squared_error(y_val, y_pred, squared=True)

print('r2_pr, rmse_pr', r2_pr, rmse_pr)


r2_pr, rmse_pr 0.35661629962902125 0.030805240515907637


# Random Search

In [6]:
print(PolynomialFeatures().get_params())
print(LinearRegression().get_params())

{'degree': 2, 'include_bias': True, 'interaction_only': False, 'order': 'C'}
{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}


In [7]:
# Develop RandomSearchCV for the DNN
# Set up the sample space for Random Search
parameter_grid = { 'degree' : list(range(1, 10, 1))}#,
#'hidden_layer_sizes' : list(range(1, 60, 5)) }      # range(start, stop, step) #(start=1,stop=1000, num=100))
# Define how many samples
number_models = 10

In [None]:
# ------------------
# Random try is enough (since search space is small) 
# we dont need Random search from sklearn!
degree=2
polyreg = make_pipeline(PolynomialFeatures(degree, order='F'),
                      LinearRegression(fit_intercept=True))
polyreg.fit(X_train_dev,y_train_dev)
y_pred = polyreg.predict(X_val)
r2_pr = r2_score(y_val, y_pred)
print('r2_pr', r2_pr)
rmse_pr = mean_squared_error(y_val, y_pred, squared=True)

# Grid Search

In [11]:
# 04.05.2021 Tanmoy at 1pm
model = MultiOutputRegressor(make_pipeline(PolynomialFeatures(),
                      LinearRegression()))
hyperparameters = dict(estimator__polynomialfeatures__degree=[3,4], # 5 will take enormous amount of time
                    estimator__polynomialfeatures__order =['F', 'C'],
                       estimator__polynomialfeatures__interaction_only = [True, False],
                       estimator__polynomialfeatures__include_bias = [True, False],
                      estimator__linearregression__fit_intercept = [True, False])
#number_models = 10
grid_search_model_pr = GridSearchCV(model, hyperparameters, #n_iter=number_models,  #random_state=0, 
                                                  scoring=['r2', 'neg_mean_squared_error', 'neg_median_absolute_error'],
                                                   n_jobs=1, refit=False, cv=2, verbose=5, #n_jobs=1 to get Verbose output (runtime of each model) for each iteration
                                                   pre_dispatch='2*n_jobs', error_score='raise', return_train_score=True)
grid_search_model_pr = grid_search_model_pr.fit(X_train_dev, y_train_dev)
# Calculate scores on validation training set +++ 
degree = [item['estimator__polynomialfeatures__degree'] for item in grid_search_model_pr.cv_results_['params']]
order = [item['estimator__polynomialfeatures__order'] for item in grid_search_model_pr.cv_results_['params']]
interaction_only = [item['estimator__polynomialfeatures__interaction_only'] for item in grid_search_model_pr.cv_results_['params']]
include_bias = [item['estimator__polynomialfeatures__include_bias'] for item in grid_search_model_pr.cv_results_['params']]
intercept = [item['estimator__linearregression__fit_intercept'] for item in grid_search_model_pr.cv_results_['params']]

r2_pr = list(grid_search_model_pr.cv_results_['mean_test_r2'])
neg_mean_squared_error_pr = list(grid_search_model_pr.cv_results_['mean_test_neg_mean_squared_error'])
neg_median_absolute_error_pr = list(grid_search_model_pr.cv_results_['mean_test_neg_median_absolute_error'])
df_grids_pr_perfm = pd.DataFrame([degree, order, interaction_only, include_bias, intercept,
                                    r2_pr, neg_mean_squared_error_pr, neg_median_absolute_error_pr]).T
df_grids_pr_perfm.columns = ['degree','order', 'interaction_only', 'include_bias','intercept'
                               'r2_pr', 'neg_mean_squared_error_pr','neg_median_absolute_error_pr']
# df_rands_dnn_perfm.groupby(['Learning Rate']).mean()
display(df_grids_pr_perfm)

Fitting 2 folds for each of 32 candidates, totalling 64 fits
[CV] estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=F 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=F, neg_mean_squared_error=(train=-0.021, test=-0.022), neg_median_absolute_error=(train=-0.073, test=-0.074), r2=(train=0.547, test=0.533), total=   2.3s
[CV] estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=F 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s remaining:    0.0s


[CV]  estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=F, neg_mean_squared_error=(train=-0.021, test=-0.021), neg_median_absolute_error=(train=-0.075, test=-0.075), r2=(train=0.539, test=0.541), total=   2.4s
[CV] estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=C 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.5s remaining:    0.0s


[CV]  estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=C, neg_mean_squared_error=(train=-0.021, test=-0.022), neg_median_absolute_error=(train=-0.073, test=-0.074), r2=(train=0.547, test=0.533), total=   3.5s
[CV] estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=C 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    9.9s remaining:    0.0s


[CV]  estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=C, neg_mean_squared_error=(train=-0.021, test=-0.021), neg_median_absolute_error=(train=-0.075, test=-0.075), r2=(train=0.539, test=0.541), total=   3.4s
[CV] estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=False, estimator__polynomialfeatures__order=F 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   14.1s remaining:    0.0s


[CV]  estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=False, estimator__polynomialfeatures__order=F, neg_mean_squared_error=(train=-0.010, test=-0.010), neg_median_absolute_error=(train=-0.050, test=-0.051), r2=(train=0.788, test=0.781), total=   4.0s
[CV] estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=False, estimator__polynomialfeatures__order=F 
[CV]  estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=False, estimator__polynomialfeatures__order=F, neg_mean_squared_error=(train=-0.010, test=-0.010), neg_median_absolute_error=(train=-0.051, test=-0.051), r2=(train=0.785, tes

[CV]  estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=4, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=F, neg_mean_squared_error=(train=-0.020, test=-0.020), neg_median_absolute_error=(train=-0.066, test=-0.066), r2=(train=0.571, test=0.569), total=   6.5s
[CV] estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=4, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=C 
[CV]  estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=4, estimator__polynomialfeatures__include_bias=True, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=C, neg_mean_squared_error=(train=-0.019, test=-0.020), neg_median_absolute_error=(train=-0.064, test=-0.066), r2=(train=0.581, test=0

[CV]  estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=4, estimator__polynomialfeatures__include_bias=False, estimator__polynomialfeatures__interaction_only=False, estimator__polynomialfeatures__order=C, neg_mean_squared_error=(train=-0.005, test=-0.005), neg_median_absolute_error=(train=-0.032, test=-0.034), r2=(train=0.889, test=0.880), total=  24.4s
[CV] estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=4, estimator__polynomialfeatures__include_bias=False, estimator__polynomialfeatures__interaction_only=False, estimator__polynomialfeatures__order=C 
[CV]  estimator__linearregression__fit_intercept=True, estimator__polynomialfeatures__degree=4, estimator__polynomialfeatures__include_bias=False, estimator__polynomialfeatures__interaction_only=False, estimator__polynomialfeatures__order=C, neg_mean_squared_error=(train=-0.005, test=-0.005), neg_median_absolute_error=(train=-0.032, test=-0.033), r2=(train=0.887, 

[CV]  estimator__linearregression__fit_intercept=False, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=False, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=C, neg_mean_squared_error=(train=-0.021, test=-0.021), neg_median_absolute_error=(train=-0.075, test=-0.075), r2=(train=0.539, test=0.541), total=   3.4s
[CV] estimator__linearregression__fit_intercept=False, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=False, estimator__polynomialfeatures__interaction_only=False, estimator__polynomialfeatures__order=F 
[CV]  estimator__linearregression__fit_intercept=False, estimator__polynomialfeatures__degree=3, estimator__polynomialfeatures__include_bias=False, estimator__polynomialfeatures__interaction_only=False, estimator__polynomialfeatures__order=F, neg_mean_squared_error=(train=-0.010, test=-0.010), neg_median_absolute_error=(train=-0.050, test=-0.051), r2=(train=0.788

[CV]  estimator__linearregression__fit_intercept=False, estimator__polynomialfeatures__degree=4, estimator__polynomialfeatures__include_bias=False, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=F, neg_mean_squared_error=(train=-0.019, test=-0.020), neg_median_absolute_error=(train=-0.064, test=-0.066), r2=(train=0.581, test=0.559), total=   6.6s
[CV] estimator__linearregression__fit_intercept=False, estimator__polynomialfeatures__degree=4, estimator__polynomialfeatures__include_bias=False, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=F 
[CV]  estimator__linearregression__fit_intercept=False, estimator__polynomialfeatures__degree=4, estimator__polynomialfeatures__include_bias=False, estimator__polynomialfeatures__interaction_only=True, estimator__polynomialfeatures__order=F, neg_mean_squared_error=(train=-0.020, test=-0.020), neg_median_absolute_error=(train=-0.066, test=-0.066), r2=(train=0.571, 

[Parallel(n_jobs=1)]: Done  64 out of  64 | elapsed: 11.7min finished


ValueError: Length mismatch: Expected axis has 8 elements, new values have 7 elements

In [None]:
df_rands_pr_perfm.to_csv('df_rands_pr_perfm.csv')
#model.get_params().keys()
#PolynomialFeatures().get_params