In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn.model_selection import ShuffleSplit, GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Decription

This notebook implements the new regression models not used in the research and aims at forming an ensemle technique using these models. 

# Dataset and Cleaning

In [3]:
data = pd.read_csv("DS07012.csv")

data.drop('Class', axis=1, inplace=True)
corr_matrix = data.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
data.drop(to_drop, axis=1, inplace=True)
data.columns

df = data.copy()
label = df["Testability"]
df.drop('Testability', axis=1, inplace=True)

scaler = StandardScaler()
scaler.fit_transform(df)

df = scaler.transform(df)
df = pd.DataFrame(df)

(X_train, X_test, y_train, y_test) = train_test_split(df, label, random_state=100)

# Models

## Lasso Regressor

In [5]:
from sklearn import linear_model

max_iter = [10, 50, 100, 1000]
warm_start = [True, False]
fit_intercept = [True, False]
selection = ['cyclic', 'random']
grid = dict(max_iter=max_iter, warm_start=warm_start, fit_intercept=fit_intercept, selection=selection)
lassoModel = linear_model.Lasso()
grid = GridSearchCV(estimator=lassoModel, param_grid=grid, n_jobs=-5)
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)
print('Best Params:', grid.best_params_)
print('Lasso MAE:',mean_absolute_error(y_test, y_pred))
print('Lasso RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('Lasso MedAE:',median_absolute_error(y_test, y_pred))

Best Params: {'fit_intercept': True, 'max_iter': 10, 'selection': 'cyclic', 'warm_start': True}
Lasso MAE: 0.24977175879004274
Lasso RMSE: 0.29398210896590665
Lasso MedAE: 0.2406725025740746


## Ridge Regression

In [14]:
from sklearn import linear_model

max_iter = [10, 50, 100, 1000]
fit_intercept = [True, False]
solver = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']

grid = dict(max_iter=max_iter, fit_intercept=fit_intercept, solver=solver)
ridgeModel = linear_model.Ridge()
grid = GridSearchCV(estimator=ridgeModel, param_grid=grid, n_jobs=-5)
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)
print('Best Params:', grid.best_params_)
print('Ridge MAE:',mean_absolute_error(y_test, y_pred))
print('Ridge RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('Ridge MedAE:',median_absolute_error(y_test, y_pred))



KeyboardInterrupt: 

## Baysian Ridge Regression

In [23]:
n_iter = [100, 500, 1000, 10000]
fit_intercept = [True, False]
copy_X = [True]
compute_score = [True, False]

grid = dict(n_iter=n_iter, fit_intercept=fit_intercept, copy_X=copy_X)
baysianRegModel = linear_model.BayesianRidge()
grid = GridSearchCV(estimator=baysianRegModel, param_grid=grid, n_jobs=-5)
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)
print('Best Params:', grid.best_params_)
print('BaysianRegModel MAE:',mean_absolute_error(y_test, y_pred))
print('BaysianRegModel RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('BaysianRegModel MedAE:',median_absolute_error(y_test, y_pred))

Best Params: {'copy_X': True, 'fit_intercept': True, 'n_iter': 100}
BaysianRegModel MAE: 0.15614017479213485
BaysianRegModel RMSE: 0.19827154648830422
BaysianRegModel MedAE: 0.12714508970443233


## SVM Regression

In [4]:
from sklearn.svm import SVR

# max_iter = [100, 500, 1000]
# adjust grid search by increasing max_iter and add tol
max_iter = [500, 1000, 2000]
tol = [0.001, 0.01, 0.1]

kernel = ['linear', 'poly', 'rbf', 'sigmoid']
degree = [1, 2, 3, 4, 5, 6]
gamma = ['scale', 'auto']
epsilon = [0.1, 0.2, 0.3, 0.4, 0.5, 1.0]
C = [1.0, 0.5, 0.1, 0.01, 0.001]

# grid = dict(max_iter=max_iter, kernel=kernel, degree=degree, gamma=gamma, epsilon=epsilon, C=C)
# add tol
grid = dict(max_iter=max_iter, kernel=kernel, degree=degree, gamma=gamma, epsilon=epsilon, C=C, tol=tol)

SVMRegModel = SVR()
grid = GridSearchCV(estimator=SVMRegModel, param_grid=grid, n_jobs=-5)
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)
print('Best Params:', grid.best_params_)
print('SVMRegModel MAE:',mean_absolute_error(y_test, y_pred))
print('SVMRegModel RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('SVMRegModel MedAE:',median_absolute_error(y_test, y_pred))



KeyboardInterrupt: 

## Adaboost Regressor Regressor

In [34]:
from sklearn.ensemble import AdaBoostRegressor

# n_estimators = [10, 50, 500]
# learning_rate = [0.001, 0.01, 0.1, 1.0]
# loss = ['linear', 'square', 'exponential']
estimator = [RandomForestRegressor(n_estimators=150, max_depth=28, min_samples_leaf=2, criterion='squared_error'), HistGradientBoostingRegressor(loss='squared_error', max_depth=18, min_samples_leaf=15, max_iter=500)]

grid = dict(base_estimator=estimator)
AdaRegModel = AdaBoostRegressor()
grid = GridSearchCV(estimator=AdaRegModel, param_grid=grid, n_jobs=2)
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)
print('Best Params:', grid.best_params_)
print('AdaRegModel MAE:',mean_absolute_error(y_test, y_pred))
print('AdaRegModel RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('AdaRegModel MedAE:',median_absolute_error(y_test, y_pred))
## Votting Regression of RFR, HGBR and MLPR

exception calling callback for <Future at 0x7f6e56b76d40 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/opt/conda/lib/python3.10/site-packages/joblib/parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "/opt/conda/lib/python3.10/site-packages/joblib/parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/opt/conda/lib/python3.10/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/conda/lib/python3.10/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/opt/conda/lib/python3.10/site-packages/joblib/_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))
  File "/opt/cond

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/joblib/externals/loky/backend/resource_tracker.py", line 281, in main
    del registry[rtype][name]
KeyError: '/loky-349-9zgcvvm2'
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/joblib/externals/loky/backend/resource_tracker.py", line 281, in main
    del registry[rtype][name]
KeyError: '/tmp/joblib_memmapping_folder_349_1f0c2021e23748908c3563979d33cb55_bec3edefb3da4bbea922b2e80ee9017f/349-140111877993472-f9ab8f2043a34278a58f9082de044975.pkl'
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/joblib/externals/loky/backend/resource_tracker.py", line 281, in main
    del registry[rtype][name]
KeyError: '/tmp/joblib_memmapping_folder_349_1f0c2021e23748908c3563979d33cb55_90e1bf2d0f69423e87e7150e31ddad65'
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/joblib/externals/loky/backend/resource_tracker.py", line