In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn.model_selection import ShuffleSplit, GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import MinMaxScaler

# Decription

This notebook implements the new regression models not used in the research and aims at forming an ensemle technique using these models. 

# Dataset and Cleaning

In [2]:
data = pd.read_csv("./dataset/researchDataset/DS07012.csv")

data.drop('Class', axis=1, inplace=True)
corr_matrix = data.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
data.drop(to_drop, axis=1, inplace=True)
data.columns

df = data.copy()
label = df["Testability"]
df.drop('Testability', axis=1, inplace=True)
scaler = MinMaxScaler()
scaler.fit(df)
df = scaler.transform(df)
df = pd.DataFrame(df)

(X_train, X_test, y_train, y_test) = train_test_split(df, label, random_state=100)

# Models

## Lasso Regressor

In [7]:
from sklearn import linear_model

max_iter = [10, 50, 100, 1000]
warm_start = [True, False]
fit_intercept = [True, False]
selection = ['cyclic', 'random']
grid = dict(max_iter=max_iter, warm_start=warm_start, fit_intercept=fit_intercept, selection=selection)
lassoModel = linear_model.Lasso()
grid = GridSearchCV(estimator=lassoModel, param_grid=grid, n_jobs=-5)
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)
print('Best Params:', grid.best_params_)
print('Lasso MAE:',mean_absolute_error(y_test, y_pred))
print('Lasso RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('Lasso MedAE:',median_absolute_error(y_test, y_pred))

Best Params: {'fit_intercept': True, 'max_iter': 10, 'selection': 'cyclic', 'warm_start': True}
Lasso MAE: 0.24977175879004274
Lasso RMSE: 0.29398210896590665
Lasso MedAE: 0.2406725025740746


## Ridge Regression

In [8]:
from sklearn import linear_model

max_iter = [10, 50, 100, 1000]
fit_intercept = [True, False]
solver = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']

grid = dict(max_iter=max_iter, fit_intercept=fit_intercept, solver=solver)
ridgeModel = linear_model.Ridge()
grid = GridSearchCV(estimator=ridgeModel, param_grid=grid, n_jobs=-5)
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)
print('Best Params:', grid.best_params_)
print('Ridge MAE:',mean_absolute_error(y_test, y_pred))
print('Ridge RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('Ridge MedAE:',median_absolute_error(y_test, y_pred))

40 fits failed out of a total of 320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python311\Lib\site-packages\sklearn\linear_model\_ridge.py", line 1134, in fit
    return super().fit(X, y, sample_weight=sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\linear_model\_ridge.py", line 825, in fit
    raise ValueError(
ValueError: 'lbfgs' solver can be used only when positive=True. Please use another solver.

 0.50787096       

Best Params: {'fit_intercept': True, 'max_iter': 10, 'solver': 'svd'}
Ridge MAE: 0.15848379660105183
Ridge RMSE: 0.1999290465203572
Ridge MedAE: 0.12934896634613519


## Baysian Ridge Regression

In [15]:
n_iter = [100, 500, 1000, 10000]
fit_intercept = [True, False]
copy_X = [True]
compute_score = [True, False]

grid = dict(n_iter=n_iter, fit_intercept=fit_intercept, copy_X=copy_X)
baysianRegModel = linear_model.BayesianRidge()
grid = GridSearchCV(estimator=baysianRegModel, param_grid=grid, n_jobs=-5)
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)
print('Best Params:', grid.best_params_)
print('BaysianRegModel MAE:',mean_absolute_error(y_test, y_pred))
print('BaysianRegModel RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('BaysianRegModel MedAE:',median_absolute_error(y_test, y_pred))

Best Params: {'copy_X': True, 'fit_intercept': True, 'n_iter': 100}
BaysianRegModel MAE: 0.1565341718898035
BaysianRegModel RMSE: 0.19842474348332387
BaysianRegModel MedAE: 0.12739062824149672


## SVM Regression

In [34]:
from sklearn.svm import SVR

max_iter = [100, 500, 1000]
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
degree = [1, 2, 3, 4, 5, 6]
gamma = ['scale', 'auto']
epsilon = [0.1, 0.2, 0.3, 0.4, 0.5, 1.0]
C = [1.0, 0.5, 0.1, 0.01, 0.001]

grid = dict(max_iter=max_iter, kernel=kernel, degree=degree, gamma=gamma, epsilon=epsilon, C=C)
SVMRegModel = SVR()
grid = GridSearchCV(estimator=SVMRegModel, param_grid=grid, n_jobs=-5)
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)
print('Best Params:', grid.best_params_)
print('SVMRegModel MAE:',mean_absolute_error(y_test, y_pred))
print('SVMRegModel RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('SVMRegModel MedAE:',median_absolute_error(y_test, y_pred))



Best Params: {'C': 0.5, 'degree': 1, 'epsilon': 0.2, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 1000}
SVMRegModel MAE: 0.18099698518198562
SVMRegModel RMSE: 0.21697804430130335
SVMRegModel MedAE: 0.1680055513612666


## Adaboost Regressor Regressor

In [3]:
from sklearn.ensemble import AdaBoostRegressor

# n_estimators = [10, 50, 500]
# learning_rate = [0.001, 0.01, 0.1, 1.0]
# loss = ['linear', 'square', 'exponential']
estimator = [RandomForestRegressor(n_estimators=150, max_depth=28, min_samples_leaf=2, criterion='squared_error'), HistGradientBoostingRegressor(loss='squared_error', max_depth=18, min_samples_leaf=15, max_iter=500)]

grid = dict(base_estimator=estimator)
AdaRegModel = AdaBoostRegressor()
grid = GridSearchCV(estimator=AdaRegModel, param_grid=grid, n_jobs=2)
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)
print('Best Params:', grid.best_params_)
print('AdaRegModel MAE:',mean_absolute_error(y_test, y_pred))
print('AdaRegModel RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('AdaRegModel MedAE:',median_absolute_error(y_test, y_pred))
## Votting Regression of RFR, HGBR and MLPR