In [5]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn.model_selection import ShuffleSplit, GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import MinMaxScaler

# Decription

This notebook implements the models used in the testability research paper. It also uses the best hyperparameters which are highlighted in the paper and then makes the voting regressor and calculates accuracy on the our dataset. 

# Dataset and Cleaning

In [6]:
data = pd.read_csv("./dataset/researchDataset/DS07012.csv")

data.drop('Class', axis=1, inplace=True)
corr_matrix = data.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
data.drop(to_drop, axis=1, inplace=True)
data.columns

df = data.copy()
label = df["Testability"]
df.drop('Testability', axis=1, inplace=True)
scaler = MinMaxScaler()
scaler.fit(df)
df = scaler.transform(df)
df = pd.DataFrame(df)

(X_train, X_test, y_train, y_test) = train_test_split(df, label, random_state=100)

# Models

## Histogram based Gradient Boost Regressor

In [7]:
from sklearn.ensemble import HistGradientBoostingRegressor

clf = HistGradientBoostingRegressor(loss='squared_error', max_depth=18, min_samples_leaf=15, max_iter=500).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('HGBR MAE:',mean_absolute_error(y_test, y_pred))
print('HGBR RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('HGBR MedAE:',median_absolute_error(y_test, y_pred))

HGBR MAE: 0.11803233456348623
HGBR RMSE: 0.16526973171232745
HGBR MedAE: 0.08379819776077951


## Linear Regressor

In [8]:
from sklearn import linear_model

clf = linear_model.SGDRegressor(loss='huber', penalty='l2', learning_rate='invscaling', max_iter=50).fit(X_train, y_train) 
y_pred = clf.predict(X_test)
print('Linear MAE:',mean_absolute_error(y_test, y_pred))
print('Linear RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('Linear MedAE:',median_absolute_error(y_test, y_pred))

Linear MAE: 0.18599290180846545
Linear RMSE: 0.22963908053226662
Linear MedAE: 0.15822532071600456


## SVMR

In [9]:
from sklearn.svm import NuSVR

clf = NuSVR(kernel='rbf', nu=0.5).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('SVMR MAE:',mean_absolute_error(y_test, y_pred))
print('SVMR RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('SVMR MedAE:',median_absolute_error(y_test, y_pred))

SVMR MAE: 0.13224962304813254
SVMR RMSE: 0.18007823900341396
SVMR MedAE: 0.09089355855852835


## DTR

In [11]:
regressor = DecisionTreeRegressor(criterion='squared_error', max_depth=8, min_samples_leaf=28).fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print('DTR MAE:',mean_absolute_error(y_test, y_pred))
print('DTR RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('DTR MedAE:',median_absolute_error(y_test, y_pred))

DTR MAE: 0.14428506530469518
DTR RMSE: 0.1961454402014521
DTR MedAE: 0.10713561013983577


## RFR

In [12]:
regressor = RandomForestRegressor(n_estimators=150, max_depth=28, min_samples_leaf=2, criterion='squared_error').fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print('RFR MAE:',mean_absolute_error(y_test, y_pred))
print('RFR RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('RFR MedAE:',median_absolute_error(y_test, y_pred))

RFR MAE: 0.11975334299350378
RFR RMSE: 0.16878253254772543
RFR MedAE: 0.08354096548803652


## MLP

In [15]:
regressor = MLPRegressor(random_state=7, hidden_layer_sizes=(512, 256, 100), activation='tanh', learning_rate='constant').fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print('MLPR MAE:',mean_absolute_error(y_test, y_pred))
print('MLPR RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('MLPR MedAE:',median_absolute_error(y_test, y_pred))

MLPR MAE: 0.14337790864984926
MLPR RMSE: 0.19807072853096983
MLPR MedAE: 0.1024361991005591




## Votting Regression of RFR, HGBR and MLPR

In [16]:
regressor1 = MLPRegressor(random_state=7, hidden_layer_sizes=(512, 256, 100), activation='tanh', learning_rate='constant')
regressor2 = RandomForestRegressor(n_estimators=150, max_depth=28, min_samples_leaf=2, criterion='squared_error')
regressor3 = HistGradientBoostingRegressor(loss='squared_error', max_depth=18, min_samples_leaf=15, max_iter=500)
er = VotingRegressor([('MLP', regressor1), ('RFR', regressor2), ('HGBR', regressor3)])

er.fit(X_train, y_train)
y_pred = er.predict(X_test)
print('Voting MAE:',mean_absolute_error(y_test, y_pred))
print('Voting RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('Voting MedAE:',median_absolute_error(y_test, y_pred))



Voting MAE: 0.11785408369593284
Voting RMSE: 0.16483762518122372
Voting MedAE: 0.08277470633180838
