In [31]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn.model_selection import ShuffleSplit, GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import MinMaxScaler

# Decription

This notebook implements the models used in the testability research paper. It also uses the best hyperparameters which are highlighted in the paper and then makes the voting regressor and calculates accuracy on the our dataset. 

# Dataset and Cleaning

In [32]:
data = pd.read_csv('./dataset/newDataset/DS_K45.csv')
#data = pd.read_csv('./dataset/newDataset/DS_LassoCV_k45.csv')
#data = pd.read_csv('./dataset/newDataset/DS_PCA_k45.csv')

data.drop('Class', axis=1, inplace=True)
df = data.copy()
label = df["Testability"]
df.drop('Testability', axis=1, inplace=True)
scaler = MinMaxScaler()
scaler.fit(df)
df = scaler.transform(df)
df = pd.DataFrame(df)

(X_train, X_test, y_train, y_test) = train_test_split(df, label, random_state=100)

# Models

## Histogram based Gradient Boost Regressor

In [33]:
from sklearn.ensemble import HistGradientBoostingRegressor

clf = HistGradientBoostingRegressor(loss='squared_error', max_depth=18, min_samples_leaf=15, max_iter=500).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('HGBR MAE:',mean_absolute_error(y_test, y_pred))
print('HGBR RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('HGBR MedAE:',median_absolute_error(y_test, y_pred))

HGBR MAE: 0.13963373443883495
HGBR RMSE: 0.18888272650756716
HGBR MedAE: 0.10352956874014307


DS_K45

HGBR MAE: 0.13963373443883495
HGBR RMSE: 0.18888272650756716
HGBR MedAE: 0.10352956874014307

LassoCV_k45

HGBR MAE: 0.12130354153498595
HGBR RMSE: 0.16828666697198777
HGBR MedAE: 0.08603795796138775

PCA_k45

HGBR MAE: 0.12966535329002396
HGBR RMSE: 0.175882649044401
HGBR MedAE: 0.09808484539447493

## Linear Regressor

In [34]:
from sklearn import linear_model

clf = linear_model.SGDRegressor(loss='huber', penalty='l2', learning_rate='invscaling', max_iter=50).fit(X_train, y_train) 
y_pred = clf.predict(X_test)
print('Linear MAE:',mean_absolute_error(y_test, y_pred))
print('Linear RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('Linear MedAE:',median_absolute_error(y_test, y_pred))

Linear MAE: 0.20452989625037835
Linear RMSE: 0.24429582037810824
Linear MedAE: 0.1885627442270964


DS_K45

Linear MAE: 0.20452989625037835
Linear RMSE: 0.24429582037810824
Linear MedAE: 0.1885627442270964

LassoCV_k45

Linear MAE: 0.2008948959075644
Linear RMSE: 0.24033661405296905
Linear MedAE: 0.18450769609373258

PCA_k45

Linear MAE: 0.2413722906532826
Linear RMSE: 0.28372529407126906
Linear MedAE: 0.23342428807519486

## SVMR

In [35]:
from sklearn.svm import NuSVR

clf = NuSVR(kernel='rbf', nu=0.5).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('SVMR MAE:',mean_absolute_error(y_test, y_pred))
print('SVMR RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('SVMR MedAE:',median_absolute_error(y_test, y_pred))

SVMR MAE: 0.1534953807364677
SVMR RMSE: 0.2007402277073376
SVMR MedAE: 0.1121772649587372


DS_K45

SVMR MAE: 0.1534953807364677
SVMR RMSE: 0.2007402277073376
SVMR MedAE: 0.1121772649587372

LassoCV_k45

SVMR MAE: 0.135297025897594
SVMR RMSE: 0.18216201253787365
SVMR MedAE: 0.09654024731801442

PCA_k45

SVMR MAE: 0.16507555828034395
SVMR RMSE: 0.21274479307583685
SVMR MedAE: 0.12479211470915025

## DTR

In [36]:
regressor = DecisionTreeRegressor(criterion='squared_error', max_depth=8, min_samples_leaf=28).fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print('DTR MAE:',mean_absolute_error(y_test, y_pred))
print('DTR RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('DTR MedAE:',median_absolute_error(y_test, y_pred))

DTR MAE: 0.15391301020296494
DTR RMSE: 0.20677552463999554
DTR MedAE: 0.11702952617671414


DS_K45

SVMR MAE: 0.1534953807364677
SVMR RMSE: 0.2007402277073376
SVMR MedAE: 0.1121772649587372

LassoCV_k45

DTR MAE: 0.15113857889096238
DTR RMSE: 0.20363776643813164
DTR MedAE: 0.11185296384592996

PCA_k45

DTR MAE: 0.15666244433667845
DTR RMSE: 0.20628217130147822
DTR MedAE: 0.12630915967590484

## RFR

In [37]:
regressor = RandomForestRegressor(n_estimators=150, max_depth=28, min_samples_leaf=2, criterion='squared_error').fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print('RFR MAE:',mean_absolute_error(y_test, y_pred))
print('RFR RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('RFR MedAE:',median_absolute_error(y_test, y_pred))

RFR MAE: 0.1358764691078604
RFR RMSE: 0.18705466957352396
RFR MedAE: 0.09974843616059165


DS_K45

RFR MAE: 0.1358764691078604
RFR RMSE: 0.18705466957352396
RFR MedAE: 0.09974843616059165

LassoCV_k45

RFR MAE: 0.12201637670321323
RFR RMSE: 0.17099420255830888
RFR MedAE: 0.0854117287814817

PCA_k45

RFR MAE: 0.12930285240898698
RFR RMSE: 0.17803374564584556
RFR MedAE: 0.0966047256829525

## MLP

In [38]:
regressor = MLPRegressor(random_state=7, hidden_layer_sizes=(512, 256, 100), activation='tanh', learning_rate='constant').fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print('MLPR MAE:',mean_absolute_error(y_test, y_pred))
print('MLPR RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('MLPR MedAE:',median_absolute_error(y_test, y_pred))

MLPR MAE: 0.1508986235877407
MLPR RMSE: 0.19778505608523012
MLPR MedAE: 0.1176074540863456


DS_K45

MLPR MAE: 0.1508986235877407
MLPR RMSE: 0.19778505608523012
MLPR MedAE: 0.1176074540863456

LassoCV_k45

MLPR MAE: 0.13285476554404604
MLPR RMSE: 0.18032353788086458
MLPR MedAE: 0.10116264676686204

PCA_k45

MLPR MAE: 0.14927644187483038
MLPR RMSE: 0.19754082418227376
MLPR MedAE: 0.11346236151811148

## Votting Regression of RFR, HGBR and MLPR

In [39]:
regressor1 = MLPRegressor(random_state=7, hidden_layer_sizes=(512, 256, 100), activation='tanh', learning_rate='constant')
regressor2 = RandomForestRegressor(n_estimators=150, max_depth=28, min_samples_leaf=2, criterion='squared_error')
regressor3 = HistGradientBoostingRegressor(loss='squared_error', max_depth=18, min_samples_leaf=15, max_iter=500)
er = VotingRegressor([('MLP', regressor1), ('RFR', regressor2), ('HGBR', regressor3)])

er.fit(X_train, y_train)
y_pred = er.predict(X_test)
print('Voting MAE:',mean_absolute_error(y_test, y_pred))
print('Voting RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('Voting MedAE:',median_absolute_error(y_test, y_pred))

Voting MAE: 0.13954868104353518
Voting RMSE: 0.1869451935293196
Voting MedAE: 0.10501759544540365


DS_K45

Voting MAE: 0.13954868104353518
Voting RMSE: 0.1869451935293196
Voting MedAE: 0.10501759544540365

LassoCV_k45

Voting MAE: 0.12113508673808417
Voting RMSE: 0.16694807733516653
Voting MedAE: 0.08789728590847064

PCA_k45

Voting MAE: 0.12847763977498114
Voting RMSE: 0.1740451280608631
Voting MedAE: 0.09429364861223682