In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn.model_selection import ShuffleSplit, GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import MinMaxScaler

# Decription

This notebook implements ensemble machine learning techniques for the models we have found. 

# Dataset and Cleaning

In [3]:
data = pd.read_csv("./dataset/researchDataset/DS07012.csv")

data.drop('Class', axis=1, inplace=True)
corr_matrix = data.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
data.drop(to_drop, axis=1, inplace=True)
data.columns

df = data.copy()
label = df["Testability"]
df.drop('Testability', axis=1, inplace=True)
scaler = MinMaxScaler()
scaler.fit(df)
df = scaler.transform(df)
df = pd.DataFrame(df)

(X_train, X_test, y_train, y_test) = train_test_split(df, label, random_state=100)

# Ensemble Techniques

## Stacking

In [4]:
from sklearn.ensemble import StackingRegressor

regressor1 = MLPRegressor(random_state=7, hidden_layer_sizes=(512, 256, 100), activation='tanh', learning_rate='constant')
regressor2 = RandomForestRegressor(n_estimators=150, max_depth=28, min_samples_leaf=2, criterion='squared_error')
regressor3 = HistGradientBoostingRegressor(loss='squared_error', max_depth=18, min_samples_leaf=15, max_iter=500)

estimators = [
     ('lr', regressor1),
     ('svr', regressor2),
    ('rf', regressor3)
]

reg = StackingRegressor(
     estimators=estimators,
     final_estimator=RandomForestRegressor(n_estimators=10,
                                           random_state=42)
)

reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

print('Stacking MAE:',mean_absolute_error(y_test, y_pred))
print('Stacking RMSE:',mean_squared_error(y_test, y_pred, squared = False))
print('Stacking MedAE:',median_absolute_error(y_test, y_pred))



Stacking MAE: 0.1292664009845169
Stacking RMSE: 0.17903887184041653
Stacking MedAE: 0.09169299911094406
