# Creating a Pipeline of the Best Models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
import optuna

In [2]:
X_resampled = pd.read_csv('X_full.csv')
y_resampled = pd.read_csv('y_full.csv')

In [3]:
# create a function to split the dataset into train, validation and test sets
def data_split(X, y, train_size, val_size, test_size, shuffle = True):
    if shuffle:
        idx = np.arange(len(X))
        np.random.shuffle(idx)
        X = X.iloc[idx].reset_index(drop=True)
        y = y.iloc[idx].reset_index(drop=True)

    # check the portion sizes
    if (train_size + val_size + test_size) != 1:
        portion_size = train_size + val_size + test_size * 100
        # normalize portion sizes
        train_size = train_size/ portion_size
        val_size = val_size/ portion_size
        test_size = test_size/ portion_size

    # split data
    split_data = []
    start = 0
    for size in [train_size, val_size, test_size]:
        end = start + int(size * len(X))
        split_data.append((X.iloc[start:end], y.iloc[start:end]))
        start = end
    return split_data

In [4]:
train, val, test = data_split(X_resampled, y_resampled, 0.6, 0.2, 0.2)

train_X, train_y = train
val_X, val_y = val
test_X, test_y = test
train_y = train_y.to_numpy().ravel()
val_y = val_y.to_numpy().ravel()
test_y = test_y.to_numpy().ravel()

# show shape of all the data
print("train_X shape:", train_X.shape)
print("train_y shape:", train_y.shape)
print("val_X shape:", val_X.shape)
print("val_y shape:", val_y.shape)
print("test_X shape:", test_X.shape)
print("test_y shape:", test_y.shape)

train_X shape: (176, 22)
train_y shape: (176,)
val_X shape: (58, 22)
val_y shape: (58,)
test_X shape: (58, 22)
test_y shape: (58,)


In [5]:
train_X

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
0,193.030,208.900,80.297,0.00766,0.00004,0.00450,0.00389,0.01351,0.03044,0.275,...,0.02084,0.05312,0.00947,21.934,0.497554,0.740539,-5.845099,0.278679,2.608749,0.185668
1,116.286,177.291,96.983,0.00314,0.00003,0.00134,0.00192,0.00403,0.01564,0.136,...,0.01691,0.02001,0.00737,24.199,0.598515,0.654331,-5.592584,0.133917,2.058658,0.214346
2,176.824,215.293,83.961,0.00460,0.00003,0.00209,0.00221,0.00628,0.01169,0.117,...,0.01104,0.01603,0.01161,27.166,0.400088,0.656182,-4.711007,0.281618,2.655744,0.234809
3,223.361,263.872,87.638,0.00352,0.00002,0.00169,0.00188,0.00506,0.02536,0.225,...,0.01909,0.04137,0.01493,20.366,0.566849,0.574282,-5.456811,0.345238,2.840556,0.232861
4,120.267,137.244,114.820,0.00333,0.00003,0.00155,0.00202,0.00466,0.01608,0.140,...,0.01351,0.02337,0.00607,24.886,0.596040,0.764112,-5.634322,0.257682,1.854785,0.211756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,113.400,133.344,107.816,0.00451,0.00004,0.00219,0.00283,0.00658,0.04879,0.431,...,0.04370,0.07154,0.02278,19.013,0.647900,0.708144,-4.378916,0.300067,2.445502,0.259451
172,122.964,130.049,114.676,0.00428,0.00003,0.00124,0.00155,0.00373,0.01681,0.154,...,0.01400,0.02789,0.00462,24.971,0.538688,0.754073,-6.482096,0.264967,2.054419,0.128872
173,201.774,262.707,78.228,0.00694,0.00003,0.00412,0.00396,0.01235,0.02574,0.255,...,0.01758,0.04363,0.04441,19.368,0.508479,0.683761,-6.934474,0.159890,2.316346,0.112838
174,127.930,138.752,112.173,0.00605,0.00005,0.00321,0.00318,0.00962,0.03235,0.339,...,0.03105,0.04079,0.02663,19.651,0.654945,0.675865,-5.498456,0.234196,2.103014,0.216638


## PowerTransformer - PCA - DecisionTree

In [9]:
def objective(trial):
    # PowerTransformer
    pt_method = trial.suggest_categorical('pt_method', ['yeo-johnson'])
    pt = PowerTransformer(method=pt_method)

    # PCA
    pca_n_components = trial.suggest_int('pca_n_components', 1, train_X.shape[1], log=False)
    pca = PCA(n_components=pca_n_components)

    # Decision Tree
    dt_max_depth = trial.suggest_int('dt_max_depth', 1, 23, log=False)
    dt_max_feats = trial.suggest_int('dt_max_feats', 1, 23, log=False)
    dt = DecisionTreeClassifier(max_depth=dt_max_depth)

    # Create the pipeline
    pipeline = Pipeline([
        ('power_transformer', pt),
        ('pca', pca),
        ('classifier', dt)
    ])

    # Fit the pipeline on the training data
    pipeline.fit(train_X, train_y)

    # Evaluate the pipeline on the validation set
    y_pred_val = pipeline.predict(val_X)
    accuracy_val = accuracy_score(val_y, y_pred_val)

    return 1.0 - accuracy_val

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Get the best hyperparameters
best_params = study.best_params

# Train the best pipeline on the entire training set
best_pipeline = Pipeline([
    ('power_transformer', PowerTransformer(method=best_params['pt_method'])),
    ('pca', PCA(n_components=best_params['pca_n_components'])),
    ('classifier', DecisionTreeClassifier(max_depth=best_params['dt_max_depth'], max_features=best_params['dt_max_feats']))
])
best_pipeline.fit(train_X, train_y)

# Evaluate the best pipeline on the test set
y_pred_test = best_pipeline.predict(test_X)
accuracy_test = accuracy_score(test_y, y_pred_test)
print(f"Pipeline Accuracy on Test Set: {accuracy_test}")

[I 2023-12-05 23:09:37,054] A new study created in memory with name: no-name-03e9c926-1b18-43df-bda7-a1e2716837a7
[I 2023-12-05 23:09:37,113] Trial 0 finished with value: 0.12068965517241381 and parameters: {'pt_method': 'yeo-johnson', 'pca_n_components': 22, 'dt_max_depth': 20, 'dt_max_feats': 1}. Best is trial 0 with value: 0.12068965517241381.
[I 2023-12-05 23:09:37,167] Trial 1 finished with value: 0.15517241379310343 and parameters: {'pt_method': 'yeo-johnson', 'pca_n_components': 11, 'dt_max_depth': 5, 'dt_max_feats': 5}. Best is trial 0 with value: 0.12068965517241381.
[I 2023-12-05 23:09:37,236] Trial 2 finished with value: 0.13793103448275867 and parameters: {'pt_method': 'yeo-johnson', 'pca_n_components': 3, 'dt_max_depth': 9, 'dt_max_feats': 12}. Best is trial 0 with value: 0.12068965517241381.
[I 2023-12-05 23:09:37,301] Trial 3 finished with value: 0.10344827586206895 and parameters: {'pt_method': 'yeo-johnson', 'pca_n_components': 6, 'dt_max_depth': 16, 'dt_max_feats': 6}

[I 2023-12-05 23:09:39,848] Trial 35 finished with value: 0.15517241379310343 and parameters: {'pt_method': 'yeo-johnson', 'pca_n_components': 4, 'dt_max_depth': 16, 'dt_max_feats': 1}. Best is trial 3 with value: 0.10344827586206895.
[I 2023-12-05 23:09:39,941] Trial 36 finished with value: 0.15517241379310343 and parameters: {'pt_method': 'yeo-johnson', 'pca_n_components': 7, 'dt_max_depth': 3, 'dt_max_feats': 5}. Best is trial 3 with value: 0.10344827586206895.
[I 2023-12-05 23:09:40,021] Trial 37 finished with value: 0.13793103448275867 and parameters: {'pt_method': 'yeo-johnson', 'pca_n_components': 12, 'dt_max_depth': 6, 'dt_max_feats': 8}. Best is trial 3 with value: 0.10344827586206895.
[I 2023-12-05 23:09:40,099] Trial 38 finished with value: 0.10344827586206895 and parameters: {'pt_method': 'yeo-johnson', 'pca_n_components': 3, 'dt_max_depth': 11, 'dt_max_feats': 3}. Best is trial 3 with value: 0.10344827586206895.
[I 2023-12-05 23:09:40,174] Trial 39 finished with value: 0.2

Pipeline Accuracy on Test Set: 0.8793103448275862


## RobustScaler - PCA - Decision Tree

In [10]:
def objective(trial):
    # PCA
    pca_n_components = trial.suggest_int('pca_n_components', 1, train_X.shape[1], log=False)
    pca = PCA(n_components=pca_n_components)

    # Decision Tree
    dt_max_depth = trial.suggest_int('dt_max_depth', 1, 23, log=False)
    dt_max_feats = trial.suggest_int('dt_max_feats', 1, 23, log=False)
    dt = DecisionTreeClassifier(max_depth=dt_max_depth)

    # Create the pipeline
    pipeline = Pipeline([
        ('robust_scaler', RobustScaler()),
        ('pca', pca),
        ('classifier', dt)
    ])

    # Fit the pipeline on the training data
    pipeline.fit(train_X, train_y)

    # Evaluate the pipeline on the validation set
    y_pred_val = pipeline.predict(val_X)
    accuracy_val = accuracy_score(val_y, y_pred_val)

    return 1.0 - accuracy_val

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Get the best hyperparameters
best_params = study.best_params

# Train the best pipeline on the entire training set
best_pipeline = Pipeline([
    ('robust_scaler', RobustScaler()),
    ('pca', PCA(n_components=best_params['pca_n_components'])),
    ('classifier', DecisionTreeClassifier(max_depth=best_params['dt_max_depth'], max_features=best_params['dt_max_feats']))
])
best_pipeline.fit(train_X, train_y)

# Evaluate the best pipeline on the test set
y_pred_test = best_pipeline.predict(test_X)
accuracy_test = accuracy_score(test_y, y_pred_test)
print(f"Pipeline Accuracy on Test Set: {accuracy_test}")

[I 2023-12-05 23:09:44,007] A new study created in memory with name: no-name-1adae50f-9489-4eb2-9777-1342f7cb40eb
[I 2023-12-05 23:09:44,034] Trial 0 finished with value: 0.18965517241379315 and parameters: {'pca_n_components': 19, 'dt_max_depth': 4, 'dt_max_feats': 13}. Best is trial 0 with value: 0.18965517241379315.
[I 2023-12-05 23:09:44,053] Trial 1 finished with value: 0.2586206896551724 and parameters: {'pca_n_components': 4, 'dt_max_depth': 2, 'dt_max_feats': 12}. Best is trial 0 with value: 0.18965517241379315.
[I 2023-12-05 23:09:44,074] Trial 2 finished with value: 0.12068965517241381 and parameters: {'pca_n_components': 12, 'dt_max_depth': 5, 'dt_max_feats': 21}. Best is trial 2 with value: 0.12068965517241381.
[I 2023-12-05 23:09:44,093] Trial 3 finished with value: 0.15517241379310343 and parameters: {'pca_n_components': 13, 'dt_max_depth': 3, 'dt_max_feats': 16}. Best is trial 2 with value: 0.12068965517241381.
[I 2023-12-05 23:09:44,111] Trial 4 finished with value: 0.1

[I 2023-12-05 23:09:45,330] Trial 39 finished with value: 0.10344827586206895 and parameters: {'pca_n_components': 2, 'dt_max_depth': 22, 'dt_max_feats': 15}. Best is trial 11 with value: 0.051724137931034475.
[I 2023-12-05 23:09:45,370] Trial 40 finished with value: 0.12068965517241381 and parameters: {'pca_n_components': 11, 'dt_max_depth': 20, 'dt_max_feats': 10}. Best is trial 11 with value: 0.051724137931034475.
[I 2023-12-05 23:09:45,409] Trial 41 finished with value: 0.08620689655172409 and parameters: {'pca_n_components': 4, 'dt_max_depth': 22, 'dt_max_feats': 19}. Best is trial 11 with value: 0.051724137931034475.
[I 2023-12-05 23:09:45,443] Trial 42 finished with value: 0.051724137931034475 and parameters: {'pca_n_components': 5, 'dt_max_depth': 23, 'dt_max_feats': 20}. Best is trial 11 with value: 0.051724137931034475.
[I 2023-12-05 23:09:45,476] Trial 43 finished with value: 0.10344827586206895 and parameters: {'pca_n_components': 6, 'dt_max_depth': 23, 'dt_max_feats': 21}.

Pipeline Accuracy on Test Set: 0.9137931034482759
