## **First ML-Algorithms - PTBDB**

In [80]:
# Load packages and Data

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

ptbdb_abnormal_df = pd.read_csv("ptbdb_abnormal.csv")
ptbdb_normal_df = pd.read_csv("ptbdb_normal.csv")


In [81]:
# Renaming the Columns
new_column_names = range(1, len(ptbdb_normal_df.columns) + 1)
ptbdb_normal_df.columns = new_column_names

new_column_names = range(1, len(ptbdb_abnormal_df.columns) + 1)
ptbdb_abnormal_df.columns = new_column_names

# Combining the two Dataframes 
ptbdb_comb_df = pd.concat([ptbdb_normal_df, ptbdb_abnormal_df], ignore_index=True)

# Transforming the Target Variable to Integer
ptbdb_comb_df[188]=ptbdb_comb_df[188].astype(int)

ptbdb_comb_df[188].value_counts()
#ptbdb_comb_df.shape

188
1    10505
0     4045
Name: count, dtype: int64

In [82]:
# Load ML-Packages

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler,  ClusterCentroids
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate

#### Preprocessing with optional Standard-Scaler

In [83]:
# Preprocessing, Abnormal == 1

ptbdb_comb_df[188] =ptbdb_comb_df[188].replace({0: 1, 1:0})

feats = ptbdb_comb_df.drop(188, axis = 1)
target = ptbdb_comb_df[188]

X_train, X_test, y_train, y_test = train_test_split(feats, target, test_size= 0.2, random_state = 321)

In [6]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### **Simple SVM**

In [84]:
svm = SVC(gamma = "scale")

svm.fit(X_train, y_train)

from sklearn.model_selection import cross_val_score

scores = cross_val_score(svm, X_train, y_train, cv=5)
print(f"Kreuzvalidierungs-Genauigkeit: {scores.mean():.2f} +/- {scores.std():.2f}")

print(f"Verwendeter gamma-Wert: {svm._gamma}")

Kreuzvalidierungs-Genauigkeit: 0.90 +/- 0.01
Verwendeter gamma-Wert: 0.11473276029088923


In [85]:
y_pred = svm.predict(X_test)
print(pd.crosstab(y_test, y_pred, colnames = ["Predictions"], rownames = ["Reality"]), "\n")
print(classification_report_imbalanced(y_test, y_pred))

Predictions     0    1
Reality               
0            1992  109
1             142  667 

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.95      0.82      0.94      0.88      0.79      2101
          1       0.86      0.82      0.95      0.84      0.88      0.77       809

avg / total       0.91      0.91      0.86      0.91      0.88      0.79      2910



##### Most important metric to optimize: Recall
- With SC: pre 0.88, rec 0.87
- Without SC: pre 86, rec 0.82 

### **Undersampling with Cluster Centroids**

In [86]:
cc = ClusterCentroids()
X_cc, y_cc = cc.fit_resample(X_train, y_train)
print('Klassenverteilung CC :', dict(pd.Series(y_cc).value_counts()))

Klassenverteilung CC : {0: np.int64(3236), 1: np.int64(3236)}


In [87]:
svm = SVC(gamma='scale')
svm.fit(X_cc, y_cc)

y_pred = svm.predict(X_test)
print(pd.crosstab(y_test, y_pred, colnames = ["Predictions"], rownames = ["Reality"]), "\n")
print(classification_report_imbalanced(y_test, y_pred))

Predictions     0    1
Reality               
0            1645  456
1              30  779 

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.78      0.96      0.87      0.87      0.74      2101
          1       0.63      0.96      0.78      0.76      0.87      0.77       809

avg / total       0.88      0.83      0.91      0.84      0.87      0.75      2910



- The recall is good - yet the precision lacks significantly
- With SC: pre 0.63, rec 0.96
- Without SC: pre 0.63, rec 0.96

### **Oversampling with SMOTE**

In [88]:
smo=SMOTE()
X_sm,y_sm = smo.fit_resample(X_train, y_train)
print("Klassen Anzahl Oversampled (SMO): ", dict(pd.Series(y_sm).value_counts()))

Klassen Anzahl Oversampled (SMO):  {0: np.int64(8404), 1: np.int64(8404)}


In [89]:
svm = SVC(gamma = "scale")
svm.fit(X_sm, y_sm)
y_pred = svm.predict(X_test)
print(pd.crosstab(y_test, y_pred, rownames = ["Reality"], colnames = ["Predictions"]), "\n")
print(classification_report_imbalanced(y_test, y_pred))

Predictions     0    1
Reality               
0            1839  262
1              35  774 

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.88      0.96      0.93      0.92      0.83      2101
          1       0.75      0.96      0.88      0.84      0.92      0.84       809

avg / total       0.92      0.90      0.93      0.90      0.92      0.83      2910



- In comparison to the undersampling with CC the Precision is better, the recall is comparable
- With SC: pre 0.78, rec 0.95
- Without SC: 0.75, rec 0.96

### **SVM with SMOTE and decreased probability threshold (0.4)**

In [90]:
# since the most promising values so far came with oversampling with SMOTE I used SMOTE for this Model

svm = SVC(probability=True, gamma='scale') 
svm.fit(X_sm, y_sm)                        

threshold = 0.4 

probs = svm.predict_proba(X_test)

pred_class = (probs[:,1]>=threshold).astype('int')

pd.crosstab(y_test, pred_class)
print("\n")
print(classification_report_imbalanced(y_test, pred_class))



                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.89      0.95      0.93      0.92      0.84      2101
          1       0.76      0.95      0.89      0.85      0.92      0.85       809

avg / total       0.92      0.90      0.94      0.91      0.92      0.84      2910



- Depending on the value for threshold the recall can be increased significantly, yet at cost of the precision.
- With SC: prec 0.78, rec 0.95
- Without SC: prec 0.76, rec 0.95

### **Balanced Random Forest Classifier - !!!** 

In [91]:
from imblearn.ensemble import BalancedRandomForestClassifier

bclf = BalancedRandomForestClassifier()
bclf.fit(X_train, y_train) 
y_pred = bclf.predict(X_test)
pd.crosstab(y_test, y_pred)
print("\n")
print(classification_report_imbalanced(y_test, y_pred))

  warn(
  warn(
  warn(




                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.95      0.98      0.97      0.96      0.92      2101
          1       0.87      0.98      0.95      0.92      0.96      0.93       809

avg / total       0.96      0.95      0.97      0.95      0.96      0.92      2910



##### **Extremely good results!**
- With SC: prec 0.88, rec 0.98
- Without SC: prec 0.87, rec 0.98

### **Different ML-Algorithms with external Cross-Validation of Hyperparameters**
- Preprocessing with CC

In [97]:
clf_lr = LogisticRegression(random_state=22, max_iter=2000)
clf_rf = RandomForestClassifier(random_state=22)
clf_svc = SVC(random_state=22)

param_grid_lr = {'solver': ['liblinear', 'lbfgs'], 'C': np.logspace(-4, 2, 9)}

param_grid_rf = [{'n_estimators': [10, 50, 100, 500], 
                  'min_samples_leaf': [1, 3, 5], 
                  'max_features': ['sqrt', 'log2'], 
                  n_jobs : -1}]

param_grid_svc = [{'kernel': ['rbf'], 'C': np.logspace(-4, 4, 9), 'gamma': np.logspace(-4, 0, 4)},
                  {'kernel': ['linear'], 'C': np.logspace(-4, 4, 9)}]

NameError: name 'n_jobs' is not defined

In [None]:
gridcvs = {}

for pgrid, clf, name in zip((param_grid_lr, param_grid_rf, param_grid_svc),
                            (clf_lr, clf_rf, clf_svc),
                            ('LogisticRegression', 'RF', 'SVM')):
    gcv = GridSearchCV(clf, pgrid, cv=3, refit=True, scoring='recall', n_jobs = -1)
    gridcvs[name] = gcv
    
print(gridcvs)

In [20]:
outer_cv = StratifiedKFold(n_splits=3, shuffle=True)
outer_scores = {}

for name, gs in gridcvs.items():
    # Perform cross-validation and calculate recall
    cv_results = cross_validate(gs, X_cc, y_cc, cv=outer_cv, scoring='recall', return_estimator=True, return_train_score=False)
    
    nested_score = cv_results['test_score']
    outer_scores[name] = nested_score
    print(f'{name}: outer recall score {100*nested_score.mean():.2f} +/- {100*nested_score.std():.2f}', "\n")

: 

: 

- LogisticRegression: outer recall score 76.76 +/- 1.37
- RF: outer recall score 93.60 +/- 0.99 

In [None]:
from sklearn.metrics import accuracy_score

final_clf = gridcvs['LogisticRegression']
final_clf.fit(X_sm, y_sm)
y_pred = final_clf.predict(X_test)

print(f'Best Parameters: {final_clf.best_params_}, "\n"')

pd.crosstab(y_test, y_pred)
print("\n")
print(classification_report_imbalanced(y_test, y_pred))

- ..

#### **SVM - Cross-Validation**

#### **Isolation Forest - Outlier Detection**

In [93]:
from sklearn.ensemble import IsolationForest

# Preprocessing, Abnormal == -1

ptbdb_comb_df[188] =ptbdb_comb_df[188].replace({0: -1, 1:1})

feats = ptbdb_comb_df.drop(188, axis = 1)
target = ptbdb_comb_df[188]

X_train, X_test, y_train, y_test = train_test_split(feats, target, test_size= 0.2, random_state = 321)

# since our data is not normal distributed I decided not to use Standard-Scaler here. 

In [94]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [95]:
isof = IsolationForest(contamination = 0.278, n_estimators = 100, n_jobs=-1)

isof.fit(X_train)
y_pred = isof.predict(X_test)
pd.crosstab(y_test, y_pred)
print ("\n")
print(classification_report_imbalanced(y_test, y_pred))



                   pre       rec       spe        f1       geo       iba       sup

         -1       0.78      0.31      0.78      0.45      0.49      0.23      2101
          1       0.30      0.78      0.31      0.44      0.49      0.25       809

avg / total       0.65      0.44      0.65      0.44      0.49      0.24      2910



- With SC: prec 0.81, rec 0.32
- Without SC: prec 0.81, rec 0.31

In [96]:
# Recall Optimization with Cross-Validation and Hyperparameter Optimization

skf = StratifiedKFold(n_splits=3) 
folds = list(skf.split(X_train, y_train))
forest = IsolationForest()

from sklearn.metrics import make_scorer, recall_score
resc = make_scorer(recall_score,pos_label=-1)

params = {'contamination': np.linspace(0.001, 0.1, 10), 'n_estimators': [100,200,300]}

search = GridSearchCV(estimator=forest, param_grid=params, scoring=resc, cv=folds, n_jobs=-1)
search.fit(X_train, y_train)

best_params = search.best_params_
print("Beste Hyperparameter:", best_params, "\n")

# predict
optimal_forest = search.best_estimator_
y_pred = optimal_forest.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=['Reality'], colnames=['Prediction'])
print("\n", classification_report_imbalanced(y_test, y_pred))

Beste Hyperparameter: {'contamination': np.float64(0.1), 'n_estimators': 300} 


                    pre       rec       spe        f1       geo       iba       sup

         -1       0.84      0.11      0.95      0.20      0.32      0.10      2101
          1       0.29      0.95      0.11      0.44      0.32      0.11       809

avg / total       0.69      0.34      0.71      0.26      0.32      0.10      2910



- With SC: prec 0.80, rec 0.44
- Without SC: 0.78, rec 0.44

#### **One Class SVM**

In [75]:
# Preprocessing: 

# We select only the labels that correspond to 1
y_inliers = y_train[y_train.values==1]

# We create a list of indexes to be able to recover the explanatory variables of these indexes
liste = list(y_inliers.index.values)

# We recover these explanatory variables in a new DataFrame
X_train_inliers = X_train.loc[liste]
y_train_inliers = y_train.loc[liste]

In [76]:
scaler = MinMaxScaler(feature_range = (-1,1))

X_train_inliers = scaler.fit_transform(X_train_inliers)

X_test = scaler.transform(X_test)

In [77]:
from sklearn.svm import OneClassSVM

ocsvm = OneClassSVM(nu=0.278, kernel = "rbf", gamma = "scale")

ocsvm.fit(X_train_inliers)

y_pred = ocsvm.predict(X_test)

pd.crosstab(y_test, y_pred, rownames = ["Reality"], colnames = ["Prediction"])
print("\n", classification_report(y_test,y_pred))


               precision    recall  f1-score   support

          -1       0.86      0.34      0.49      2101
           1       0.33      0.85      0.48       809

    accuracy                           0.48      2910
   macro avg       0.60      0.60      0.48      2910
weighted avg       0.71      0.48      0.49      2910



- With SC: prec 0.86, rec 0.34
- Without SC: 0.88, rec 0.69

In [78]:
# One Class SVM Grid Search

from sklearn.svm import OneClassSVM

# StratifiedKFold for cross-validation
skf = StratifiedKFold(n_splits=3)
folds = list(skf.split(X_train_inliers, y_train_inliers))

# Custom scorer for recall
resc = make_scorer(recall_score, pos_label=-1)

# Parameter grid for hyperparameter optimization with different kernels
params = {
    'kernel': ['rbf'],
    'nu': np.linspace(0.01, 0.3, 10),  # Parameter for One-Class SVM
    'gamma': ['scale', 'auto']  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
}

# One-Class SVM
ocsvm = OneClassSVM()

# GridSearchCV for hyperparameter optimization
search = GridSearchCV(estimator=ocsvm, param_grid=params, scoring=resc, cv=folds, n_jobs=-1)
search.fit(X_train_inliers, y_train_inliers)

# Best hyperparameters
best_params = search.best_params_
print("Beste Hyperparameter:", best_params)

# Best estimator
optimal_ocsvm = search.best_estimator_

# Predictions
y_pred = optimal_ocsvm.predict(X_test)

# Confusion matrix and classification report
print(pd.crosstab(y_test, y_pred, rownames=['Reality'], colnames=['Prediction']))
print("\n", classification_report_imbalanced(y_test, y_pred))

Beste Hyperparameter: {'gamma': 'scale', 'kernel': 'rbf', 'nu': np.float64(0.01)}
Prediction  -1     1
Reality             
-1          68  2033
 1           4   805

                    pre       rec       spe        f1       geo       iba       sup

         -1       0.94      0.03      1.00      0.06      0.18      0.03      2101
          1       0.28      1.00      0.03      0.44      0.18      0.04       809

avg / total       0.76      0.30      0.73      0.17      0.18      0.03      2910



- With SC: prec 0.94, rec 0.03
- Without SC: 0.98, rec 0.42