### Test decision tree

In [None]:
# Import libraries
from sklearn import datasets, svm
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap

# Load the preprocessed dataset (path is relative to the notebook's working directory).
df = pd.read_csv('./data/preprocessed.csv')

# Scale data
# Number of leading columns that get standardised.
# NOTE(review): assumes the first five columns of preprocessed.csv are the
# numeric features — confirm against the file layout.
N_SCALED_COLS = 5
scaled_cols = list(df.columns[:N_SCALED_COLS])

# Fit ONE scaler on all scaled columns at once. StandardScaler standardises each
# column independently, so the resulting values match a per-column loop, but the
# fitted `scaler` now holds the statistics of every scaled column (a loop that
# refits it per column would leave it fitted on the last column only), which
# makes it reusable for transforming new data later.
scaler = StandardScaler()
# Assigning through df[cols] replaces the columns instead of writing into the
# existing ones, so integer-typed columns become float without dtype clashes.
df[scaled_cols] = scaler.fit_transform(df[scaled_cols].to_numpy())

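Perform simple random oversampling of the machine-failure rows (`Machine failure == 1`), duplicating them at random, to reduce the bias of the model towards the non-failure class.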

In [None]:
# Set pseudo-random number generator
# The Generator returned by default_rng must be kept and used for every draw:
# creating it without storing it leaves the legacy np.random.* functions
# unseeded, so the oversampled rows would differ on every run.
rng = np.random.default_rng(seed=123)

LABEL_NAME = 'Machine failure'
# Positional index of the label column inside the array handed to the model.
# NOTE(review): assumes 'Machine failure' is the 6th column of the frame — confirm.
LABEL_COL = 5

# Build oversampled dataset
is_failure = df[LABEL_NAME] == 1
machine_failures = df[is_failure].reset_index(drop=True)
n_failures = int(is_failure.sum())
n_non_failures = len(df) - n_failures

# Number of duplicated failure rows needed so that both classes end up with the
# same number of rows. (Adding len(df) - 1 - n_failures rows instead would
# overshoot and leave failures outnumbering non-failures.)
n_extra = max(n_non_failures - n_failures, 0)
if n_extra > 0 and n_failures == 0:
    raise ValueError("Cannot oversample: no rows with 'Machine failure' == 1")

if n_extra > 0:
    # Draw every row index in one call and stack once, rather than growing the
    # array row by row (each np.vstack copies the whole array).
    extra_idx = rng.integers(0, n_failures, size=n_extra)
    oversampled_data = np.vstack((df.values, machine_failures.values[extra_idx]))
else:
    # Already balanced (or failures are the majority): nothing to add.
    oversampled_data = df.values.copy()

# Prepare data for training
X = np.delete(oversampled_data, LABEL_COL, axis=1)
y = oversampled_data[:, LABEL_COL]

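The decision tree on the oversampled dataset shows astonishing results in terms of both specificity and recall. Note, however, that these metrics are computed on the same (oversampled) data the tree was trained on, so they are optimistic and should be confirmed on held-out data.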

In [None]:
from sklearn import tree

# Fixed random_state so the fitted tree is the same on every run
# (same seed value as the one used for oversampling).
clf = tree.DecisionTreeClassifier(random_state=123)
clf = clf.fit(X, y)

# NOTE: predictions are made on the very rows the tree was fitted on, so the
# metrics below describe training performance, not generalisation.
y_pred = clf.predict(X)

# confusion_matrix expects (y_true, y_pred): rows are true labels and columns
# are predicted labels, which is also how ConfusionMatrixDisplay labels its axes.
cm = confusion_matrix(y, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot()

# With rows = truth and columns = prediction:
#   cm[0,0] = TN, cm[0,1] = FP, cm[1,0] = FN, cm[1,1] = TP
TN, FP, FN, TP = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]

specificity = TN / (TN + FP)   # true negative rate
recall = TP / (TP + FN)        # true positive rate
false_negative_rate = 1 - recall
false_positive_rate = 1 - specificity
# LR+ = TPR / FPR; it is unbounded when there are no false positives.
if false_positive_rate > 0:
    positive_likelihood_ratio = recall / false_positive_rate
else:
    positive_likelihood_ratio = float('inf')

# Print some metrics alternative to prediction accuracy
print('True Negative rate (specificity): ', specificity)
print('True positive rate (recall): ', recall)
print('False Negative Rate: ', false_negative_rate)
print('False positive rate: ', false_positive_rate)
print('Positive Likelihood ratio: ', positive_likelihood_ratio)

In [None]:
# Save model with pickle
import pickle
import os
models_dir = 'models'
# Create the output directory if it does not exist yet, so the dump below does
# not fail on a fresh checkout.
os.makedirs(models_dir, exist_ok=True)

# Filename convention for saved models: modelname_*otherParams.sav
# (a decision tree has no kernel type or C value, so only the model name is used).
filename = 'decision_tree.sav'
# Use a context manager so the file handle is flushed and closed deterministically.
# NOTE: pickle files can execute arbitrary code when loaded — only load this
# file back from a trusted location.
with open(os.path.join(models_dir, filename), 'wb') as model_file:
    pickle.dump(clf, model_file)