In [None]:
#MIT License

#Copyright (c) [2021] [Oliver Böhme]

#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:

#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.

#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.

In [None]:
#I. SETUP AND DATA PREPROCESSING
#Import libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from dtaidistance import dtw 
from dtaidistance import dtw_visualisation as dtwvis

import tensorflow as tf
from tensorflow import keras
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Embedding, Flatten, LeakyReLU, BatchNormalization, Dropout
from keras.activations import relu, tanh, softmax

from sklearn.model_selection import GridSearchCV

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import IPython
np.random.seed(7)

In [None]:
#Import the raw dataset (first worksheet of the workbook; the path is left blank in this notebook)
data_raw = pd.read_excel('', sheet_name=0)
print('Shape of the Dataset: ',data_raw.shape)
data_raw.head()


In [None]:
#Cut the raw table into three column windows (positional slices):
projects = data_raw.iloc[:, 2:3]   #single column, used below as the leading project column
classes = data_raw.iloc[:, 6:7]    #single column, used later as the classification target
data = data_raw.iloc[:, 9:28]      #19 columns, used later as the input features

#Join matrices (project column first, then the payload columns):
data_projects = pd.concat([projects, data], axis="columns")
classes_projects = pd.concat([projects, classes], axis="columns")
data_projects


In [None]:
#Reforming input features:
#Every vehicle project occupies ROWS_PER_PROJECT consecutive rows of data_projects.
#The first TIME_STEPS rows of each project are flattened into ONE wide row:
#  - the first time stamp contributes columns 0:20 (project column + 19 features),
#  - every later time stamp contributes columns 1:20 (the 19 features only).
#The rows are collected in a list and concatenated once, instead of creating
#row_<j> variables through exec() and growing frames with the removed DataFrame.append().
N_PROJECTS = 302
ROWS_PER_PROJECT = 195
TIME_STEPS = 194

project_rows = []
for j in range(N_PROJECTS):
    start = j * ROWS_PER_PROJECT

    #Basic line of the project (index reset so that all pieces align on row 0)
    pieces = [data_projects.iloc[start:start + 1, 0:20].reset_index(drop=True)]

    #Add all features sorted by time stamps to the basic line
    pieces.extend(
        data_projects.iloc[start + i:start + i + 1, 1:20].reset_index(drop=True)
        for i in range(1, TIME_STEPS)
    )
    project_rows.append(pd.concat(pieces, axis=1))

#Stack the wide rows so that the matrix "data_transformed" is created (one row per project).
data_transformed = pd.concat(project_rows)

print("Shape: ",data_transformed.shape)
data_transformed.head()


In [None]:
#Reforming output feature
#The class of a project is read from the first row of its block of ROWS_PER_PROJECT rows.
#The rows are collected in a list and concatenated once, instead of creating
#row_<j> variables through exec() and growing the frame with the removed DataFrame.append().
N_PROJECTS = 302
ROWS_PER_PROJECT = 195

class_rows = [
    classes.iloc[j * ROWS_PER_PROJECT:j * ROWS_PER_PROJECT + 1, 0:20].reset_index(drop=True)
    for j in range(N_PROJECTS)
]

#Stack the rows so that the matrix "classes_transformed" is created (one row per project).
classes_transformed = pd.concat(class_rows)

print("Shape: ",classes_transformed.shape)
classes_transformed.head()


In [None]:
#Here you can define which part of the data will be used: First Third: 0:99 | Second Third: 99:200 | Third Third 200:301
#The same row window is applied to features, classes and project names; previously X took
#ALL rows while y took rows 0:99, so train_test_split received inconsistent sample counts.
row_window = slice(0, 99)

#Define X (column 0 is the project column and is therefore excluded from the features)
X = data_transformed.iloc[row_window,1:3687]

#Normalize input data
scaler = Normalizer().fit(X) #Define model
normalizedX = scaler.transform(X) #Normalize data
X = normalizedX #Transform into normal X

#Define y
y = classes_transformed.iloc[row_window,0:1]

#Split data in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=42)

#Convert in numpy array for efficiency reasons
X_train=np.array(X_train)
y_train=np.array(y_train)
X_test=np.array(X_test)
y_test=np.array(y_test)


#Transfer the line names of X_test into an additional variable, so that the projects can be assigned to the classifications later on
Projekte = data_transformed.iloc[row_window,0:1]
Ypsilon = classes_transformed.iloc[row_window,0:1]

#Split with the same sizes and random_state as above, so that row j of Projekte_test
#belongs to row j of X_test / y_test
Projekte_train, Projekte_test, Ypsilon_train, Ypsilon_test = train_test_split(Projekte, Ypsilon, train_size=0.7, shuffle=True, random_state=42)
#print(Projekte_test)

In [None]:
#Visualization:
print(data_projects.shape)

#One figure per project: column 19 over the first 194 rows of each 195-row project block
for i in range(0, 302):
    first_row = i*195
    fig, ax = plt.subplots(figsize=(14, 6), dpi=100) #Size of the diagram
    ax.plot(data_projects.iloc[first_row:(first_row+194),19:20], color='grey') #https://matplotlib.org/2.1.1/api/_as_gen/matplotlib.pyplot.plot.html
    ax.set_xlabel("time",fontsize=12, color='black')
    ax.set_ylabel("Number of errors",fontsize=12, color='black')
    ax.set_title("Error graph by projects",fontsize=14, color='black')
    plt.show()

In [None]:
#II. APPLIED MULTIVARIATE CLASSIFICATION

#1 Ada Boost Classifier

#Initialize the model:
clf_1 = AdaBoostClassifier(n_estimators=400)
#random_state takes an int or the None object (the string "None" is not a valid value)
param_grid = {'algorithm': ["SAMME", "SAMME.R"], 'random_state': [0, None]}
clf_1 = GridSearchCV(clf_1, param_grid, cv=5, verbose=1)

#Train the model:
clf_1.fit(X_train, y_train)

#Print best parameters:
print("Best parameters: ",clf_1.best_params_,"\n")

#Calculate confusion matrix (class 1 is treated as the positive class):
pred_y = clf_1.predict(X_test)
y_pred = (pred_y > 0.5)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)

In [None]:
#2 Decision Tree Classifier

#Initialize the model:
clf_2 = DecisionTreeClassifier(random_state=0)
#max_features: "None", "int" and "float" were placeholder strings, not valid values.
#None (all features), "sqrt" and "log2" are searched; "auto" was an alias of "sqrt" for classifiers.
param_grid = {'criterion': ["gini", "entropy"], 'splitter': ["best", "random"], 'max_features': [None, "sqrt", "log2"]}
clf_2 = GridSearchCV(clf_2, param_grid, cv=5, verbose=1)

#Train the model:
clf_2.fit(X_train, y_train)

#Print best parameters:
print("Best parameters: ",clf_2.best_params_,"\n")

#Calculate confusion matrix (class 1 is treated as the positive class):
pred_y = clf_2.predict(X_test)
y_pred = (pred_y > 0.5)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)

In [None]:
#3 Discriminant Analysis

#Initialize the model:
clf_3 = QuadraticDiscriminantAnalysis()
#store_covariance is a boolean flag; the strings "True"/"False" are both truthy and not real booleans
param_grid = {'reg_param':[0.0, 0.1, 0.5, 1], 'store_covariance': [True, False], 'tol': [0.001, 0.0001, 0.00001]}
clf_3 = GridSearchCV(clf_3, param_grid, cv=5, verbose=1)

#Train the model:
clf_3.fit(X_train, y_train)

#Print best parameters:
print("Best parameters: ",clf_3.best_params_,"\n")

#Calculate confusion matrix (class 1 is treated as the positive class):
pred_y = clf_3.predict(X_test)
y_pred = (pred_y > 0.5)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)

In [None]:
#4 Gaussian Process Classifier

#Initialize the model:
clf_4 = GaussianProcessClassifier(random_state=0, n_jobs=-1)
param_grid = {'n_restarts_optimizer': np.arange(0,11), 'max_iter_predict': [50, 100, 200, 400]}
clf_4= GridSearchCV(clf_4, param_grid, cv=5, verbose=1)

#Train the model:
clf_4.fit(X_train, y_train)

#Print best parameters:
print("Best parameters: ",clf_4.best_params_,"\n")

#Calculate confusion matrix (class 1 is treated as the positive class):
pred_y = clf_4.predict(X_test)
y_pred = (pred_y > 0.5)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)

In [None]:
#5. Multi Layer Perceptron Classifier

#Initialize the model:
clf_5 = MLPClassifier(random_state=1, max_iter=1000, verbose=1)
#param_grid = {'hidden_layer_sizes': [50, 100, 200, 400], 'activation': ["identity", "logistic", "tanh", "relu"], 'solver': ["lbfgs", "sgd", "adam"]}
param_grid = {'hidden_layer_sizes': [6, 7, 8, 9, 10], 'activation': ["logistic", "tanh", "relu"], 'solver': ["adam"]}
clf_5 = GridSearchCV(clf_5, param_grid, cv=5, verbose=1)

#Train the model:
clf_5.fit(X_train, y_train)

#Print best parameters:
print("Best parameters: ",clf_5.best_params_,"\n")

#Calculate confusion matrix (class 1 is treated as the positive class):
pred_y = clf_5.predict(X_test)
y_pred = (pred_y > 0.5)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)

In [None]:
#6. Support Vector Machine

#Initialize the model:
clf_6 = SVC()
param_grid = {'C': np.arange(1, 10), 'kernel': ["linear", "poly", "rbf", "sigmoid"], 'degree': np.arange(1, 10), 'gamma': ["scale", "auto"]}
clf_6 = GridSearchCV(clf_6, param_grid, cv=5, verbose=1)

#Train the model:
clf_6.fit(X_train, y_train)

#Print best parameters:
print("Best parameters: ",clf_6.best_params_,"\n")

#Calculate confusion matrix (class 1 is treated as the positive class):
pred_y = clf_6.predict(X_test)
y_pred = (pred_y > 0.5)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)

In [None]:
#7. Linear Support Vector Machine

#Initialize the model:
clf_7 = LinearSVC()
#NOTE(review): penalty="l1" is presumably only supported by LinearSVC together with dual=False;
#if so, those grid candidates fail to fit and are scored as NaN - confirm against the installed version.
param_grid = {'penalty': ["l1", "l2"], 'C': np.arange(1, 10)}
clf_7 = GridSearchCV(clf_7, param_grid, cv=5, verbose=1)

#Train the model:
clf_7.fit(X_train, y_train)

#Print best parameters:
print("Best parameters: ",clf_7.best_params_,"\n")

#Calculate confusion matrix (class 1 is treated as the positive class):
pred_y = clf_7.predict(X_test)
y_pred = (pred_y > 0.5)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)

In [None]:
#8. Stochastic Gradient Descent Linear Support Vector Machine

#Initialize the model:
clf_8 = SGDClassifier(learning_rate="invscaling", eta0=0.1, shuffle=True, n_jobs=-1)
param_grid = {'loss': ["hinge", "log", "modified_huber", "squared_hinge", "perceptron", "squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"], 'penalty': ["l1", "l2"], 'alpha': [0.00001, 0.0001, 0.001, 0.1]}
clf_8 = GridSearchCV(clf_8, param_grid, cv=5, verbose=1)

#Train the model:
clf_8.fit(X_train, y_train)

#Print best parameters:
print("Best parameters: ",clf_8.best_params_,"\n")

#Calculate confusion matrix (class 1 is treated as the positive class):
pred_y = clf_8.predict(X_test)
y_pred = (pred_y > 0.5)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)

In [None]:
#9. Random Forest Classifier

#Initialize the model:
clf_9 = RandomForestClassifier(max_depth=2, random_state=0, n_jobs=-1)
#max_features: "int" and "float" were placeholder strings, not valid values.
#"sqrt", "log2" and None (all features) are searched; "auto" was an alias of "sqrt" for classifiers.
param_grid = {'n_estimators': [50, 100, 200, 400], 'criterion': ["gini", "entropy"], 'max_features': ["sqrt", "log2", None]}
clf_9 = GridSearchCV(clf_9, param_grid, cv=5, verbose=1)

#Train the model:
clf_9.fit(X_train, y_train)

#Print best parameters:
print("Best parameters: ",clf_9.best_params_,"\n")

#Calculate confusion matrix (class 1 is treated as the positive class):
pred_y = clf_9.predict(X_test)
y_pred = (pred_y > 0.5)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)

In [None]:
#10. K Nearest Neighbors Classifier

#Definition of DTW function (Source: https://gist.github.com/nikolasrieble)
def DTW(a, b):
    """Return the dynamic-time-warping distance between two 1-D sequences.

    The local cost of aligning a[i] with b[j] is |a[i] - b[j]|. The result is
    the minimum accumulated cost over all monotone alignments that start at
    the first elements and end at the last elements of both sequences.

    Parameters
    ----------
    a, b : array-like of numbers
        The two sequences; they may differ in length. Inputs are flattened.

    Returns
    -------
    numpy.float64
        Accumulated cost of the best alignment. 0.0 if both sequences are
        empty, inf if exactly one of them is empty.
    """
    a = np.asarray(a, dtype=float).ravel()
    b = np.asarray(b, dtype=float).ravel()
    an = a.size
    bn = b.size

    #Pairwise |a[i] - b[j]|; for scalar points this equals the Euclidean distance.
    #Computed with numpy broadcasting because no `distance` module is imported in this notebook.
    pointwise_distance = np.abs(a[:, None] - b[None, :])

    #cumdist[i, j] = cheapest cost of aligning a[:i] with b[:j]; the inf border
    #forbids alignments that start outside the sequences.
    cumdist = np.full((an+1, bn+1), np.inf)
    cumdist[0,0] = 0

    for ai in range(an):
        for bi in range(bn):
            #Cheapest predecessor: step in a only, step in b only, or step in both
            minimum_cost = min(cumdist[ai, bi+1],
                               cumdist[ai+1, bi],
                               cumdist[ai, bi])
            cumdist[ai+1, bi+1] = pointwise_distance[ai,bi] + minimum_cost

    return cumdist[an, bn]

#Initialize the model:
clf_10 = KNeighborsClassifier(n_jobs=-1)
param_grid = {'n_neighbors': np.arange(1, 6), 'weights': ["uniform", "distance"], 'leaf_size': [15,30,45], 'metric': [DTW, "minkowski"], 'p': np.arange(1,3)}
clf_10 = GridSearchCV(clf_10, param_grid, cv=5, verbose=1)

#Train the model:
clf_10.fit(X_train, y_train)

#Print best parameters:
print("Best parameters: ",clf_10.best_params_,"\n")

#Calculate confusion matrix (class 1 is treated as the positive class):
pred_y = clf_10.predict(X_test)
y_pred = (pred_y > 0.5)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)

In [None]:
#11. Long-Short-Time-Memory Network (Baseline)

#Define neural network model creation function
def create_model(layers, activation):
    """Build and compile the bidirectional LSTM network used by KerasClassifier.

    Parameters
    ----------
    layers : sequence of int
        One entry per layer. The FIRST entry only triggers the Embedding layer
        (its value is not used; the embedding is sized from the global X_train).
        Every further entry adds a Bidirectional LSTM layer with that many units.
    activation : str
        Activation function of the LSTM layers. It was previously ignored in
        favour of a hard-coded 'relu', so searching over it had no effect.

    Returns
    -------
    tf.keras.Model
        Sequential model ending in a single sigmoid unit, compiled with the
        Adam optimizer, binary cross-entropy loss and the accuracy metric.
    """
    model = tf.keras.models.Sequential()
    for i, nodes in enumerate(layers):
        if i==0:
            #Both embedding dimensions are taken from the number of training samples
            model.add(tf.keras.layers.Embedding(input_dim=X_train.shape[0], output_dim=X_train.shape[0]))
        else:
            #return_sequences=True keeps the time axis so that LSTM layers can be stacked
            model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(nodes, activation=activation, return_sequences=True)))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

#Initialize model
clf_11 = KerasClassifier(build_fn=create_model, verbose=1)      
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50, min_lr=0.001) #https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ReduceLROnPlateau

#Define parameter spaces
layers = [(100, 100, 100, 100)]
activations = ['relu']

#Create parameter grid:
param_grid = dict(layers=layers, activation=activations, batch_size=[32], epochs=[50])
clf_11 = GridSearchCV(estimator=clf_11, param_grid=param_grid, cv=5)
#hist is what GridSearchCV.fit returns; best_params_ is read from it below
hist = clf_11.fit(X_train, y_train, validation_data=(X_test, y_test), verbose=1, callbacks=[reduce_lr])

#Print best parameters:
print("Best parameters: ",hist.best_params_)

#Calculate confusion matrix

#Predict y:
pred_y = clf_11.predict(X_test)
#print('pred_y Shape:', pred_y.shape) #Print Shape of pred_y (for debugging)

#Reshape pred_y to 2D:
pred_y = pred_y[:,0:1]
#print('pred_y Shape:', pred_y.shape) #Print Shape of pred_y (for debugging)
#-1 lets numpy derive the number of rows, so the cell no longer assumes exactly 30 test samples
pred_y = np.reshape(pred_y,(-1,1))
#print('pred_y Shape:', pred_y.shape) #Print Shape of pred_y (for debugging)

y_pred = (pred_y > 0.5)

#Calculate confusion matrix (class 1 is treated as the positive class):
cm = confusion_matrix(y_test, y_pred)
#For binary labels scikit-learn flattens the matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = cm.ravel()
print("True Positives (TP) :  ",tp,"\nFalse Positives (FP):   ",fp,"\nFalse Negatives (FN):  ",fn,"\nTrue Negatives (TN) :   ",tn,"\n")

#Show Confusion Matrix:
#NOTE: a bare expression in the middle of a cell is not rendered; only a cell's last expression is.
url = 'https://glassboxmedicine.files.wordpress.com/2019/02/confusion-matrix.png'
IPython.display.Image(url, width = 800)

#Calculate scores:
score = accuracy_score(y_test, y_pred) #Fraction of test samples whose predicted class equals the true class
print("Accuracy Score:        ",score*100,"%\n")
precision = tp/(tp+fp)
print("Precision:             ",precision*100,"%")
recall = tp/(tp+fn)
print("Recall:               ",recall*100,"%")
f_one = 2*(precision*recall)/(precision+recall)
print("F1-Score:              ",f_one*100,"%")

#Assignment of project names to classifications
tp_Projekte=[]
fp_Projekte=[]
fn_Projekte=[]
tn_Projekte=[]

#pred_y[j]+y_test[j] is 2 when both are 1, 0 when both are 0 and 1 when they disagree
for j in range(0,len(pred_y)):
    if pred_y[j]+y_test[j]==2:
        #True Positive: predicted 1, actual 1
        tp_Projekte.append(Projekte_test.iloc[j])
    elif pred_y[j]+y_test[j]==0:
        #True Negative: predicted 0, actual 0
        tn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==0:
        #False Negative: predicted 0, actual 1
        fn_Projekte.append(Projekte_test.iloc[j])
    elif  pred_y[j]+y_test[j]==1 and pred_y[j]==1:
        #False Positive: predicted 1, actual 0
        fp_Projekte.append(Projekte_test.iloc[j])
        
print("\n\nThe following projects are classified as True Positive: \n",tp_Projekte)
print("\n\nThe following projects are classified as False Positive: \n",fp_Projekte)
print("\n\nThe following projects are classified as False Negative: \n",fn_Projekte)
print("\n\nThe following projects are classified as True Negative: \n",tn_Projekte)