# Applying our workflow to two datasets

In [43]:
# Import useful functions and libraries
from functions import * #clean_file, normalize_data, train_test, trainSVM,getDataLabels,confusionMatrix,testSVM, validateModel, trainLogisticRegression, crossValidationProcedure, testLogReg
import numpy as np
%matplotlib inline

In [44]:
# Relative paths of the two raw dataset files used throughout the notebook
KIDNEY = "./kidney_disease.csv"
BANKNOTE = "./data_banknote_authentication.txt"

## 1. Load and clean data
* Load and clean data
* Center and normalize data

In [45]:
# Load each raw file and run the shared cleaning step on it (banknote first, then kidney).
# Per the log this cell prints, clean_file replaces missing values and normalizes the table
# (the French messages read "all missing values have been replaced" / "the table is normalized").
banknote = clean_file(BANKNOTE)
kidney = clean_file(KIDNEY)

---START CLEANING :  ./data_banknote_authentication.txt ---
Toutes les valeurs manquantes ont été remplacées
La table est normalisée
---END CLEANING : ./data_banknote_authentication.txt ---

---START CLEANING :  ./kidney_disease.csv ---
Toutes les valeurs manquantes ont été remplacées
La table est normalisée
---END CLEANING : ./kidney_disease.csv ---



In [46]:
# Keep only the mean and standard-deviation rows of the banknote summary statistics.
banknote.describe().loc[["mean", "std"]]

Unnamed: 0,0,1,2,3,4
mean,-2.35801e-16,-9.880338e-17,-3.130392e-16,-3.547535e-16,0.444606
std,1.0,1.0,1.0,1.0,0.497103


In [47]:
# Keep only the mean and standard-deviation rows of the kidney summary statistics.
kidney.describe().loc[["mean", "std"]]

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
mean,199.5,-6.211698e-16,7.187306e-16,1.446767e-12,2.770006e-16,1.044165e-15,0.1175,0.19,0.105,0.055,...,15.7225,25.9,14.69,0.6325,0.6575,0.085,0.205,0.19,0.15,0.375
std,115.614301,1.0,1.0,1.0,1.0,1.0,0.322418,0.392792,0.306937,0.228266,...,10.905863,23.970533,16.030325,0.482728,0.47514,0.279231,0.404207,0.392792,0.357519,0.484729


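We can see that the continuous numerical features are centered (mean ≈ 0) and scaled to unit variance (std = 1). The other columns (`id`, the categorical features and the class label) are not standardized. Note that `pcv`, `wc` and `rc` were not standardized either (their means are about 15.7, 25.9 and 14.7), which suggests they were not treated as numeric columns during cleaning and is worth checking.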

## 2. Split data into train and test

In [48]:
# Separate each cleaned table into the two objects returned by getDataLabels (data, labels).
# NOTE(review): the kidney table still carries an `id` column (mean 199.5 in its summary);
# confirm that getDataLabels drops it so it is not used as a feature.
KIDNEY_DATA, KIDNEY_LABEL = getDataLabels(kidney)
BANKNOTE_DATA, BANKNOTE_LABEL = getDataLabels(banknote)

In [49]:
# Split each dataset into train and test partitions (features and labels, four objects each).
# NOTE(review): presumably .3 is the held-out test fraction and False a shuffle/verbosity-style flag —
# confirm against train_test in functions.py. No random seed is set in this notebook, so the
# split may differ between runs unless train_test fixes one internally.
K_X_train,K_X_test,K_y_train,K_y_test = train_test(KIDNEY_DATA,KIDNEY_LABEL,.3,False)
B_X_train,B_X_test,B_y_train,B_y_test = train_test(BANKNOTE_DATA,BANKNOTE_LABEL,.3,False)

## 3. Train models

In [50]:
# Train one SVM per dataset on its training partition.
# These initial models are rebound in section 4 to whatever crossValidationProcedure returns.
K_SVM = trainSVM(K_X_train,K_y_train)
B_SVM = trainSVM(B_X_train,B_y_train)


In [51]:

# Train one logistic-regression model per dataset on the same training partitions as the SVMs.
K_LOGISTIC = trainLogisticRegression(K_X_train,K_y_train)
B_LOGISTIC = trainLogisticRegression(B_X_train,B_y_train)

In [52]:
# Train a decision forest per dataset on its training partition.
# NOTE(review): the third argument (100) is presumably the number of estimators — confirm in functions.py.
K_DF = trainDecisionForest(K_X_train,K_y_train, 100)
B_DF = trainDecisionForest(B_X_train,B_y_train, 100)

# Train an AdaBoost model per dataset, with the same training partitions and the same third argument.
K_AB = trainAdaBoost(K_X_train,K_y_train, 100)
B_AB = trainAdaBoost(B_X_train,B_y_train, 100)

## 4. Cross Validation

In [53]:
# Pass each SVM, its training partition, the parameter name "kernel" and four candidate kernels to
# crossValidationProcedure, then rebind K_SVM / B_SVM to what it returns.
# In the recorded run the printed result was "linear" for kidney and "rbf" for banknote.
# NOTE(review): this overwrites the models from section 3, so the cell is only valid when run after it;
# presumably the returned model is already fitted with the best kernel — confirm in functions.py.
K_SVM = crossValidationProcedure(K_SVM,K_X_train,K_y_train,"kernel",["linear","poly","rbf","sigmoid"])
B_SVM = crossValidationProcedure(B_SVM,B_X_train,B_y_train,"kernel",["linear","poly","rbf","sigmoid"])

best kernel = linear
best kernel = rbf


## 5. Test models

In [54]:
# Predict on the held-out test partition with every trained model, one pair of calls per model family.
K_SVM_pred = testSVM(K_SVM,K_X_test)
B_SVM_pred = testSVM(B_SVM,B_X_test)

K_LOGISTIC_pred = testLogReg(K_LOGISTIC,K_X_test)
B_LOGISTIC_pred = testLogReg(B_LOGISTIC,B_X_test)

K_DF_pred = testDecisionForest(K_DF, K_X_test)
B_DF_pred = testDecisionForest(B_DF, B_X_test)

K_AB_pred = testAdaBoost(K_AB, K_X_test)
B_AB_pred = testAdaBoost(B_AB, B_X_test)

# K-means receives the training data and labels directly; no separately trained model is passed in.
# NOTE(review): the recorded run of this cell ended with "NameError: name 'KMeans' is not defined",
# presumably raised inside testKmeans because functions.py never imports KMeans
# (e.g. `from sklearn.cluster import KMeans`). If so, the import must be added in functions.py —
# importing it in the notebook would not help, since the helper resolves names in its own module.
# Until that is fixed, K_KM_pred / B_KM_pred stay undefined and the K-Means cells in section 6 fail.
K_KM_pred = testKmeans(K_X_train, K_y_train, K_X_test)
B_KM_pred = testKmeans(B_X_train, B_y_train, B_X_test)

NameError: name 'KMeans' is not defined

## 6. Validate models

In [None]:
# Kidney — SVM
# confusionMatrix gets the true and predicted test labels plus the plot title;
# validateModel gets the same label pair and stays last so its result is the cell output.
# The title spelling is "Kidney" (not "Kydney") so it matches the other kidney plots.
confusionMatrix(K_y_test, K_SVM_pred, title="Confusion Matrix\nKidney-SVM")
validateModel(K_y_test, K_SVM_pred)

In [None]:
# Kidney — logistic regression
# True vs. predicted test labels go to confusionMatrix (with a title), then to validateModel.
confusionMatrix(
    K_y_test,
    K_LOGISTIC_pred,
    title="Confusion Matrix\nKidney-Logistic Regression",
)
validateModel(K_y_test, K_LOGISTIC_pred)

In [None]:
# Kidney — decision forest
# True vs. predicted test labels go to confusionMatrix (with a title), then to validateModel.
confusionMatrix(
    K_y_test,
    K_DF_pred,
    title="Confusion Matrix\nKidney-Decision Forest",
)
validateModel(K_y_test, K_DF_pred)

In [None]:
# Kidney — AdaBoost
# True vs. predicted test labels go to confusionMatrix (with a title), then to validateModel.
confusionMatrix(
    K_y_test,
    K_AB_pred,
    title="Confusion Matrix\nKidney-Ada Boost",
)
validateModel(K_y_test, K_AB_pred)

In [None]:
# Kidney K-Means
# TODO(review): open question from the authors — can confusionMatrix also be applied to the K-means predictions?
# If it is enabled, note that the disabled call below passes K_LOGISTIC_pred; it would need K_KM_pred.
# confusionMatrix(K_y_test,K_LOGISTIC_pred,title="Confusion Matrix")
validateModel(K_y_test,K_KM_pred)

In [None]:
# Banknote — SVM
# True vs. predicted test labels go to confusionMatrix (with a title), then to validateModel.
confusionMatrix(
    B_y_test,
    B_SVM_pred,
    title="Confusion Matrix\nBanknote-SVM",
)
validateModel(B_y_test, B_SVM_pred)

In [None]:
# Banknote — logistic regression
# True vs. predicted test labels go to confusionMatrix (with a title), then to validateModel.
confusionMatrix(
    B_y_test,
    B_LOGISTIC_pred,
    title="Confusion Matrix\nBanknote-Logistic Regression",
)
validateModel(B_y_test, B_LOGISTIC_pred)

In [None]:
# Banknote — decision forest
# True vs. predicted test labels go to confusionMatrix (with a title), then to validateModel.
confusionMatrix(
    B_y_test,
    B_DF_pred,
    title="Confusion Matrix\nBanknote-Decision Forest",
)
validateModel(B_y_test, B_DF_pred)

In [None]:
# Banknote — AdaBoost
# True vs. predicted test labels go to confusionMatrix (with a title), then to validateModel.
confusionMatrix(
    B_y_test,
    B_AB_pred,
    title="Confusion Matrix\nBanknote-Ada Boost",
)
validateModel(B_y_test, B_AB_pred)

In [None]:
# Banknote — K-means
# Only validateModel is called here; no confusion matrix is drawn for the K-means predictions.
validateModel(B_y_test, B_KM_pred)