# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
# Shared report file, opened in append mode so output from earlier runs is kept.
# printPerformance and printStats write to it; the script closes it at the very end.
performanceFile = open('drugs-performance.txt', "a")

def _safeDivide(numerator, denominator):
    """Divide element-wise, yielding 0.0 wherever the denominator is zero."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )


def printPerformance(actual, predicted, classifier, display, bestParams=None, params = None):
    """Score a set of predictions and optionally append a report to the performance file.

    Args:
        actual: True class labels.
        predicted: Labels predicted by the classifier, aligned with ``actual``.
        classifier: Title written in the report header.
        display: When truthy, the report is written to ``performanceFile``;
            otherwise only the summary metrics are computed and returned.
        bestParams: Best hyper-parameters found by a grid search (written only
            when ``params`` is truthy).
        params: Hyper-parameter grid that was searched, if any.

    Returns:
        ``[accuracy, macroF1, weightedF1]`` as computed by ``sklearn.metrics``.
    """
    labels = ['drugA', 'drugB', 'drugC', 'drugX', 'drugY']

    confusionMatrix = metrics.confusion_matrix(actual, predicted, labels=labels)
    accuracy = metrics.accuracy_score(actual, predicted)
    macroF1 = metrics.f1_score(actual, predicted, labels=labels, average="macro")
    weightedF1 = metrics.f1_score(actual, predicted, labels=labels, average="weighted")

    if display:
        # Per-class counts read off the confusion matrix (rows = actual,
        # columns = predicted).
        TP = np.diagonal(confusionMatrix)
        FP = np.sum(confusionMatrix, axis=0) - TP
        FN = np.sum(confusionMatrix, axis=1) - TP

        # A class that is never predicted (or never present) gives 0/0; report
        # it as 0.0 instead of nan, matching the value scikit-learn's f1_score
        # substitutes by default in the averages above.
        rawPrecision = _safeDivide(TP, TP + FP)
        rawRecall = _safeDivide(TP, TP + FN)
        # F1 is derived from the unrounded values so rounding error is not compounded.
        rawF1 = _safeDivide(2 * rawPrecision * rawRecall, rawPrecision + rawRecall)

        precision = np.round(rawPrecision, 2)
        recall = np.round(rawRecall, 2)
        F1 = np.round(rawF1, 2)

        separator = '----------------------------------------------\n'
        performanceFile.write(separator + classifier + '\n' + separator + '\n')

        if params:
            performanceFile.write(
                'Hyperparameters options:' + str(params) + '\n' +
                'Best hyperparameters found by the gridsearch:' + str(bestParams) + '\n\n'
            )

        # 'drugA' is reported as 'DrugA', and so on for each label.
        perClassLines = [
            label[0].upper() + label[1:]
            + ' => Precision=' + str(p)
            + ', Recall=' + str(r)
            + ', F1=' + str(f)
            for label, p, r, f in zip(labels, precision, recall, F1)
        ]

        performanceFile.writelines([
            'ConfusionMatrix:\n\n',
            str(confusionMatrix) + '\n\n',
            '\n'.join(perClassLines) + '\n\n',
            'Accuracy=' + str(accuracy) + '\n',
            'Macro-average F1=' + str(macroF1) + '\n',
            'Weighted-average F1=' + str(weightedF1),
            '\n\n\n'
        ])

    return [accuracy, macroF1, weightedF1]

def printStats(model,avg,avg2,avg3,std,std2,std3):
    """Append the mean / standard-deviation summary of one model to the report file.

    ``avg``/``std`` are reported as accuracy, ``avg2``/``std2`` as macro-average
    F1 and ``avg3``/``std3`` as weighted-average F1, under a header showing
    ``model``.
    """
    separator = '----------------------------------------------\n'
    report = (
        separator
        + model + '\n'
        + separator
        + 'Average Accuracy = {!s}\t Standard deviation = {!s}\n'.format(avg, std)
        + 'Average Macro-average F1 = {!s}\t Standard deviation Macro-average F1 = {!s}\n'.format(avg2, std2)
        + 'Average Weighted-average F1 = {!s}\t Standard deviation Weighted-average F1 = {!s}\n\n\n'.format(avg3, std3)
    )
    performanceFile.write(report)

# a) Gaussian Naive Bayes run
def gaussianNB(X_train, X_test, y_train, y_test, display):
    """Fit a GaussianNB on the training split and score it on the test split.

    Returns whatever printPerformance returns for the test predictions;
    ``display`` is forwarded to it unchanged.
    """
    model = GaussianNB().fit(X_train, y_train)
    predictions = model.predict(X_test)
    return printPerformance(y_test, predictions, 'a) Gaussian Naive Bayes', display)

# b) Base Decision Tree    
def baseDecisionTree(X_train, X_test, y_train, y_test, display):
    """Fit a DecisionTreeClassifier with default settings and score it on the test split.

    Returns whatever printPerformance returns for the test predictions;
    ``display`` is forwarded to it unchanged.
    """
    model = DecisionTreeClassifier().fit(X_train, y_train)
    predictions = model.predict(X_test)
    return printPerformance(y_test, predictions, 'b) Base Decision Tree', display)

# c) Top Decision Tree
def topDecisionTree(X_train, X_test, y_train, y_test, display):
    """Grid-search a DecisionTreeClassifier and score the best one on the test split.

    The grid covers ``min_samples_split``, ``criterion`` and ``max_depth``.
    Both the grid and the winning combination are passed on to
    printPerformance, whose return value is returned.
    """
    params = {
        'min_samples_split': [2, 3, 4],
        'criterion': ['gini', 'entropy'],
        'max_depth': [20, 90],
    }
    search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=params)
    search.fit(X_train, y_train)
    predictions = search.predict(X_test)
    return printPerformance(
        y_test, predictions, 'c) Top Decision Tree', display, search.best_params_, params
    )

# d) Perceptron
def perceptron(X_train, X_test, y_train, y_test, display):
    """Fit a Perceptron with default settings and score it on the test split.

    Returns whatever printPerformance returns for the test predictions;
    ``display`` is forwarded to it unchanged.
    """
    model = Perceptron().fit(X_train, y_train)
    predictions = model.predict(X_test)
    return printPerformance(y_test, predictions, 'd) Perceptron', display)

# e) Multi-Layered Perceptron
def mLPerceptron(X_train, X_test, y_train, y_test, display):
    """Fit the baseline MLPClassifier and score it on the test split.

    The network uses one hidden layer of 100 units, logistic activation and
    the SGD solver. Returns whatever printPerformance returns.
    """
    settings = {
        'hidden_layer_sizes': (100,),
        'activation': 'logistic',
        'solver': 'sgd',
    }
    model = MLPClassifier(**settings).fit(X_train, y_train)
    predictions = model.predict(X_test)
    return printPerformance(y_test, predictions, 'e) Multi-Layered Perceptron', display)

# f) Top Multi-Layered Perceptron
def topMLPerceptron(X_train, X_test, y_train, y_test, display):
    """Grid-search an MLPClassifier and score the best one on the test split.

    The grid covers two hidden-layer layouts, four activation functions and
    two solvers. Both the grid and the winning combination are passed on to
    printPerformance, whose return value is returned.
    """
    params = {
        'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
        'activation': ['tanh', 'identity', 'logistic', 'relu'],
        'solver': ['sgd', 'adam'],
    }
    search = GridSearchCV(estimator=MLPClassifier(), param_grid=params)
    search.fit(X_train, y_train)
    predictions = search.predict(X_test)
    return printPerformance(
        y_test, predictions, 'f) Top Multi-Layered Perceptron', display, search.best_params_, params
    )

# Step 3) Load the dataset and save a bar chart of how often each drug occurs
df = pd.read_csv('./drug200.csv')
axes = df['Drug'].value_counts().plot.bar(title='Drug distribution')
axes.set(xlabel="Drug", ylabel="Nb Instances")
plt.savefig('drug-distribution.pdf')

# Step 4) Replace each categorical feature column by integer codes and
# separate the target column from the features
le = LabelEncoder()
for categoricalColumn in ('Sex', 'BP', 'Cholesterol'):
    df[categoricalColumn] = le.fit_transform(df[categoricalColumn])
y = df.pop('Drug')

# Step 5) Split features and target into train and test sets (default proportions)
X_train, X_test, y_train, y_test = train_test_split(df, y)

# Step 6) and Step 7) Run every classifier once, writing its full report
for runClassifier in (
    gaussianNB,
    baseDecisionTree,
    topDecisionTree,
    perceptron,
    mLPerceptron,
    topMLPerceptron,
):
    runClassifier(X_train, X_test, y_train, y_test, True)

# Step 8) Re-run every classifier 10 times on the same split and report the
# average and standard deviation of accuracy, macro F1 and weighted F1.
# Each table holds one row per run: [accuracy, macro F1, weighted F1].
gaussianNBList = np.empty((10, 3))
baseDecisionTreeList = np.empty((10, 3))
topDecisionTreeList = np.empty((10, 3))
perceptronList = np.empty((10, 3))
mLPerceptronList = np.empty((10, 3))
topMLPerceptronList = np.empty((10, 3))

# (report title, experiment function, score table) for each model. Driving
# both the runs and the summary from this single list keeps every model's
# statistics tied to its own score table.
experiments = [
    ("Gaussian Naive Bayes", gaussianNB, gaussianNBList),
    ("Base Decision Tree", baseDecisionTree, baseDecisionTreeList),
    ("Top Decision Tree", topDecisionTree, topDecisionTreeList),
    ("Perceptron", perceptron, perceptronList),
    ("Multi-Layered Perceptron", mLPerceptron, mLPerceptronList),
    ("Top Multi-Layered Perceptron", topMLPerceptron, topMLPerceptronList),
]

try:
    for i in range(10):
        for title, runExperiment, scores in experiments:
            scores[i] = runExperiment(X_train, X_test, y_train, y_test, False)

    performanceFile.writelines([
        '\n\n\n*********************************************\n',
                'Averages and Standard deviations \n',
        '*********************************************\n\n',
    ])

    for title, runExperiment, scores in experiments:
        averages = [np.average(scores[:, column]) for column in range(3)]
        deviations = [np.std(scores[:, column]) for column in range(3)]
        printStats(title, *(averages + deviations))
finally:
    # Close the report even if one of the runs fails, so whatever was
    # written so far is flushed to disk.
    performanceFile.close()

