In [1]:
import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import GaussianNB

from sklearn import metrics

from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier



#Load the dataset

df = pd.read_csv('liver_disease_.csv')

print(df.shape)



df= df.dropna()

print(df.shape)



df["Gender"] = df["Gender"].astype('category').cat.codes

df["Dataset"] = np.where(df['Dataset'].str.contains('Yes'), 1, 0)



#Let's create numpy arrays for features and target

X = df.drop('Dataset',axis=1).values

y = df['Dataset'].values



#Split the dataset into train and test with stratification

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42, stratify=y)



# a) Model1 = Logistic Regression, Model2 = Naïve Bayes and Model3 = Polynomial SVM

model1=LogisticRegression()

model1.fit(X_train,y_train)

mod1_y_pred=model1.predict(X_test)

print("LR:",accuracy_score(y_test, mod1_y_pred))



model2= GaussianNB()

model2.fit(X_train,y_train)

mod2_y_pred=model1.predict(X_test)

print("NB:",accuracy_score(y_test, mod2_y_pred))



model3 = SVC(kernel='poly', degree=8)

model3.fit(X_train, y_train)

mod3_y_pred=model1.predict(X_test)

print("Poly kernel SVM Score : ", accuracy_score(y_test, mod3_y_pred))



# a) Model1 = Logistic Regression, Model2 = Naïve Bayes and Model3 = Polynomial SVM with AdaBoostClassifier

def adaReport(classifierName, classifier):

    ada = AdaBoostClassifier(classifier, algorithm='SAMME')

    ada.fit(X_train, y_train)

    y_pred=ada.predict(X_test)

    print("AdaBoost with ",classifierName, ", classifier score:",accuracy_score(y_test, y_pred))

    print("Confusion Matrix of ", classifierName, " with AdaBoost:")

    print(confusion_matrix(y_test,y_pred))

    print(classification_report(y_test,y_pred))



adaReport("Logistic Regression", model1)

adaReport("Naïve Bayes", model2)

adaReport("Polynomial SVM", model3)



# b)	Implement Random Forest

rdf=RandomForestClassifier()

rdf.fit(X_train,y_train)

y_pred=rdf.predict(X_test)

print("RDF classifier Score:",accuracy_score(y_test, y_pred))



# b)	Implement Random Forest (with Grid Search CV)

rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True)

param_grid = {

    'n_estimators': [50],

    'max_features': ['auto', 'sqrt', 'log2']

}



CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)

CV_rfc.fit(X_train,y_train)

y_pred=CV_rfc.predict(X_test)

print("RDF with GridSearchCV classifier Score:",accuracy_score(y_test, y_pred))



# c)	Implement XG Boost



model = XGBClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("XGBoost Classifier Score:",accuracy_score(y_test, y_pred))



# d)	Compare accuracy measures (Precision/Recall/F1/CM)

def report(classifierName, classifier ):

    y_pred_ = classifier.predict(X_test)

    print (classifierName, " Score : ",classifier.score(X_test,y_test))

    print("Confusion Matrix of ", classifierName, " :" )

    print(confusion_matrix(y_test,y_pred_))

    print(classification_report(y_test,y_pred_))



report("Logistic Regression without AdaBoost", model1)

report("Naïve Bayes without AdaBoost", model2)

report("Polynomial SVM without AdaBoost", model3)

report("RDF without GridSearchCV", rdf)

report("RDF with GridSearchCV", CV_rfc)

report("XGBoost", model)







(583, 11)
(579, 11)




LR: 0.6896551724137931
NB: 0.6896551724137931
Poly kernel SVM Score :  0.6896551724137931
AdaBoost with  Logistic Regression , classifier score: 0.7183908045977011
Confusion Matrix of  Logistic Regression  with AdaBoost:
[[  2  48]
 [  1 123]]
              precision    recall  f1-score   support

           0       0.67      0.04      0.08        50
           1       0.72      0.99      0.83       124

   micro avg       0.72      0.72      0.72       174
   macro avg       0.69      0.52      0.45       174
weighted avg       0.70      0.72      0.62       174

AdaBoost with  Naïve Bayes , classifier score: 0.5862068965517241
Confusion Matrix of  Naïve Bayes  with AdaBoost:
[[49  1]
 [71 53]]
              precision    recall  f1-score   support

           0       0.41      0.98      0.58        50
           1       0.98      0.43      0.60       124

   micro avg       0.59      0.59      0.59       174
   macro avg       0.69      0.70      0.59       174
weighted avg       0.82



RDF classifier Score: 0.6839080459770115
RDF with GridSearchCV classifier Score: 0.6839080459770115
XGBoost Classifier Score: 0.6839080459770115
Logistic Regression without AdaBoost  Score :  0.6896551724137931
Confusion Matrix of  Logistic Regression without AdaBoost  :
[[  5  45]
 [  9 115]]
              precision    recall  f1-score   support

           0       0.36      0.10      0.16        50
           1       0.72      0.93      0.81       124

   micro avg       0.69      0.69      0.69       174
   macro avg       0.54      0.51      0.48       174
weighted avg       0.61      0.69      0.62       174

Naïve Bayes without AdaBoost  Score :  0.5862068965517241
Confusion Matrix of  Naïve Bayes without AdaBoost  :
[[49  1]
 [71 53]]
              precision    recall  f1-score   support

           0       0.41      0.98      0.58        50
           1       0.98      0.43      0.60       124

   micro avg       0.59      0.59      0.59       174
   macro avg       0.69      0