In [1]:
%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw hepatitis table from the CSV that sits next to this notebook.
# (Inspect it with hepatitis_data.head() rather than printing the whole frame.)
hepatitis_data = pd.read_csv(filepath_or_buffer="dataset_55_hepatitis.csv")

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.metrics import precision_recall_curve, confusion_matrix

from sklearn.preprocessing import Imputer

In [4]:
# Numeric code for every categorical token that appears in the raw CSV, so the
# whole frame can be cast to float afterwards. '?' is turned into NaN.
replacements = {'no': 0,
                'yes': 1,
                'DIE': 0,
                'LIVE': 1,
                '?': np.nan,
                'female': 0,
                'male': 1}

# Columns that receive a natural-log transform (listed once instead of twice).
log_columns = ['ALBUMIN', 'ALK_PHOSPHATE', 'BILIRUBIN', 'SGOT']

# Rebind the result instead of mutating with inplace=True: same outcome, but
# the encode-then-cast step reads as one expression.
hepatitis_data = hepatitis_data.replace(replacements).astype(float)

# np.log is vectorised over the selected columns (no per-element applymap
# call); NaN entries stay NaN.
# NOTE(review): this cell still transforms `hepatitis_data` in place of the
# raw frame, so running it twice applies the log twice - re-run the load cell
# first if this cell has to be executed again.
hepatitis_data[log_columns] = np.log(hepatitis_data[log_columns])


In [5]:
# Count the NaN entries in each column (column-wise sum of the null mask).
hepatitis_data.isnull().sum(axis=0)

AGE                 0
SEX                 0
STEROID             1
ANTIVIRALS          0
FATIGUE             1
MALAISE             1
ANOREXIA            1
LIVER_BIG          10
LIVER_FIRM         11
SPLEEN_PALPABLE     5
SPIDERS             5
ASCITES             5
VARICES             5
BILIRUBIN           6
ALK_PHOSPHATE      29
SGOT                4
ALBUMIN            16
PROTIME            67
HISTOLOGY           0
Class               0
dtype: int64

In [6]:
# print(hepatitis_data)

In [7]:
# Separate the target from the predictors with a boolean column mask.
# Both results stay DataFrames: `y` holds only the 'Class' column and `x`
# holds every other column.
y = hepatitis_data.loc[:, hepatitis_data.columns == 'Class']
x = hepatitis_data.loc[:, hepatitis_data.columns != 'Class']

In [8]:
# print(x)
# print(y)

In [9]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is
# the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Flatten the single-column target frames into 1-D arrays for the estimators.
# np.ravel accepts a DataFrame as well as an ndarray, so this cell can be
# re-run safely; the previous `.values.ravel()` raised AttributeError on a
# second run because Y_train had already become an ndarray.
Y_train = np.ravel(Y_train)
Y_test = np.ravel(Y_test)
print(Y_train)
print(Y_test)

[1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1.
 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0.
 0. 1. 1. 1.]
[1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 1. 0. 1. 1.]


In [11]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; SimpleImputer is its replacement and always imputes column-wise
# (what axis=0 selected on the old class).
from sklearn.impute import SimpleImputer

# Replace each NaN with the most frequent value of its column.
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

# Fit on the training split only, then apply the same fitted imputer to both
# splits so the test rows do not influence the fill values.
imp = imp.fit(X_train)

X_train_imp = imp.transform(X_train)
X_test_imp = imp.transform(X_test)




In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [13]:
# Learning rates to compare for the gradient boosting classifier.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

# Every print below passes exactly one string, so the output is the same on
# Python 2 and Python 3. The previous `print("...", value)` and `print()`
# calls rendered as tuples and "()" under Python 2.
# NOTE(review): the "validation" score is computed on X_test_imp / Y_test,
# i.e. the held-out test split, not on a separate validation set.
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=20,
                                    learning_rate=learning_rate,
                                    max_features=2,
                                    max_depth=2,
                                    random_state=0)
    gb.fit(X_train_imp, Y_train)
    print("Learning rate: {}".format(learning_rate))
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train_imp, Y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_test_imp, Y_test)))
    print("")

('Learning rate: ', 0.05)
Accuracy score (training): 0.839
Accuracy score (validation): 0.806
()
('Learning rate: ', 0.1)
Accuracy score (training): 0.919
Accuracy score (validation): 0.806
()
('Learning rate: ', 0.25)
Accuracy score (training): 0.944
Accuracy score (validation): 0.742
()
('Learning rate: ', 0.5)
Accuracy score (training): 0.968
Accuracy score (validation): 0.742
()
('Learning rate: ', 0.75)
Accuracy score (training): 1.000
Accuracy score (validation): 0.742
()
('Learning rate: ', 1)
Accuracy score (training): 1.000
Accuracy score (validation): 0.710
()


In [14]:
# Refit gradient boosting with learning_rate = 0.5 and score it on the test
# split.
gb = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5,
                                max_features=2, max_depth=2, random_state=0)
gb.fit(X_train_imp, Y_train)
predictions = gb.predict(X_test_imp)

# print("") emits a blank line on both Python 2 and Python 3; a bare print()
# printed "()" under Python 2.
print("Confusion Matrix:")
print(confusion_matrix(Y_test, predictions))
print("")
print("Classification Report")
print(classification_report(Y_test, predictions))

Confusion Matrix:
[[ 1  6]
 [ 2 22]]
()
Classification Report
              precision    recall  f1-score   support

         0.0       0.33      0.14      0.20         7
         1.0       0.79      0.92      0.85        24

   micro avg       0.74      0.74      0.74        31
   macro avg       0.56      0.53      0.52        31
weighted avg       0.68      0.74      0.70        31



In [15]:
# Continuous decision scores from the fitted gradient boosting model.
y_scores_gb = gb.decision_function(X_test_imp)

# ROC curve points (the thresholds are not needed) and the area under them.
fpr_gb, tpr_gb, _ = roc_curve(y_true=Y_test, y_score=y_scores_gb)
roc_auc_gb = auc(x=fpr_gb, y=tpr_gb)

print("Area under ROC curve = {:0.2f}".format(roc_auc_gb))

Area under ROC curve = 0.69


In [16]:
# Train a random forest on the imputed training split; the seed is fixed.
# fit() returns the estimator itself, so rebinding keeps the same object and
# the assignment suppresses the estimator repr in the cell output.
fit_random_forest = RandomForestClassifier(random_state=42)
fit_random_forest = fit_random_forest.fit(X_train_imp, Y_train)



In [17]:
# X_test_imp was already produced by the imputation cell with the same fitted
# imputer, so it is reused here instead of being recomputed.
y_predicted = fit_random_forest.predict(X_test_imp)

# Test-set accuracy as a percentage. A single formatted string prints the
# same text on Python 2 and Python 3 (the old two-argument print showed a
# tuple under Python 2).
accuracy = accuracy_score(Y_test, y_predicted)*100
print("{} %".format(round(accuracy, 2)))

(74.19, '%')


In [18]:
from sklearn.metrics import classification_report
# Call form of print: valid on both Python 2 and Python 3 (the statement form
# `print x` is a SyntaxError on Python 3).
print(classification_report(Y_test, y_predicted))

              precision    recall  f1-score   support

         0.0       0.40      0.29      0.33         7
         1.0       0.81      0.88      0.84        24

   micro avg       0.74      0.74      0.74        31
   macro avg       0.60      0.58      0.59        31
weighted avg       0.72      0.74      0.73        31



In [25]:
from sklearn import tree

# Fix the seed so the tree (and the accuracy below) is reproducible across
# runs, consistent with the seeded gradient boosting and random forest models.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train_imp, Y_train)

# NOTE: this overwrites the random forest predictions held in y_predicted.
y_predicted = clf.predict(X_test_imp)

# Test-set accuracy as a percentage; a single formatted string prints the same
# text on Python 2 and Python 3.
accuracy = accuracy_score(Y_test, y_predicted)*100
print("{} %".format(round(accuracy, 2)))


(61.29, '%')


In [27]:
from sklearn.metrics import classification_report
# Call form of print: valid on both Python 2 and Python 3 (the statement form
# `print x` is a SyntaxError on Python 3).
print(classification_report(Y_test, y_predicted))

              precision    recall  f1-score   support

         0.0       0.14      0.14      0.14         7
         1.0       0.75      0.75      0.75        24

   micro avg       0.61      0.61      0.61        31
   macro avg       0.45      0.45      0.45        31
weighted avg       0.61      0.61      0.61        31

