In [1]:
%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.metrics import precision_recall_curve, confusion_matrix

from sklearn.preprocessing import Imputer

In [2]:
hepatitis_data = pd.read_csv("dataset_55_hepatitis.csv")
# print(hepatitis_data)

In [3]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

In [4]:
replacements = {'no': 0,
               'yes': 1,
               'DIE': 0,
               'LIVE': 1,
               '?': np.nan,
               'female': 0,
               'male': 1}

hepatitis_data.replace(replacements, inplace = True)
hepatitis_data = hepatitis_data.astype(float)
hepatitis_data[['ALBUMIN', 'ALK_PHOSPHATE', 'BILIRUBIN', 'SGOT']] = hepatitis_data[['ALBUMIN','ALK_PHOSPHATE', 'BILIRUBIN', 'SGOT']].applymap(np.log)


In [5]:
hepatitis_data.isnull().sum()

AGE                 0
SEX                 0
STEROID             1
ANTIVIRALS          0
FATIGUE             1
MALAISE             1
ANOREXIA            1
LIVER_BIG          10
LIVER_FIRM         11
SPLEEN_PALPABLE     5
SPIDERS             5
ASCITES             5
VARICES             5
BILIRUBIN           6
ALK_PHOSPHATE      29
SGOT                4
ALBUMIN            16
PROTIME            67
HISTOLOGY           0
Class               0
dtype: int64

In [6]:
x = hepatitis_data.iloc[:, hepatitis_data.columns != 'Class']
y = hepatitis_data.iloc[:, hepatitis_data.columns == 'Class']


In [7]:
imp = Imputer(missing_values = 'NaN', strategy = "most_frequent", axis = 0)
imp = imp.fit(x)

x = imp.transform(x)





In [8]:
print x

[[ 30.           1.           0.         ...   1.38629436 100.
    0.        ]
 [ 50.           0.           0.         ...   1.25276297 100.
    0.        ]
 [ 78.           0.           1.         ...   1.38629436 100.
    0.        ]
 ...
 [ 61.           0.           0.         ...   1.41098697 100.
    1.        ]
 [ 53.           1.           0.         ...   1.41098697  48.
    1.        ]
 [ 43.           0.           1.         ...   1.13140211  42.
    1.        ]]


In [9]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(x, y)

print X_res.shape
print y_res.shape

(246, 19)
(246,)


  y = column_or_1d(y, warn=True)


In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X_res, y_res, test_size = 0.2, 
                                                    random_state = 42)

In [11]:
#Y_train = Y_train.values.ravel()
#Y_test = Y_test.values.ravel()
print(Y_train)
print(Y_test)

[0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0.
 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1.
 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0.
 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1.
 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1.
 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0.
 1. 1. 0. 1.]
[1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0.
 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0.
 1. 0.]


In [12]:
imp = Imputer(missing_values = 'NaN', strategy = "mean", axis = 0)
imp = imp.fit(X_train)

X_train_imp = imp.transform(X_train)
X_test_imp = imp.transform(X_test)



In [13]:
fit_random_forest = RandomForestClassifier(random_state = 42)

fit_random_forest.fit(X_train_imp, Y_train);



In [14]:
X_test_imp = imp.transform(X_test)

y_predicted = fit_random_forest.predict(X_test_imp)

In [15]:
accuracy = accuracy_score(Y_test, y_predicted)*100
print(round(accuracy, 2), '%')

(94.0, '%')


In [16]:
%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.metrics import precision_recall_curve, confusion_matrix

from sklearn.preprocessing import Imputer

In [17]:
hepatitis_data = pd.read_csv("dataset_55_hepatitis.csv")
# print(hepatitis_data)
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

In [18]:
replacements = {'no': 0,
               'yes': 1,
               'DIE': 0,
               'LIVE': 1,
               '?': np.nan,
               'female': 0,
               'male': 1}

hepatitis_data.replace(replacements, inplace = True)
hepatitis_data = hepatitis_data.astype(float)
hepatitis_data[['ALBUMIN', 'ALK_PHOSPHATE', 'BILIRUBIN', 'SGOT']] = hepatitis_data[['ALBUMIN','ALK_PHOSPHATE', 'BILIRUBIN', 'SGOT']].applymap(np.log)


In [19]:
hepatitis_data.isnull().sum()

AGE                 0
SEX                 0
STEROID             1
ANTIVIRALS          0
FATIGUE             1
MALAISE             1
ANOREXIA            1
LIVER_BIG          10
LIVER_FIRM         11
SPLEEN_PALPABLE     5
SPIDERS             5
ASCITES             5
VARICES             5
BILIRUBIN           6
ALK_PHOSPHATE      29
SGOT                4
ALBUMIN            16
PROTIME            67
HISTOLOGY           0
Class               0
dtype: int64

In [20]:
x = hepatitis_data.iloc[:, hepatitis_data.columns != 'Class']
y = hepatitis_data.iloc[:, hepatitis_data.columns == 'Class']


In [21]:
imp = Imputer(missing_values = 'NaN', strategy = "most_frequent", axis = 0)
imp = imp.fit(x)

x = imp.transform(x)



In [22]:
from sklearn.decomposition import PCA
pca = PCA(n_components=15)
x=pca.fit_transform(x)

In [23]:
print x.shape

(155, 15)


In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.2, 
                                                    random_state = 42)

In [25]:
Y_train = Y_train.values.ravel()
Y_test = Y_test.values.ravel()

In [26]:
imp = Imputer(missing_values = 'NaN', strategy = "most_frequent", axis = 0)
imp = imp.fit(X_train)

X_train_imp = imp.transform(X_train)

fit_random_forest = RandomForestClassifier(random_state = 42)

fit_random_forest.fit(X_train_imp, Y_train);




In [27]:
X_test_imp = imp.transform(X_test)

y_predicted = fit_random_forest.predict(X_test_imp)

In [28]:
accuracy = accuracy_score(Y_test, y_predicted)*100
print(round(accuracy, 2), '%')

(83.87, '%')
