<a href="https://colab.research.google.com/github/vigilant-umbrella/hcv-prediction/blob/main/hcv_model_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

In [None]:
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Data Preprocessing

In [None]:
%%shell
# Download the HCV dataset only when no local copy named hcvdat0.csv exists yet.
[ -f "hcvdat0.csv" ] || wget https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv

--2021-08-12 14:55:35--  https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46183 (45K) [application/x-httpd-php]
Saving to: ‘hcvdat0.csv’


2021-08-12 14:55:36 (348 KB/s) - ‘hcvdat0.csv’ saved [46183/46183]





In [None]:
# Load the raw table and discard the two blood-donor categories, renumbering
# the remaining rows from zero.
data = (
    pd.read_csv('hcvdat0.csv')
    .loc[lambda frame: ~frame['Category'].isin(['0=Blood Donor', '0s=suspect Blood Donor'])]
    .reset_index(drop=True)
)

# Features are everything except the CSV's unnamed index column and the label;
# the label is kept as a one-column frame for the encoder below.
X = data.drop(columns=['Unnamed: 0', 'Category'])
category = data['Category'].to_frame()

# Swap the textual Sex column for its dummy encoding (first level dropped),
# appended after the other feature columns.
X = pd.concat(
    [X.drop(columns=['Sex']), pd.get_dummies(X['Sex'], drop_first=True)],
    axis=1,
)

# Fill missing lab values: column median for ALB/ALP/ALT, column mean for
# CHOL/PROT. The statistics are computed over all remaining rows.
X = X.fillna({
    **{col: X[col].median() for col in ('ALB', 'ALP', 'ALT')},
    **{col: X[col].mean() for col in ('CHOL', 'PROT')},
})

# Encode the category labels as ordinal numbers in a single 'category' column.
enc = OrdinalEncoder()
y = pd.DataFrame(enc.fit_transform(category), columns=['category'])

In [None]:
# Feature columns to exclude before modelling, chosen for alpha=1.84 and
# beta=0.2 (alpha/beta are not defined in this notebook; presumably they come
# from a separate feature-selection step - TODO confirm).
cols_to_drop = [
    'GGT',
    'AST',
    'CREA',
    'PROT',
    'm',
]

In [None]:
# Remove the excluded feature columns, then fit a MinMaxScaler on the remaining
# ones and apply it. NOTE(review): the scaler is fitted on every row before the
# cross-validation cells split the data into folds.
X_dropped = X.drop(columns=cols_to_drop)
scaler = MinMaxScaler().fit(X_dropped)
scaled_X = scaler.transform(X_dropped)

# Utils

In [None]:
def generate_kfold_report(results):
    """Average the per-fold scores in ``results`` into a summary dict.

    Parameters
    ----------
    results : mapping
        Must contain the keys ``'test_accuracy'``, ``'test_precision_macro'``,
        ``'test_recall_macro'`` and ``'test_f1_macro'``; each value is a sized,
        summable collection of per-fold scores.

    Returns
    -------
    dict
        The arithmetic mean of each collection, keyed ``'Accuracy'``,
        ``'Precision'``, ``'Recall'`` and ``'F1 Score'`` (in that order).
    """
    # (report label, key in ``results``) pairs, in output order.
    metric_keys = (
        ('Accuracy', 'test_accuracy'),
        ('Precision', 'test_precision_macro'),
        ('Recall', 'test_recall_macro'),
        ('F1 Score', 'test_f1_macro'),
    )

    return {
        label: sum(results[key]) / len(results[key])
        for label, key in metric_keys
    }

# LogisticRegression

In [None]:
# Elastic-net logistic regression (saga solver) with fixed hyperparameters.
lr = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    C=1.25,
    fit_intercept=True,
    max_iter=50,
    random_state=221,
)

# Shuffled, stratified 10-fold split with a fixed seed.
lrcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Score the model on every fold using all available cores.
lrcv_results = cross_validate(
    estimator=lr,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=lrcv,
    n_jobs=-1,
)

In [None]:
# Average the logistic-regression fold scores; the bare name on the last line
# makes the notebook display the resulting dict.
lr_kfold_report = generate_kfold_report(lrcv_results)
lr_kfold_report

{'Accuracy': 0.8035714285714285,
 'F1 Score': 0.7752380952380953,
 'Precision': 0.8083333333333332,
 'Recall': 0.7833333333333333}

# KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier: 10 neighbours, ball-tree search, p=4.
knclf = KNeighborsClassifier(
    algorithm='ball_tree',
    n_neighbors=10,
    p=4,
)

# Shuffled, stratified 10-fold split with a fixed seed.
kncv = StratifiedKFold(n_splits=10, shuffle=True, random_state=182)

# Score the model on every fold using all available cores.
kncv_results = cross_validate(
    estimator=knclf,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=kncv,
    n_jobs=-1,
)

In [None]:
# Average the k-nearest-neighbours fold scores; the bare name on the last line
# makes the notebook display the resulting dict.
kn_kfold_report = generate_kfold_report(kncv_results)
kn_kfold_report

{'Accuracy': 0.8160714285714284,
 'F1 Score': 0.8077777777777777,
 'Precision': 0.8583333333333336,
 'Recall': 0.8222222222222222}

# GaussianNB

In [None]:
# Gaussian naive Bayes with an explicit var_smoothing value.
gnb = GaussianNB(
    var_smoothing=1e-10,
)

# Shuffled, stratified 10-fold split with a fixed seed.
gnbcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=302)

# Score the model on every fold using all available cores.
gnbcv_results = cross_validate(
    estimator=gnb,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=gnbcv,
    n_jobs=-1,
)

In [None]:
# Average the Gaussian naive Bayes fold scores; the bare name on the last line
# makes the notebook display the resulting dict.
gnb_kfold_report = generate_kfold_report(gnbcv_results)
gnb_kfold_report

{'Accuracy': 0.7874999999999999,
 'F1 Score': 0.7725396825396825,
 'Precision': 0.836111111111111,
 'Recall': 0.7833333333333333}

# DecisionTreeClassifier

In [None]:
# Entropy-based decision tree using random splits, with fixed depth and
# leaf/split limits.
dtclf = DecisionTreeClassifier(
    criterion='entropy',
    splitter='random',
    max_depth=7,
    max_features=None,
    min_samples_split=0.4,
    min_samples_leaf=3,
    random_state=559,
)

# Shuffled, stratified 10-fold split with a fixed seed.
dtcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=985)

# Score the model on every fold using all available cores.
dtcv_results = cross_validate(
    estimator=dtclf,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=dtcv,
    n_jobs=-1,
)

In [None]:
# Average the decision-tree fold scores; the bare name on the last line makes
# the notebook display the resulting dict.
dt_kfold_report = generate_kfold_report(dtcv_results)
dt_kfold_report

{'Accuracy': 0.7482142857142857,
 'F1 Score': 0.7371428571428573,
 'Precision': 0.7833333333333332,
 'Recall': 0.7500000000000001}

# RandomForestClassifier

In [None]:
# Random forest of 50 gini trees with sqrt feature sampling.
rfclf = RandomForestClassifier(
    n_estimators=50,
    criterion='gini',
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=67,
)

# Shuffled, stratified 10-fold split with a fixed seed.
rfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=309)

# Score the model on every fold using all available cores.
rfcv_results = cross_validate(
    estimator=rfclf,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=rfcv,
    n_jobs=-1,
)

In [None]:
# Average the random-forest fold scores; the bare name on the last line makes
# the notebook display the resulting dict.
rf_kfold_report = generate_kfold_report(rfcv_results)
rf_kfold_report

{'Accuracy': 0.8,
 'F1 Score': 0.7773015873015873,
 'Precision': 0.825,
 'Recall': 0.788888888888889}

# SVC

In [None]:
# Support vector classifier with a degree-2 polynomial kernel.
svc = SVC(
    kernel='poly',
    degree=2,
    gamma='scale',
    coef0=0.3,
    C=0.1,
    shrinking=True,
    random_state=98,
)

# Shuffled, stratified 10-fold split with a fixed seed.
svccv = StratifiedKFold(n_splits=10, shuffle=True, random_state=911)

# Score the model on every fold using all available cores.
svccv_results = cross_validate(
    estimator=svc,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=svccv,
    n_jobs=-1,
)

In [None]:
# Average the support-vector-classifier fold scores; the bare name on the last
# line makes the notebook display the resulting dict.
svc_kfold_report = generate_kfold_report(svccv_results)
svc_kfold_report

{'Accuracy': 0.8178571428571427,
 'F1 Score': 0.7884920634920635,
 'Precision': 0.8505555555555556,
 'Recall': 0.8055555555555556}

# MLPClassifier

In [None]:
# Multi-layer perceptron: three hidden layers of 32 ReLU units, lbfgs solver.
# NOTE(review): per the scikit-learn docs, batch_size and early_stopping are
# not used by the lbfgs solver; they are kept here unchanged - confirm intent.
mlpclf = MLPClassifier(
    hidden_layer_sizes=(32, 32, 32),
    activation='relu',
    solver='lbfgs',
    batch_size=8,
    early_stopping=False,
    max_iter=500,
    random_state=377,
)

# Shuffled, stratified 10-fold split with a fixed seed.
mlpclfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=496)

# Score the model on every fold using all available cores.
mlpclfcv_results = cross_validate(
    estimator=mlpclf,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=mlpclfcv,
    n_jobs=-1,
)

In [None]:
# Average the multi-layer-perceptron fold scores; the bare name on the last
# line makes the notebook display the resulting dict.
mlpclf_kfold_report = generate_kfold_report(mlpclfcv_results)
mlpclf_kfold_report

{'Accuracy': 0.8678571428571429,
 'F1 Score': 0.8404761904761905,
 'Precision': 0.8555555555555555,
 'Recall': 0.8555555555555557}

# Evaluating Results

In [None]:
# Assemble every model's averaged cross-validation metrics into one comparison
# table: one row per model (labelled by the index), one column per metric.
# The i-th report in the first list is labelled by the i-th name in `index`.
pd.DataFrame(
    [
        dt_kfold_report,
        gnb_kfold_report,
        rf_kfold_report,
        lr_kfold_report,
        kn_kfold_report,
        svc_kfold_report,
        mlpclf_kfold_report
    ],
    index=[
        'Decision Tree',
        'Gaussian naive Bayes',
        'Random Forest',
        'Logistic Regression',
        'k-nearest neighbors',
        'Support Vector Machine',
        'Multi-layer Perceptron'
    ]
)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Decision Tree,0.748214,0.783333,0.75,0.737143
Gaussian naive Bayes,0.7875,0.836111,0.783333,0.77254
Random Forest,0.8,0.825,0.788889,0.777302
Logistic Regression,0.803571,0.808333,0.783333,0.775238
k-nearest neighbors,0.816071,0.858333,0.822222,0.807778
Support Vector Machine,0.817857,0.850556,0.805556,0.788492
Mutli-layer Perceptron,0.867857,0.855556,0.855556,0.840476
