<a href="https://colab.research.google.com/github/vigilant-umbrella/hcv-prediction/blob/main/hcv_model_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

In [None]:
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Data Preprocessing

In [None]:
%%shell
# Fetch hcvdat0.csv from the UCI archive only when no local copy exists,
# so re-running the notebook does not download it again.
[ -f "hcvdat0.csv" ] || wget https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv



In [None]:
# Load the data set downloaded by the previous cell.
data = pd.read_csv('hcvdat0.csv')

# Remove both blood-donor classes; rows with any other Category value are
# kept and the index is renumbered from 0.
data = data[(data['Category']!='0=Blood Donor') & (data['Category']!='0s=suspect Blood Donor')].reset_index(drop=True)

# Feature frame: everything except the target and the 'Unnamed: 0' column
# (presumably the CSV's row-number column -- TODO confirm).
X = data.drop(['Unnamed: 0', 'Category'], axis=1)
category = data[['Category']]

# One-hot encode Sex. drop_first=True leaves a single indicator column, which
# is appended after the existing features before the raw Sex column is removed.
X = pd.concat([X, pd.get_dummies(X.Sex, drop_first=True)], axis=1)
X = X.drop(['Sex'], axis=1)

# Mean-impute missing values, each column with its own mean. DataFrame.mean()
# skips NaN by default, so this matches averaging the non-null entries.
# NOTE(review): the means are computed over all rows, i.e. before any
# cross-validation split is made further down the notebook.
X = X.fillna(X.mean())

# Encode the Category strings as numeric codes in a one-column frame named
# 'category'.
enc = OrdinalEncoder()
y = pd.DataFrame(enc.fit_transform(category), columns=['category'])

In [None]:
# Columns excluded from the feature matrix for the alpha=0.3 setting.
# NOTE(review): "alpha" is not defined in this notebook -- presumably the
# threshold used by a separate feature-selection step; confirm.
cols_to_drop = [
    'ALT',
    'AST',
    'CREA',
    'GGT',
    'PROT',
    'm',
]

In [None]:
# Remove the excluded columns, then rescale the remaining features with
# MinMaxScaler.
# NOTE(review): the scaler is fitted on every row, and the scaled matrix is
# later handed to cross_validate, so test folds contribute to the scaling
# statistics. Consider scaling inside a Pipeline instead.
X_dropped = X.drop(columns=cols_to_drop)
scaler = MinMaxScaler().fit(X_dropped)
scaled_X = scaler.transform(X_dropped)

# Utils

In [None]:
def generate_kfold_report(results):
    """Collapse per-fold cross-validation scores into one mean per metric.

    Parameters
    ----------
    results : mapping
        Must contain the keys 'test_accuracy', 'test_precision_macro',
        'test_recall_macro' and 'test_f1_macro', each holding a sized
        collection of per-fold scores.

    Returns
    -------
    dict
        Arithmetic mean of each metric, keyed 'Accuracy', 'Precision',
        'Recall' and 'F1 Score' (in that order).
    """
    # Display label -> key under which the per-fold scores are stored.
    metric_keys = (
        ('Accuracy', 'test_accuracy'),
        ('Precision', 'test_precision_macro'),
        ('Recall', 'test_recall_macro'),
        ('F1 Score', 'test_f1_macro'),
    )

    return {
        label: sum(results[key]) / len(results[key])
        for label, key in metric_keys
    }

# LogisticRegression

In [None]:
# L1-penalised logistic regression fitted with the SAGA solver.
# `l1_ratio` is deliberately not passed: scikit-learn only uses it when
# penalty='elasticnet', so the former l1_ratio=0 had no effect on an
# L1-penalised fit and could only trigger an "l1_ratio parameter is only used
# when penalty is 'elasticnet'" warning.
lr = LogisticRegression(
    C=0.75,
    fit_intercept=True,
    max_iter=50,
    penalty='l1',
    random_state=66,
    solver='saga'
    )

# Stratified 10-fold split, shuffled with a fixed seed so the folds are
# repeatable.
lrcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

# Collect per-fold accuracy and macro-averaged F1 / precision / recall; these
# end up under the 'test_*' keys that generate_kfold_report averages.
lrcv_results = cross_validate(
    lr,
    scaled_X,
    y['category'],
    cv=lrcv,
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    n_jobs=-1
    )

In [None]:
# Mean of each metric over the logistic-regression folds; the bare name on the
# last line displays the dict as the cell output.
lr_kfold_report = generate_kfold_report(results=lrcv_results)
lr_kfold_report

{'Accuracy': 0.7749999999999999,
 'F1 Score': 0.7321428571428572,
 'Precision': 0.7533333333333333,
 'Recall': 0.7555555555555555}

# KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier: 9 neighbours, ball-tree search, p=1.
knclf = KNeighborsClassifier(algorithm='ball_tree', n_neighbors=9, p=1)

# Stratified 10-fold split, shuffled with a fixed seed so the folds are
# repeatable.
kncv = StratifiedKFold(random_state=92, shuffle=True, n_splits=10)

# Collect per-fold accuracy and macro-averaged F1 / precision / recall.
kncv_results = cross_validate(
    estimator=knclf,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=kncv,
    n_jobs=-1,
)

In [None]:
# Mean of each metric over the k-nearest-neighbours folds; the bare name on the
# last line displays the dict as the cell output.
kn_kfold_report = generate_kfold_report(results=kncv_results)
kn_kfold_report

{'Accuracy': 0.7910714285714285,
 'F1 Score': 0.7684126984126985,
 'Precision': 0.7916666666666666,
 'Recall': 0.788888888888889}

# GaussianNB

In [None]:
# Gaussian naive Bayes with an explicit var_smoothing value.
gnb = GaussianNB(var_smoothing=1e-10)

# Stratified 10-fold split, shuffled with a fixed seed so the folds are
# repeatable.
gnbcv = StratifiedKFold(random_state=446, shuffle=True, n_splits=10)

# Collect per-fold accuracy and macro-averaged F1 / precision / recall.
gnbcv_results = cross_validate(
    estimator=gnb,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=gnbcv,
    n_jobs=-1,
)

In [None]:
# Mean of each metric over the Gaussian naive Bayes folds; the bare name on the
# last line displays the dict as the cell output.
gnb_kfold_report = generate_kfold_report(results=gnbcv_results)
gnb_kfold_report

{'Accuracy': 0.8160714285714287,
 'F1 Score': 0.8163492063492063,
 'Precision': 0.8416666666666666,
 'Recall': 0.8277777777777778}

# DecisionTreeClassifier

In [None]:
# Decision tree: entropy criterion, random splitter, depth capped at 12.
# min_samples_split is given as a float (0.2) rather than an absolute count.
dtclf = DecisionTreeClassifier(
    random_state=652,
    criterion='entropy',
    splitter='random',
    max_depth=12,
    max_features='sqrt',
    min_samples_split=0.2,
    min_samples_leaf=1,
)

# Stratified 10-fold split, shuffled with a fixed seed so the folds are
# repeatable.
dtcv = StratifiedKFold(random_state=60, shuffle=True, n_splits=10)

# Collect per-fold accuracy and macro-averaged F1 / precision / recall.
dtcv_results = cross_validate(
    estimator=dtclf,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=dtcv,
    n_jobs=-1,
)

In [None]:
# Mean of each metric over the decision-tree folds; the bare name on the last
# line displays the dict as the cell output.
dt_kfold_report = generate_kfold_report(results=dtcv_results)
dt_kfold_report

{'Accuracy': 0.6410714285714285,
 'F1 Score': 0.578968253968254,
 'Precision': 0.5838888888888889,
 'Recall': 0.6111111111111112}

# RandomForestClassifier

In [None]:
# Random forest of 50 gini trees. Both min_samples_* limits are given as
# floats (0.1 and 0.4) rather than absolute counts.
rfclf = RandomForestClassifier(
    random_state=68,
    n_estimators=50,
    criterion='gini',
    max_features='sqrt',
    min_samples_split=0.4,
    min_samples_leaf=0.1,
)

# Stratified 10-fold split, shuffled with a fixed seed so the folds are
# repeatable.
rfcv = StratifiedKFold(random_state=480, shuffle=True, n_splits=10)

# Collect per-fold accuracy and macro-averaged F1 / precision / recall.
rfcv_results = cross_validate(
    estimator=rfclf,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=rfcv,
    n_jobs=-1,
)

In [None]:
# Mean of each metric over the random-forest folds; the bare name on the last
# line displays the dict as the cell output.
rf_kfold_report = generate_kfold_report(results=rfcv_results)
rf_kfold_report

{'Accuracy': 0.7857142857142856,
 'F1 Score': 0.7545238095238095,
 'Precision': 0.8,
 'Recall': 0.7666666666666666}

# SVC

In [None]:
# Support vector classifier with a degree-6 polynomial kernel.
# NOTE(review): scikit-learn documents SVC's random_state as affecting only
# probability estimates, so it is presumably a no-op here -- confirm.
svc = SVC(
    kernel='poly',
    degree=6,
    gamma='auto',
    coef0=0.9,
    C=1.25,
    shrinking=True,
    random_state=0,
)

# Stratified 10-fold split, shuffled with a fixed seed so the folds are
# repeatable.
svccv = StratifiedKFold(random_state=947, shuffle=True, n_splits=10)

# Collect per-fold accuracy and macro-averaged F1 / precision / recall.
svccv_results = cross_validate(
    estimator=svc,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=svccv,
    n_jobs=-1,
)

In [None]:
# Mean of each metric over the SVC folds; the bare name on the last line
# displays the dict as the cell output.
svc_kfold_report = generate_kfold_report(results=svccv_results)
svc_kfold_report

{'Accuracy': 0.8267857142857142,
 'F1 Score': 0.8099206349206349,
 'Precision': 0.8477777777777777,
 'Recall': 0.8111111111111111}

# MLPClassifier

In [None]:
# Multi-layer perceptron: two hidden layers of 64 units, ReLU activation,
# Adam solver, mini-batches of 32, at most 300 iterations, no early stopping.
mlpclf = MLPClassifier(
    hidden_layer_sizes=(64, 64),
    activation='relu',
    solver='adam',
    batch_size=32,
    max_iter=300,
    early_stopping=False,
    random_state=285,
)

# Stratified 10-fold split, shuffled with a fixed seed so the folds are
# repeatable.
mlpclfcv = StratifiedKFold(random_state=289, shuffle=True, n_splits=10)

# Collect per-fold accuracy and macro-averaged F1 / precision / recall.
mlpclfcv_results = cross_validate(
    estimator=mlpclf,
    X=scaled_X,
    y=y['category'],
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    cv=mlpclfcv,
    n_jobs=-1,
)

In [None]:
# Mean of each metric over the MLP folds; the bare name on the last line
# displays the dict as the cell output.
mlpclf_kfold_report = generate_kfold_report(results=mlpclfcv_results)
mlpclf_kfold_report

{'Accuracy': 0.8410714285714285,
 'F1 Score': 0.8312698412698414,
 'Precision': 0.8583333333333334,
 'Recall': 0.8333333333333333}

# Evaluating Results

In [None]:
# Summary table: one row per model, one column per averaged metric.
# Each display label is paired with its report in a single dict, so a row can
# never be matched with the wrong label (the previous version relied on two
# parallel lists staying in the same order). The label spellings
# "Logistic Regression" and "Multi-layer Perceptron" are corrected here.
# NOTE(review): every model was cross-validated with a different
# StratifiedKFold seed, so the rows are not measured on identical splits.
model_reports = {
    'Decision Tree': dt_kfold_report,
    'Logistic Regression': lr_kfold_report,
    'Random Forest': rf_kfold_report,
    'k-nearest neighbors': kn_kfold_report,
    'Gaussian naive Bayes': gnb_kfold_report,
    'Support Vector Machine': svc_kfold_report,
    'Multi-layer Perceptron': mlpclf_kfold_report,
}

pd.DataFrame(list(model_reports.values()), index=list(model_reports))

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Decision Tree,0.641071,0.583889,0.611111,0.578968
Logistic Regresssion,0.775,0.753333,0.755556,0.732143
Random Forest,0.785714,0.8,0.766667,0.754524
k-nearest neighbors,0.791071,0.791667,0.788889,0.768413
Gaussian naive Bayes,0.816071,0.841667,0.827778,0.816349
Support Vector Machine,0.826786,0.847778,0.811111,0.809921
Mutli-layer Perceptron,0.841071,0.858333,0.833333,0.83127
