<a href="https://colab.research.google.com/github/vigilant-umbrella/hcv-prediction/blob/main/hcv_finding_alpha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

In [None]:
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Data Preprocessing

In [None]:
%%shell
# Fetch the dataset from the UCI repository only when no local copy exists.
[ -f "hcvdat0.csv" ] || wget https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv



In [None]:
# Load the dataset and discard both blood-donor categories, keeping the
# remaining categories as the classification targets.
data = pd.read_csv('hcvdat0.csv')

donor_labels = ['0=Blood Donor', '0s=suspect Blood Donor']
data = data[~data['Category'].isin(donor_labels)].reset_index(drop=True)

# Features: everything except the CSV's unnamed index column and the label.
X = data.drop(['Unnamed: 0', 'Category'], axis=1)
category = pd.DataFrame(data['Category'])

# One-hot encode Sex; drop_first=True leaves a single indicator column.
X = pd.concat([X, pd.get_dummies(X.Sex, drop_first=True)], axis=1)
X = X.drop(['Sex'], axis=1)

# Mean-impute missing values column by column. X.mean() skips NaNs, and a
# column with no observed values simply stays NaN instead of raising
# ZeroDivisionError as the hand-rolled sum/len version did.
X = X.fillna(X.mean())

# Encode the category labels as ordinal numbers in a one-column frame.
enc = OrdinalEncoder()
y = pd.DataFrame(enc.fit_transform(category), columns=['category'])

# Alpha: correlation threshold for feature selection

In [None]:
# Correlation of every column with the encoded target. The 'category'
# entry is the target's correlation with itself.
features_and_target = pd.concat([X, y], axis=1)
target_corr = features_and_target.corr()['category']
target_corr

Age         0.495877
ALB        -0.706416
ALP         0.357904
ALT        -0.059218
AST         0.208790
BIL         0.388040
CHE        -0.716708
CHOL       -0.385596
CREA        0.207934
GGT         0.146027
PROT       -0.289355
m          -0.146990
category    1.000000
Name: category, dtype: float64

In [None]:
# For each candidate threshold alpha, collect the columns whose absolute
# correlation with the target is below alpha; these are the columns the
# later cells drop before training.
alphas_vs_cols = []
# Checking for alpha = 0, 0.1, 0.2, 0.3, 0.4 and 0.5
for alpha in [x*0.1 for x in range(6)]:
    # Series.items() is used because Series.iteritems() was removed in
    # pandas 2.0; items() yields the same (label, value) pairs.
    cols_to_remove = [
        col for col, value in target_corr.items() if abs(value) < alpha
    ]
    alphas_vs_cols.append(cols_to_remove)

alphas_vs_cols

[[],
 ['ALT'],
 ['ALT', 'GGT', 'm'],
 ['ALT', 'AST', 'CREA', 'GGT', 'PROT', 'm'],
 ['ALP', 'ALT', 'AST', 'BIL', 'CHOL', 'CREA', 'GGT', 'PROT', 'm'],
 ['Age', 'ALP', 'ALT', 'AST', 'BIL', 'CHOL', 'CREA', 'GGT', 'PROT', 'm']]

# LogisticRegression

In [None]:
# Logistic regression with fixed hyperparameters, scored by 10-fold
# stratified cross-validation once per alpha-specific feature subset.
# The estimator and splitter do not depend on the subset, so they are
# built once up front.
lr = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.75,
    l1_ratio=0,
    fit_intercept=True,
    max_iter=50,
    random_state=66,
)
lrcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

lr_scores = []
for cols in alphas_vs_cols:
    # NOTE(review): the scaler is fit on all rows before the CV split,
    # so held-out folds influence the scaling.
    scaled_X = MinMaxScaler().fit_transform(X.drop(columns=cols))

    lrcv_results = cross_validate(
        lr, scaled_X, y['category'],
        cv=lrcv, scoring='accuracy', n_jobs=-1,
    )

    # Mean accuracy over the folds for this feature subset.
    fold_accuracies = lrcv_results['test_score']
    lr_scores.append(sum(fold_accuracies) / len(fold_accuracies))

# KNeighborsClassifier

In [None]:
# k-nearest neighbours with fixed hyperparameters, scored by 10-fold
# stratified cross-validation once per alpha-specific feature subset.
# The estimator and splitter do not depend on the subset, so they are
# built once up front.
knclf = KNeighborsClassifier(n_neighbors=9, p=1, algorithm='ball_tree')
kncv = StratifiedKFold(n_splits=10, shuffle=True, random_state=92)

knclf_scores = []
for cols in alphas_vs_cols:
    # NOTE(review): the scaler is fit on all rows before the CV split,
    # so held-out folds influence the scaling.
    scaled_X = MinMaxScaler().fit_transform(X.drop(columns=cols))

    kncv_results = cross_validate(
        knclf, scaled_X, y['category'],
        cv=kncv, scoring='accuracy', n_jobs=-1,
    )

    # Mean accuracy over the folds for this feature subset.
    fold_accuracies = kncv_results['test_score']
    knclf_scores.append(sum(fold_accuracies) / len(fold_accuracies))

# GaussianNB

In [None]:
# Gaussian naive Bayes with a fixed var_smoothing, scored by 10-fold
# stratified cross-validation once per alpha-specific feature subset.
# The estimator and splitter do not depend on the subset, so they are
# built once up front.
gnb = GaussianNB(var_smoothing=1e-10)
gnbcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=446)

gnb_scores = []
for cols in alphas_vs_cols:
    # NOTE(review): the scaler is fit on all rows before the CV split,
    # so held-out folds influence the scaling.
    scaled_X = MinMaxScaler().fit_transform(X.drop(columns=cols))

    gnbcv_results = cross_validate(
        gnb, scaled_X, y['category'],
        cv=gnbcv, scoring='accuracy', n_jobs=-1,
    )

    # Mean accuracy over the folds for this feature subset.
    fold_accuracies = gnbcv_results['test_score']
    gnb_scores.append(sum(fold_accuracies) / len(fold_accuracies))

# DecisionTreeClassifier

In [None]:
# Decision tree with fixed hyperparameters, scored by 10-fold stratified
# cross-validation once per alpha-specific feature subset. The estimator
# and splitter do not depend on the subset, so they are built once up front.
dtclf = DecisionTreeClassifier(
    criterion='entropy',
    splitter='random',
    max_depth=12,
    max_features='sqrt',
    min_samples_split=0.2,
    min_samples_leaf=1,
    random_state=652,
)
dtcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=60)

dtclf_scores = []
for cols in alphas_vs_cols:
    # NOTE(review): the scaler is fit on all rows before the CV split,
    # so held-out folds influence the scaling.
    scaled_X = MinMaxScaler().fit_transform(X.drop(columns=cols))

    dtcv_results = cross_validate(
        dtclf, scaled_X, y['category'],
        cv=dtcv, scoring='accuracy', n_jobs=-1,
    )

    # Mean accuracy over the folds for this feature subset.
    fold_accuracies = dtcv_results['test_score']
    dtclf_scores.append(sum(fold_accuracies) / len(fold_accuracies))

# RandomForestClassifier

In [None]:
# Random forest with fixed hyperparameters, scored by 10-fold stratified
# cross-validation once per alpha-specific feature subset. The estimator
# and splitter do not depend on the subset, so they are built once up front.
rfclf = RandomForestClassifier(
    n_estimators=50,
    criterion='gini',
    max_features='sqrt',
    min_samples_split=0.4,
    min_samples_leaf=0.1,
    random_state=68,
)
rfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=480)

rfclf_scores = []
for cols in alphas_vs_cols:
    # NOTE(review): the scaler is fit on all rows before the CV split,
    # so held-out folds influence the scaling.
    scaled_X = MinMaxScaler().fit_transform(X.drop(columns=cols))

    rfcv_results = cross_validate(
        rfclf, scaled_X, y['category'],
        cv=rfcv, scoring='accuracy', n_jobs=-1,
    )

    # Mean accuracy over the folds for this feature subset.
    fold_accuracies = rfcv_results['test_score']
    rfclf_scores.append(sum(fold_accuracies) / len(fold_accuracies))

# SVC

In [None]:
# Polynomial-kernel SVM with fixed hyperparameters, scored by 10-fold
# stratified cross-validation once per alpha-specific feature subset.
# The estimator and splitter do not depend on the subset, so they are
# built once up front.
svc = SVC(
    kernel='poly',
    degree=6,
    gamma='auto',
    coef0=0.9,
    C=1.25,
    shrinking=True,
    random_state=0,
)
svccv = StratifiedKFold(n_splits=10, shuffle=True, random_state=947)

svc_scores = []
for cols in alphas_vs_cols:
    # NOTE(review): the scaler is fit on all rows before the CV split,
    # so held-out folds influence the scaling.
    scaled_X = MinMaxScaler().fit_transform(X.drop(columns=cols))

    svccv_results = cross_validate(
        svc, scaled_X, y['category'],
        cv=svccv, scoring='accuracy', n_jobs=-1,
    )

    # Mean accuracy over the folds for this feature subset.
    fold_accuracies = svccv_results['test_score']
    svc_scores.append(sum(fold_accuracies) / len(fold_accuracies))

# MLPClassifier

In [None]:
# Multi-layer perceptron with fixed hyperparameters, scored by 10-fold
# stratified cross-validation once per alpha-specific feature subset.
# The estimator and splitter do not depend on the subset, so they are
# built once up front.
mlpclf = MLPClassifier(
    hidden_layer_sizes=(64, 64),
    activation='relu',
    solver='adam',
    batch_size=32,
    max_iter=300,
    early_stopping=False,
    random_state=285,
)
mlpclfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=289)

mlpclf_scores = []
for cols in alphas_vs_cols:
    # NOTE(review): the scaler is fit on all rows before the CV split,
    # so held-out folds influence the scaling.
    scaled_X = MinMaxScaler().fit_transform(X.drop(columns=cols))

    mlpclfcv_results = cross_validate(
        mlpclf, scaled_X, y['category'],
        cv=mlpclfcv, scoring='accuracy', n_jobs=-1,
    )

    # Mean accuracy over the folds for this feature subset.
    fold_accuracies = mlpclfcv_results['test_score']
    mlpclf_scores.append(sum(fold_accuracies) / len(fold_accuracies))

# Evaluating Results

In [None]:
# Summary table: one row per model, one column per alpha threshold, each
# cell holding the mean cross-validated accuracy computed above.
model_scores = [
    ('Logistic Regresssion', lr_scores),
    ('k-nearest neighbors', knclf_scores),
    ('Gaussian naive Bayes', gnb_scores),
    ('Decision Tree', dtclf_scores),
    ('Random Forest', rfclf_scores),
    ('Support Vector Machine', svc_scores),
    ('Mutli-layer Perceptron', mlpclf_scores),
]
alpha_labels = [step*0.1 for step in range(6)]

pd.DataFrame(
    data=[scores for _, scores in model_scores],
    index=[label for label, _ in model_scores],
    columns=alpha_labels,
)

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5
Logistic Regresssion,0.746429,0.732143,0.775,0.775,0.775,0.642857
k-nearest neighbors,0.669643,0.696429,0.7875,0.791071,0.760714,0.596429
Gaussian naive Bayes,0.7625,0.721429,0.723214,0.816071,0.735714,0.669643
Decision Tree,0.576786,0.610714,0.5875,0.641071,0.680357,0.658929
Random Forest,0.7625,0.705357,0.721429,0.785714,0.744643,0.705357
Support Vector Machine,0.717857,0.707143,0.730357,0.826786,0.733929,0.667857
Mutli-layer Perceptron,0.732143,0.735714,0.760714,0.841071,0.748214,0.694643
