<a href="https://colab.research.google.com/github/vigilant-umbrella/hcv-prediction/blob/main/hcv_finding_alpha_beta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

In [None]:
import math

import pandas as pd
from scipy.stats import kendalltau

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import f_classif
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Data Preprocessing

In [None]:
%%shell
# Fetch the HCV data set from the UCI repository unless a local copy already exists.
[ -f "hcvdat0.csv" ] || wget https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv

--2021-08-12 17:39:10--  https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46183 (45K) [application/x-httpd-php]
Saving to: ‘hcvdat0.csv’


2021-08-12 17:39:10 (324 KB/s) - ‘hcvdat0.csv’ saved [46183/46183]





In [None]:
# Load the raw CSV downloaded by the previous cell.
data = pd.read_csv('hcvdat0.csv')

# Discard both blood-donor categories; only the remaining categories are modelled.
data = data[
    ~data['Category'].isin(['0=Blood Donor', '0s=suspect Blood Donor'])
].reset_index(drop=True)

# Split the label column from the feature columns.
category = data[['Category']]
X = data.drop(columns=['Unnamed: 0', 'Category'])

# Replace 'Sex' by its dummy encoding; drop_first keeps a single indicator
# column (referenced as 'm' by the later cells).
X = pd.concat(
    [X.drop(columns=['Sex']), pd.get_dummies(X['Sex'], drop_first=True)],
    axis=1
)

# Impute missing values: median for ALB/ALP/ALT, mean for CHOL/PROT.
for col in ['ALB', 'ALP', 'ALT']:
    X[col] = X[col].fillna(X[col].median())
for col in ['CHOL', 'PROT']:
    X[col] = X[col].fillna(X[col].mean())

# Encode the category strings as ordinal numbers in a one-column frame.
enc = OrdinalEncoder()
y = pd.DataFrame(enc.fit_transform(category), columns=['category'])

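# alpha

alpha is a threshold on the ANOVA F-value of each feature: for a given alpha, every feature whose F-value is less than or equal to alpha is dropped.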

In [None]:
# ANOVA F-statistic of every feature except the 'm' dummy against the encoded
# category; f_classif returns (F-statistics, p-values) and only the first is kept.
anova_f_values = f_classif(X.drop(columns=['m']), y['category'])[0]

# Label each F-value with its feature name.
linear_corr = pd.Series(anova_f_values, index=X.columns.drop('m'))
linear_corr

Age     15.299046
ALB     43.959767
ALP      6.677791
ALT      4.877729
AST      1.842203
BIL      8.920381
CHE     48.276278
CHOL     6.290820
CREA     2.095414
GGT      1.386249
PROT     5.837231
dtype: float64

In [None]:
# Map every candidate alpha (an observed ANOVA F-value) to the list of columns
# whose F-value is <= alpha, i.e. the columns dropped at that threshold.
# alpha = 0 drops nothing.
alphas_vs_cols = {0: []}

# Sort once, ascending, so the dropped-column list grows by one feature per step.
sorted_corr = linear_corr.sort_values()

cols_so_far = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, val in sorted_corr.items():
    cols_so_far.append(col)
    alphas_vs_cols[val] = cols_so_far.copy()

# The largest F-value would drop every analysed column, so that threshold is discarded.
del alphas_vs_cols[sorted_corr.iloc[-1]]

alphas_vs_cols

{0: [],
 1.3862492060605132: ['GGT'],
 1.8422028019458865: ['GGT', 'AST'],
 2.0954135545152184: ['GGT', 'AST', 'CREA'],
 4.87772939960541: ['GGT', 'AST', 'CREA', 'ALT'],
 5.837231107909143: ['GGT', 'AST', 'CREA', 'ALT', 'PROT'],
 6.290820252984408: ['GGT', 'AST', 'CREA', 'ALT', 'PROT', 'CHOL'],
 6.677790788720345: ['GGT', 'AST', 'CREA', 'ALT', 'PROT', 'CHOL', 'ALP'],
 8.920380534925785: ['GGT',
  'AST',
  'CREA',
  'ALT',
  'PROT',
  'CHOL',
  'ALP',
  'BIL'],
 15.29904550025325: ['GGT',
  'AST',
  'CREA',
  'ALT',
  'PROT',
  'CHOL',
  'ALP',
  'BIL',
  'Age'],
 43.959767312315705: ['GGT',
  'AST',
  'CREA',
  'ALT',
  'PROT',
  'CHOL',
  'ALP',
  'BIL',
  'Age',
  'ALB']}

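# beta

beta is a threshold on the absolute Kendall tau correlation between each feature and the target: for a given beta, every feature whose absolute correlation is below beta is dropped.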

In [None]:
# Kendall tau between every feature except the 'm' dummy and the encoded target.
kendall_corr = []
for col in X.columns.drop('m'):
    kendall_corr.append(kendalltau(X[col], y).correlation)

# Label each correlation with its feature name.
non_linear_corr = pd.Series(kendall_corr, index=X.columns.drop('m'))
non_linear_corr

Age     0.377397
ALB    -0.597778
ALP     0.410298
ALT    -0.217547
AST     0.217547
BIL     0.350095
CHE    -0.580503
CHOL   -0.300940
CREA    0.003535
GGT     0.185809
PROT   -0.191451
dtype: float64

In [None]:
# Map every candidate beta to the columns whose absolute Kendall tau is below
# beta, i.e. the columns dropped at that threshold.
betas_vs_cols = {}
# Checking for beta = 0, 0.1, 0.2, 0.3, 0.4 and 0.5.
# x / 10 is used instead of x * 0.1 so the keys are the plain decimals:
# 3 * 0.1 evaluates to 0.30000000000000004, which a lookup with 0.3 would miss.
for beta in [x / 10 for x in range(6)]:
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    betas_vs_cols[beta] = [
        col for col, value in non_linear_corr.items() if abs(value) < beta
    ]

betas_vs_cols

{0.0: [],
 0.1: ['CREA'],
 0.2: ['CREA', 'GGT', 'PROT'],
 0.30000000000000004: ['ALT', 'AST', 'CREA', 'GGT', 'PROT'],
 0.4: ['Age', 'ALT', 'AST', 'BIL', 'CHOL', 'CREA', 'GGT', 'PROT'],
 0.5: ['Age', 'ALP', 'ALT', 'AST', 'BIL', 'CHOL', 'CREA', 'GGT', 'PROT']}

# Parameter class

In [None]:
class Parameters:
    """One evaluated configuration.

    Attributes:
        alpha: the alpha value the run was made with.
        beta: the beta value the run was made with.
        is_m_used: flag recording whether the 'm' column was kept for the run.
    """

    def __init__(self, alpha, beta, is_m_used):
        self.alpha, self.beta, self.is_m_used = alpha, beta, is_m_used

# Utils

In [None]:
def get_best_values(scores, parameters):
    """Return (alpha, beta, is_m_used) of the configuration with the highest score.

    scores[i] is the score obtained with parameters[i]. When several scores tie
    for the maximum, the first one wins.
    """
    best = parameters[scores.index(max(scores))]
    return best.alpha, best.beta, best.is_m_used

In [None]:
def get_different_alpha_results(beta, scores, parameters):
    """Return the scores of all runs made with the given beta and without 'm'.

    scores[i] is the score obtained with parameters[i]; the result keeps the
    order of `parameters`, so it holds one score per alpha for the fixed beta.

    Betas are compared with math.isclose rather than ==, because betas built
    arithmetically (e.g. 3 * 0.1 == 0.30000000000000004) would otherwise not
    match the literal 0.3 and the function would silently return [].
    """
    return [
        scores[i]
        for i, parameter in enumerate(parameters)
        if not parameter.is_m_used and math.isclose(parameter.beta, beta)
    ]

In [None]:
def get_different_beta_results(alpha, scores, parameters):
    """Return the scores of all runs made with the given alpha and without 'm'.

    scores[i] is the score obtained with parameters[i]; the result keeps the
    order of `parameters`, so it holds one score per beta for the fixed alpha.

    Alphas are computed F-values, so they are compared with math.isclose rather
    than ==: a literal that differs from the computed value in its last digits
    would otherwise silently produce an empty result.
    """
    return [
        scores[i]
        for i, parameter in enumerate(parameters)
        if not parameter.is_m_used and math.isclose(parameter.alpha, alpha)
    ]

# LogisticRegression

In [None]:
# Evaluate logistic regression for every (is_m_used, alpha, beta) combination.
# lr_scores[i] is the mean 10-fold CV accuracy obtained with lr_parameters[i].
lr_scores = []
lr_parameters = []

# The estimator and the splitter do not depend on the feature subset, so they
# are built once; cross_validate fits a fresh clone of the estimator per fold.
lr = LogisticRegression(
    C=1.25,
    fit_intercept=True,
    l1_ratio=0.5,
    max_iter=50,
    penalty='elasticnet',
    random_state=221,
    solver='saga'
    )
# Scaling lives inside the pipeline so MinMaxScaler is fit on each training
# fold only; fitting it on the full data before cross-validation leaks the
# test folds' min/max into training.
lr_pipeline = make_pipeline(MinMaxScaler(), lr)
lrcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# m_col lists the column to drop when 'm' is not used.
for is_m_used, m_col in [(True, []), (False, ['m'])]:
    for alpha, alpha_cols in alphas_vs_cols.items():
        for beta, beta_cols in betas_vs_cols.items():
            # Union of everything to drop for this combination (duplicates removed).
            cols = sorted(set(m_col + alpha_cols + beta_cols))
            X_dropped = X.drop(columns=cols)

            lrcv_results = cross_validate(
                lr_pipeline,
                X_dropped,
                y['category'],
                cv=lrcv,
                scoring='accuracy',
                n_jobs=-1
                )

            lr_scores.append(lrcv_results['test_score'].mean())
            lr_parameters.append(Parameters(alpha, beta, is_m_used))

# KNeighborsClassifier

In [None]:
# Evaluate k-nearest neighbours for every (is_m_used, alpha, beta) combination.
# knclf_scores[i] is the mean 10-fold CV accuracy obtained with knclf_parameters[i].
knclf_scores = []
knclf_parameters = []

# The estimator and the splitter do not depend on the feature subset, so they
# are built once; cross_validate fits a fresh clone of the estimator per fold.
knclf = KNeighborsClassifier(
    n_neighbors=10,
    algorithm='ball_tree',
    p=4
    )
# Scaling lives inside the pipeline so MinMaxScaler is fit on each training
# fold only; fitting it on the full data before cross-validation leaks the
# test folds' min/max into training.
knclf_pipeline = make_pipeline(MinMaxScaler(), knclf)
kncv = StratifiedKFold(n_splits=10, shuffle=True, random_state=182)

# m_col lists the column to drop when 'm' is not used.
for is_m_used, m_col in [(True, []), (False, ['m'])]:
    for alpha, alpha_cols in alphas_vs_cols.items():
        for beta, beta_cols in betas_vs_cols.items():
            # Union of everything to drop for this combination (duplicates removed).
            cols = sorted(set(m_col + alpha_cols + beta_cols))
            X_dropped = X.drop(columns=cols)

            kncv_results = cross_validate(
                knclf_pipeline,
                X_dropped,
                y['category'],
                cv=kncv,
                scoring='accuracy',
                n_jobs=-1
                )

            knclf_scores.append(kncv_results['test_score'].mean())
            knclf_parameters.append(Parameters(alpha, beta, is_m_used))

# GaussianNB

In [None]:
# Evaluate Gaussian naive Bayes for every (is_m_used, alpha, beta) combination.
# gnb_scores[i] is the mean 10-fold CV accuracy obtained with gnb_parameters[i].
gnb_scores = []
gnb_parameters = []

# The estimator and the splitter do not depend on the feature subset, so they
# are built once; cross_validate fits a fresh clone of the estimator per fold.
gnb = GaussianNB(var_smoothing=1e-10)
# Scaling lives inside the pipeline so MinMaxScaler is fit on each training
# fold only; fitting it on the full data before cross-validation leaks the
# test folds' min/max into training.
gnb_pipeline = make_pipeline(MinMaxScaler(), gnb)
gnbcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=302)

# m_col lists the column to drop when 'm' is not used.
for is_m_used, m_col in [(True, []), (False, ['m'])]:
    for alpha, alpha_cols in alphas_vs_cols.items():
        for beta, beta_cols in betas_vs_cols.items():
            # Union of everything to drop for this combination (duplicates removed).
            cols = sorted(set(m_col + alpha_cols + beta_cols))
            X_dropped = X.drop(columns=cols)

            gnbcv_results = cross_validate(
                gnb_pipeline,
                X_dropped,
                y['category'],
                cv=gnbcv,
                scoring='accuracy',
                n_jobs=-1
                )

            gnb_scores.append(gnbcv_results['test_score'].mean())
            gnb_parameters.append(Parameters(alpha, beta, is_m_used))

# DecisionTreeClassifier

In [None]:
# Evaluate a decision tree for every (is_m_used, alpha, beta) combination.
# dtclf_scores[i] is the mean 10-fold CV accuracy obtained with dtclf_parameters[i].
dtclf_scores = []
dtclf_parameters = []

# The estimator and the splitter do not depend on the feature subset, so they
# are built once; cross_validate fits a fresh clone of the estimator per fold.
dtclf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=7,
    max_features=None,
    min_samples_leaf=3,
    min_samples_split=0.4,
    random_state=559,
    splitter='random'
    )
# Scaling lives inside the pipeline so MinMaxScaler is fit on each training
# fold only; fitting it on the full data before cross-validation leaks the
# test folds' min/max into training.
dtclf_pipeline = make_pipeline(MinMaxScaler(), dtclf)
dtcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=985)

# m_col lists the column to drop when 'm' is not used.
for is_m_used, m_col in [(True, []), (False, ['m'])]:
    for alpha, alpha_cols in alphas_vs_cols.items():
        for beta, beta_cols in betas_vs_cols.items():
            # Union of everything to drop for this combination (duplicates removed).
            cols = sorted(set(m_col + alpha_cols + beta_cols))
            X_dropped = X.drop(columns=cols)

            dtcv_results = cross_validate(
                dtclf_pipeline,
                X_dropped,
                y['category'],
                cv=dtcv,
                scoring='accuracy',
                n_jobs=-1
                )

            dtclf_scores.append(dtcv_results['test_score'].mean())
            dtclf_parameters.append(Parameters(alpha, beta, is_m_used))

# RandomForestClassifier

In [None]:
# Evaluate a random forest for every (is_m_used, alpha, beta) combination.
# rfclf_scores[i] is the mean 10-fold CV accuracy obtained with rfclf_parameters[i].
rfclf_scores = []
rfclf_parameters = []

# The estimator and the splitter do not depend on the feature subset, so they
# are built once; cross_validate fits a fresh clone of the estimator per fold.
rfclf = RandomForestClassifier(
    criterion='gini',
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=50,
    random_state=67
    )
# Scaling lives inside the pipeline so MinMaxScaler is fit on each training
# fold only; fitting it on the full data before cross-validation leaks the
# test folds' min/max into training.
rfclf_pipeline = make_pipeline(MinMaxScaler(), rfclf)
rfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=309)

# m_col lists the column to drop when 'm' is not used.
for is_m_used, m_col in [(True, []), (False, ['m'])]:
    for alpha, alpha_cols in alphas_vs_cols.items():
        for beta, beta_cols in betas_vs_cols.items():
            # Union of everything to drop for this combination (duplicates removed).
            cols = sorted(set(m_col + alpha_cols + beta_cols))
            X_dropped = X.drop(columns=cols)

            rfcv_results = cross_validate(
                rfclf_pipeline,
                X_dropped,
                y['category'],
                cv=rfcv,
                scoring='accuracy',
                n_jobs=-1
                )

            rfclf_scores.append(rfcv_results['test_score'].mean())
            rfclf_parameters.append(Parameters(alpha, beta, is_m_used))

# SVC

In [None]:
# Evaluate a support vector classifier for every (is_m_used, alpha, beta) combination.
# svc_scores[i] is the mean 10-fold CV accuracy obtained with svc_parameters[i].
svc_scores = []
svc_parameters = []

# The estimator and the splitter do not depend on the feature subset, so they
# are built once; cross_validate fits a fresh clone of the estimator per fold.
svc = SVC(
    C=0.1,
    coef0=0.3,
    degree=2,
    gamma='scale',
    kernel='poly',
    random_state=98,
    shrinking=True
    )
# Scaling lives inside the pipeline so MinMaxScaler is fit on each training
# fold only; fitting it on the full data before cross-validation leaks the
# test folds' min/max into training.
svc_pipeline = make_pipeline(MinMaxScaler(), svc)
svccv = StratifiedKFold(n_splits=10, shuffle=True, random_state=911)

# m_col lists the column to drop when 'm' is not used.
for is_m_used, m_col in [(True, []), (False, ['m'])]:
    for alpha, alpha_cols in alphas_vs_cols.items():
        for beta, beta_cols in betas_vs_cols.items():
            # Union of everything to drop for this combination (duplicates removed).
            cols = sorted(set(m_col + alpha_cols + beta_cols))
            X_dropped = X.drop(columns=cols)

            svccv_results = cross_validate(
                svc_pipeline,
                X_dropped,
                y['category'],
                cv=svccv,
                scoring='accuracy',
                n_jobs=-1
                )

            svc_scores.append(svccv_results['test_score'].mean())
            svc_parameters.append(Parameters(alpha, beta, is_m_used))

# MLPClassifier

In [None]:
# Evaluate a multi-layer perceptron for every (is_m_used, alpha, beta) combination.
# mlpclf_scores[i] is the mean 10-fold CV accuracy obtained with mlpclf_parameters[i].
mlpclf_scores = []
mlpclf_parameters = []

# The estimator and the splitter do not depend on the feature subset, so they
# are built once; cross_validate fits a fresh clone of the estimator per fold.
mlpclf = MLPClassifier(
    batch_size=8,
    activation='relu',
    early_stopping=False,
    hidden_layer_sizes=(32, 32, 32),
    max_iter=500,
    random_state=377,
    solver='lbfgs'
    )
# Scaling lives inside the pipeline so MinMaxScaler is fit on each training
# fold only; fitting it on the full data before cross-validation leaks the
# test folds' min/max into training.
mlpclf_pipeline = make_pipeline(MinMaxScaler(), mlpclf)
mlpclfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=496)

# m_col lists the column to drop when 'm' is not used.
for is_m_used, m_col in [(True, []), (False, ['m'])]:
    for alpha, alpha_cols in alphas_vs_cols.items():
        for beta, beta_cols in betas_vs_cols.items():
            # Union of everything to drop for this combination (duplicates removed).
            cols = sorted(set(m_col + alpha_cols + beta_cols))
            X_dropped = X.drop(columns=cols)

            mlpclfcv_results = cross_validate(
                mlpclf_pipeline,
                X_dropped,
                y['category'],
                cv=mlpclfcv,
                scoring='accuracy',
                n_jobs=-1
                )

            mlpclf_scores.append(mlpclfcv_results['test_score'].mean())
            mlpclf_parameters.append(Parameters(alpha, beta, is_m_used))

# Evaluating Results

In [None]:
# Best (alpha, beta, is_m_used) per classifier, one row per model.
pd.DataFrame(
    [
        get_best_values(model_scores, model_parameters)
        for model_scores, model_parameters in (
            (lr_scores, lr_parameters),
            (knclf_scores, knclf_parameters),
            (gnb_scores, gnb_parameters),
            (dtclf_scores, dtclf_parameters),
            (rfclf_scores, rfclf_parameters),
            (svc_scores, svc_parameters),
            (mlpclf_scores, mlpclf_parameters),
        )
    ],
    index=[
        'Logistic Regresssion',
        'k-nearest neighbors',
        'Gaussian naive Bayes',
        'Decision Tree',
        'Random Forest',
        'Support Vector Machine',
        'Mutli-layer Perceptron'
    ],
    columns=['alpha', 'beta', 'is_m_used']
)

Unnamed: 0,alpha,beta,is_m_used
Logistic Regresssion,0.0,0.2,False
k-nearest neighbors,0.0,0.2,False
Gaussian naive Bayes,1.842203,0.2,False
Decision Tree,1.842203,0.2,False
Random Forest,1.842203,0.2,False
Support Vector Machine,1.842203,0.2,False
Mutli-layer Perceptron,1.842203,0.2,False


In [None]:
# Accuracy per classifier for every alpha, with beta fixed at 0.2 and 'm' not used.
pd.DataFrame(
    [
        get_different_alpha_results(0.2, model_scores, model_parameters)
        for model_scores, model_parameters in (
            (lr_scores, lr_parameters),
            (knclf_scores, knclf_parameters),
            (gnb_scores, gnb_parameters),
            (dtclf_scores, dtclf_parameters),
            (rfclf_scores, rfclf_parameters),
            (svc_scores, svc_parameters),
            (mlpclf_scores, mlpclf_parameters),
        )
    ],
    index=[
        'Logistic Regresssion',
        'k-nearest neighbors',
        'Gaussian naive Bayes',
        'Decision Tree',
        'Random Forest',
        'Support Vector Machine',
        'Mutli-layer Perceptron'
    ],
    columns=alphas_vs_cols.keys()
)

Unnamed: 0,0.000000,1.386249,1.842203,2.095414,4.877729,5.837231,6.290820,6.677791,8.920381,15.299046,43.959767
Logistic Regresssion,0.803571,0.803571,0.803571,0.803571,0.735714,0.735714,0.708929,0.708929,0.708929,0.6125,0.6125
k-nearest neighbors,0.817857,0.817857,0.816071,0.816071,0.75,0.75,0.75,0.698214,0.7375,0.6875,0.5875
Gaussian naive Bayes,0.758929,0.758929,0.7875,0.7875,0.746429,0.746429,0.773214,0.735714,0.723214,0.671429,0.671429
Decision Tree,0.732143,0.732143,0.748214,0.748214,0.65,0.65,0.642857,0.625,0.641071,0.625,0.555357
Random Forest,0.694643,0.694643,0.8,0.8,0.746429,0.746429,0.760714,0.732143,0.721429,0.691071,0.578571
Support Vector Machine,0.789286,0.789286,0.817857,0.817857,0.764286,0.764286,0.735714,0.735714,0.725,0.657143,0.6625
Mutli-layer Perceptron,0.669643,0.669643,0.867857,0.867857,0.516071,0.516071,0.598214,0.544643,0.639286,0.569643,0.589286


In [None]:
# Accuracy per classifier for every beta, with alpha fixed at the F-value
# 1.8422028019458865 (one of the alphas_vs_cols keys) and 'm' not used.
pd.DataFrame(
    [
        get_different_beta_results(1.8422028019458865, model_scores, model_parameters)
        for model_scores, model_parameters in (
            (lr_scores, lr_parameters),
            (knclf_scores, knclf_parameters),
            (gnb_scores, gnb_parameters),
            (dtclf_scores, dtclf_parameters),
            (rfclf_scores, rfclf_parameters),
            (svc_scores, svc_parameters),
            (mlpclf_scores, mlpclf_parameters),
        )
    ],
    index=[
        'Logistic Regresssion',
        'k-nearest neighbors',
        'Gaussian naive Bayes',
        'Decision Tree',
        'Random Forest',
        'Support Vector Machine',
        'Mutli-layer Perceptron'
    ],
    columns=[x*0.1 for x in range(6)]
)

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5
Logistic Regresssion,0.789286,0.775,0.803571,0.735714,0.6125,0.6125
k-nearest neighbors,0.766071,0.791071,0.816071,0.75,0.664286,0.6875
Gaussian naive Bayes,0.723214,0.75,0.7875,0.746429,0.639286,0.671429
Decision Tree,0.616071,0.555357,0.748214,0.65,0.619643,0.625
Random Forest,0.708929,0.775,0.8,0.746429,0.65,0.691071
Support Vector Machine,0.7625,0.773214,0.817857,0.764286,0.682143,0.657143
Mutli-layer Perceptron,0.748214,0.666071,0.867857,0.516071,0.605357,0.569643
