<a href="https://colab.research.google.com/github/vigilant-umbrella/hcv-prediction/blob/main/hcv_finding_optimal_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

In [None]:
import pandas as pd
from scipy.stats import kendalltau

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import f_classif

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Data Preprocessing

In [None]:
%%shell
# Fetch the HCV data set from the UCI repository unless a local copy already exists.
[ -f "hcvdat0.csv" ] || wget https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv

--2021-09-27 08:49:35--  https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46183 (45K) [application/x-httpd-php]
Saving to: ‘hcvdat0.csv’


2021-09-27 08:49:36 (819 KB/s) - ‘hcvdat0.csv’ saved [46183/46183]





In [None]:
# Load the data set and discard the rows labelled as (suspect) blood donors,
# keeping only the remaining categories.
data = pd.read_csv('hcvdat0.csv')
donor_labels = ['0=Blood Donor', '0s=suspect Blood Donor']
data = data[~data['Category'].isin(donor_labels)].reset_index(drop=True)

# Separate the raw class labels from the feature columns.
category = pd.DataFrame(data['Category'])
X = data.drop(['Unnamed: 0', 'Category'], axis=1)

# Replace 'Sex' with its dummy encoding (first level dropped); the resulting
# 'm' column is what later cells refer to.
X = pd.concat([X.drop(['Sex'], axis=1), pd.get_dummies(X.Sex, drop_first=True)], axis=1)

# Fill missing lab values: median for ALB/ALP/ALT, mean for CHOL/PROT.
for lab in ('ALB', 'ALP', 'ALT'):
    X[lab] = X[lab].fillna(X[lab].median())
for lab in ('CHOL', 'PROT'):
    X[lab] = X[lab].fillna(X[lab].mean())

# Encode the category strings as ordinal codes in a single 'category' column.
enc = OrdinalEncoder()
y = pd.DataFrame(enc.fit_transform(category), columns=['category'])

# Categorical Column 'Sex'

In [None]:
# Maps classifier name -> {scenario label -> mean cross-validated accuracy}.
categorical_result = dict()

## Logistic Regression

In [None]:
# Empty result row for logistic regression (filled by the next two cells).
categorical_result['Logistic Regresssion'] = dict()

### Keeping 'Sex'

In [None]:
# Elastic-net logistic regression on every feature, including the 'm' dummy
# derived from 'Sex'. Features are min-max scaled before cross-validation.
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

lrcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
lr = LogisticRegression(
    penalty='elasticnet', solver='saga', l1_ratio=0.5, C=1.25,
    fit_intercept=True, max_iter=50, random_state=221
)

lrcv_results = cross_validate(lr, scaled_X, y['category'], scoring='accuracy', cv=lrcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = lrcv_results['test_score']
categorical_result['Logistic Regresssion']["Keeping 'Sex'"] = sum(fold_scores)/len(fold_scores)

### After removing 'Sex'



In [None]:
# Same logistic-regression setup as the 'Keeping' cell, but without the 'm'
# dummy column, so the two scores differ only in the presence of 'Sex'.
X_dropped = X.drop(columns=['m'])
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

lrcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
lr = LogisticRegression(
    penalty='elasticnet', solver='saga', l1_ratio=0.5, C=1.25,
    fit_intercept=True, max_iter=50, random_state=221
)

lrcv_results = cross_validate(lr, scaled_X, y['category'], scoring='accuracy', cv=lrcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = lrcv_results['test_score']
categorical_result['Logistic Regresssion']["Removing 'Sex'"] = sum(fold_scores)/len(fold_scores)

## k-nearest neighbors

In [None]:
# Empty result row for k-nearest neighbors (filled by the next two cells).
categorical_result['k-nearest neighbors'] = dict()

### Keeping 'Sex'

In [None]:
# k-nearest neighbors (k=10, Minkowski p=4) on every feature, including the
# 'm' dummy derived from 'Sex'.
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

kncv = StratifiedKFold(n_splits=10, shuffle=True, random_state=182)
knclf = KNeighborsClassifier(n_neighbors=10, p=4, algorithm='ball_tree')

kncv_results = cross_validate(knclf, scaled_X, y['category'], scoring='accuracy', cv=kncv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = kncv_results['test_score']
categorical_result['k-nearest neighbors']["Keeping 'Sex'"] = sum(fold_scores)/len(fold_scores)

### After removing 'Sex'



In [None]:
# Same k-nearest-neighbors setup as the 'Keeping' cell, minus the 'm' dummy column.
X_dropped = X.drop(columns=['m'])
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

kncv = StratifiedKFold(n_splits=10, shuffle=True, random_state=182)
knclf = KNeighborsClassifier(n_neighbors=10, p=4, algorithm='ball_tree')

kncv_results = cross_validate(knclf, scaled_X, y['category'], scoring='accuracy', cv=kncv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = kncv_results['test_score']
categorical_result['k-nearest neighbors']["Removing 'Sex'"] = sum(fold_scores)/len(fold_scores)

## Gaussian naive Bayes

In [None]:
# Empty result row for Gaussian naive Bayes (filled by the next two cells).
categorical_result['Gaussian naive Bayes'] = dict()

### Keeping 'Sex'

In [None]:
# Gaussian naive Bayes on every feature, including the 'm' dummy derived from 'Sex'.
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

gnbcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=302)
gnb = GaussianNB(var_smoothing=1e-10)

gnbcv_results = cross_validate(gnb, scaled_X, y['category'], scoring='accuracy', cv=gnbcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = gnbcv_results['test_score']
categorical_result['Gaussian naive Bayes']["Keeping 'Sex'"] = sum(fold_scores)/len(fold_scores)

### After removing 'Sex'



In [None]:
# Same Gaussian naive Bayes setup as the 'Keeping' cell, minus the 'm' dummy column.
X_dropped = X.drop(columns=['m'])
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

gnbcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=302)
gnb = GaussianNB(var_smoothing=1e-10)

gnbcv_results = cross_validate(gnb, scaled_X, y['category'], scoring='accuracy', cv=gnbcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = gnbcv_results['test_score']
categorical_result['Gaussian naive Bayes']["Removing 'Sex'"] = sum(fold_scores)/len(fold_scores)

## Decision Tree

In [None]:
# Empty result row for the decision tree (filled by the next two cells).
categorical_result['Decision Tree'] = dict()

### Keeping 'Sex'

In [None]:
# Entropy-based decision tree on every feature, including the 'm' dummy
# derived from 'Sex'.
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

dtcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=985)
dtclf = DecisionTreeClassifier(
    criterion='entropy', splitter='random', max_depth=7, max_features=None,
    min_samples_split=0.4, min_samples_leaf=3, random_state=559
)

dtcv_results = cross_validate(dtclf, scaled_X, y['category'], scoring='accuracy', cv=dtcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = dtcv_results['test_score']
categorical_result['Decision Tree']["Keeping 'Sex'"] = sum(fold_scores)/len(fold_scores)

### After removing 'Sex'



In [None]:
# Same decision-tree setup as the 'Keeping' cell, minus the 'm' dummy column.
X_dropped = X.drop(columns=['m'])
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

dtcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=985)
dtclf = DecisionTreeClassifier(
    criterion='entropy', splitter='random', max_depth=7, max_features=None,
    min_samples_split=0.4, min_samples_leaf=3, random_state=559
)

dtcv_results = cross_validate(dtclf, scaled_X, y['category'], scoring='accuracy', cv=dtcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = dtcv_results['test_score']
categorical_result['Decision Tree']["Removing 'Sex'"] = sum(fold_scores)/len(fold_scores)

## Random Forest

In [None]:
# Empty result row for the random forest (filled by the next two cells).
categorical_result['Random Forest'] = dict()

### Keeping 'Sex'

In [None]:
# 50-tree random forest on every feature, including the 'm' dummy derived from 'Sex'.
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

rfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=309)
rfclf = RandomForestClassifier(
    n_estimators=50, criterion='gini', max_features='sqrt',
    min_samples_split=5, min_samples_leaf=2, random_state=67
)

rfcv_results = cross_validate(rfclf, scaled_X, y['category'], scoring='accuracy', cv=rfcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = rfcv_results['test_score']
categorical_result['Random Forest']["Keeping 'Sex'"] = sum(fold_scores)/len(fold_scores)

### After removing 'Sex'



In [None]:
# Same random-forest setup as the 'Keeping' cell, minus the 'm' dummy column.
X_dropped = X.drop(columns=['m'])
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

rfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=309)
rfclf = RandomForestClassifier(
    n_estimators=50, criterion='gini', max_features='sqrt',
    min_samples_split=5, min_samples_leaf=2, random_state=67
)

rfcv_results = cross_validate(rfclf, scaled_X, y['category'], scoring='accuracy', cv=rfcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = rfcv_results['test_score']
categorical_result['Random Forest']["Removing 'Sex'"] = sum(fold_scores)/len(fold_scores)

## Support Vector Machine

In [None]:
# Empty result row for the support vector machine (filled by the next two cells).
categorical_result['Support Vector Machine'] = dict()

### Keeping 'Sex'

In [None]:
# Degree-2 polynomial-kernel SVM on every feature, including the 'm' dummy
# derived from 'Sex'.
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

svccv = StratifiedKFold(n_splits=10, shuffle=True, random_state=911)
svc = SVC(
    kernel='poly', degree=2, coef0=0.3, gamma='scale',
    C=0.1, shrinking=True, random_state=98
)

svccv_results = cross_validate(svc, scaled_X, y['category'], scoring='accuracy', cv=svccv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = svccv_results['test_score']
categorical_result['Support Vector Machine']["Keeping 'Sex'"] = sum(fold_scores)/len(fold_scores)

### After removing 'Sex'



In [None]:
# Degree-2 polynomial-kernel SVM evaluated without the 'm' dummy column
# (i.e. with 'Sex' removed), using the same estimator settings and fold seed
# as the 'Keeping' cell.
X_dropped = X.drop('m', axis=1)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

svc = SVC(
    C=0.1,
    coef0=0.3,
    degree=2,
    gamma='scale',
    kernel='poly',
    random_state=98,
    shrinking=True
    )

svccv = StratifiedKFold(n_splits=10, shuffle=True, random_state=911)

svccv_results = cross_validate(
    svc,
    scaled_X,
    y['category'],
    cv=svccv,
    scoring='accuracy',
    n_jobs=-1
    )

# Fix: average the SVM fold scores. The previous version averaged
# rfcv_results (the Random Forest scores), so the SVM entry silently
# duplicated the Random Forest result.
categorical_result['Support Vector Machine']["Removing 'Sex'"] = sum(svccv_results['test_score'])/len(svccv_results['test_score'])

## Multi-layer Perceptron

In [None]:
# Empty result row for the multi-layer perceptron (filled by the next two cells).
categorical_result['Mutli-layer Perceptron'] = dict()

### Keeping 'Sex'

In [None]:
# Three-hidden-layer (32, 32, 32) perceptron on every feature, including the
# 'm' dummy derived from 'Sex'.
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

mlpclfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=496)
mlpclf = MLPClassifier(
    hidden_layer_sizes=(32, 32, 32), activation='relu', solver='lbfgs',
    batch_size=8, max_iter=500, early_stopping=False, random_state=377
)

mlpclfcv_results = cross_validate(mlpclf, scaled_X, y['category'], scoring='accuracy', cv=mlpclfcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = mlpclfcv_results['test_score']
categorical_result['Mutli-layer Perceptron']["Keeping 'Sex'"] = sum(fold_scores)/len(fold_scores)

### After removing 'Sex'



In [None]:
# Same perceptron setup as the 'Keeping' cell, minus the 'm' dummy column.
X_dropped = X.drop(columns=['m'])
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

mlpclfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=496)
mlpclf = MLPClassifier(
    hidden_layer_sizes=(32, 32, 32), activation='relu', solver='lbfgs',
    batch_size=8, max_iter=500, early_stopping=False, random_state=377
)

mlpclfcv_results = cross_validate(mlpclf, scaled_X, y['category'], scoring='accuracy', cv=mlpclfcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = mlpclfcv_results['test_score']
categorical_result['Mutli-layer Perceptron']["Removing 'Sex'"] = sum(fold_scores)/len(fold_scores)

## Result

In [None]:
# One row per classifier, one column per scenario ("Keeping"/"Removing" 'Sex').
pd.DataFrame(list(categorical_result.values()), index=list(categorical_result.keys()))

Unnamed: 0,Keeping 'Sex',Removing 'Sex'
Logistic Regresssion,0.692857,0.760714
k-nearest neighbors,0.669643,0.764286
Gaussian naive Bayes,0.708929,0.696429
Decision Tree,0.571429,0.625
Random Forest,0.678571,0.760714
Support Vector Machine,0.641071,0.760714
Mutli-layer Perceptron,0.6125,0.582143


Five of the seven classifiers score higher without 'Sex', so we conclude that 'Sex' should be removed.

In [None]:
# Running list of feature columns rejected so far; 'm' is the dummy column
# that encodes 'Sex'.
to_remove_features = []
to_remove_features.append('m')
to_remove_features

['m']

# ANOVA F-value

In [None]:
# ANOVA F-value of each feature (all columns except the 'm' dummy) against
# the class label; f_classif returns (F-values, p-values) and only the
# F-values are kept.
anova_features = X.drop(['m'], axis=1)
anova_f_values, _anova_p_values = f_classif(anova_features, y['category'])

linear_corr = pd.Series(anova_f_values, index=anova_features.columns)
linear_corr

Age     15.299046
ALB     43.959767
ALP      6.677791
ALT      4.877729
AST      1.842203
BIL      8.920381
CHE     48.276278
CHOL     6.290820
CREA     2.095414
GGT      1.386249
PROT     5.837231
dtype: float64

In [None]:
# Maps classifier name -> {ANOVA F-value cutoff -> mean cross-validated accuracy}.
anova_result = dict()

## Logistic Regression

In [None]:
# Working copy of the rejected features (the shared list stays untouched)
# and an empty result row for logistic regression.
cols_removed = list(to_remove_features)
anova_result['Logistic Regresssion'] = dict()

In [None]:
# Baseline for the ANOVA sweep: logistic regression with only the columns in
# cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

lrcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
lr = LogisticRegression(
    penalty='elasticnet', solver='saga', l1_ratio=0.5, C=1.25,
    fit_intercept=True, max_iter=50, random_state=221
)

lrcv_results = cross_validate(lr, scaled_X, y['category'], scoring='accuracy', cv=lrcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = lrcv_results['test_score']
anova_result['Logistic Regresssion'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by ANOVA F-value: drop features one at a time,
# lowest F-value first (the highest-scoring feature is never dropped), and
# record the mean 10-fold CV accuracy after each removal, keyed by the
# F-value of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
lr = LogisticRegression(
    C=1.25,
    fit_intercept=True,
    l1_ratio=0.5,
    max_iter=50,
    penalty='elasticnet',
    random_state=221,
    solver='saga'
    )
lrcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, anova_val in linear_corr.sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    lrcv_results = cross_validate(
        lr,
        scaled_X,
        y['category'],
        cv=lrcv,
        scoring='accuracy',
        n_jobs=-1
        )

    anova_result['Logistic Regresssion'][anova_val] = sum(lrcv_results['test_score'])/len(lrcv_results['test_score'])

## k-nearest neighbors

In [None]:
# Working copy of the rejected features and an empty result row for
# k-nearest neighbors.
cols_removed = list(to_remove_features)
anova_result['k-nearest neighbors'] = dict()

In [None]:
# Baseline for the ANOVA sweep: k-nearest neighbors with only the columns in
# cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

kncv = StratifiedKFold(n_splits=10, shuffle=True, random_state=182)
knclf = KNeighborsClassifier(n_neighbors=10, p=4, algorithm='ball_tree')

kncv_results = cross_validate(knclf, scaled_X, y['category'], scoring='accuracy', cv=kncv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = kncv_results['test_score']
anova_result['k-nearest neighbors'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by ANOVA F-value for k-nearest neighbors:
# drop features lowest F-value first (the highest-scoring feature is never
# dropped) and record the mean 10-fold CV accuracy after each removal, keyed
# by the F-value of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
knclf = KNeighborsClassifier(
    n_neighbors=10,
    algorithm='ball_tree',
    p=4
    )
kncv = StratifiedKFold(n_splits=10, shuffle=True, random_state=182)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, anova_val in linear_corr.sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    kncv_results = cross_validate(
        knclf,
        scaled_X,
        y['category'],
        cv=kncv,
        scoring='accuracy',
        n_jobs=-1
        )

    anova_result['k-nearest neighbors'][anova_val] = sum(kncv_results['test_score'])/len(kncv_results['test_score'])

## Gaussian naive Bayes

In [None]:
# Working copy of the rejected features and an empty result row for
# Gaussian naive Bayes.
cols_removed = list(to_remove_features)
anova_result['Gaussian naive Bayes'] = dict()

In [None]:
# Baseline for the ANOVA sweep: Gaussian naive Bayes with only the columns in
# cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

gnbcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=302)
gnb = GaussianNB(var_smoothing=1e-10)

gnbcv_results = cross_validate(gnb, scaled_X, y['category'], scoring='accuracy', cv=gnbcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = gnbcv_results['test_score']
anova_result['Gaussian naive Bayes'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by ANOVA F-value for Gaussian naive Bayes:
# drop features lowest F-value first (the highest-scoring feature is never
# dropped) and record the mean 10-fold CV accuracy after each removal, keyed
# by the F-value of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
gnb = GaussianNB(var_smoothing=1e-10)
gnbcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=302)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, anova_val in linear_corr.sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    gnbcv_results = cross_validate(
        gnb,
        scaled_X,
        y['category'],
        cv=gnbcv,
        scoring='accuracy',
        n_jobs=-1
        )

    anova_result['Gaussian naive Bayes'][anova_val] = sum(gnbcv_results['test_score'])/len(gnbcv_results['test_score'])

## Decision Tree

In [None]:
# Working copy of the rejected features and an empty result row for the
# decision tree.
cols_removed = list(to_remove_features)
anova_result['Decision Tree'] = dict()

In [None]:
# Baseline for the ANOVA sweep: decision tree with only the columns in
# cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

dtcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=985)
dtclf = DecisionTreeClassifier(
    criterion='entropy', splitter='random', max_depth=7, max_features=None,
    min_samples_split=0.4, min_samples_leaf=3, random_state=559
)

dtcv_results = cross_validate(dtclf, scaled_X, y['category'], scoring='accuracy', cv=dtcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = dtcv_results['test_score']
anova_result['Decision Tree'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by ANOVA F-value for the decision tree: drop
# features lowest F-value first (the highest-scoring feature is never
# dropped) and record the mean 10-fold CV accuracy after each removal, keyed
# by the F-value of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
dtclf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=7,
    max_features=None,
    min_samples_leaf=3,
    min_samples_split=0.4,
    random_state=559,
    splitter='random'
    )
dtcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=985)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, anova_val in linear_corr.sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    dtcv_results = cross_validate(
        dtclf,
        scaled_X,
        y['category'],
        cv=dtcv,
        scoring='accuracy',
        n_jobs=-1
        )

    anova_result['Decision Tree'][anova_val] = sum(dtcv_results['test_score'])/len(dtcv_results['test_score'])

## Random Forest

In [None]:
# Working copy of the rejected features and an empty result row for the
# random forest.
cols_removed = list(to_remove_features)
anova_result['Random Forest'] = dict()

In [None]:
# Baseline for the ANOVA sweep: random forest with only the columns in
# cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

rfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=309)
rfclf = RandomForestClassifier(
    n_estimators=50, criterion='gini', max_features='sqrt',
    min_samples_split=5, min_samples_leaf=2, random_state=67
)

rfcv_results = cross_validate(rfclf, scaled_X, y['category'], scoring='accuracy', cv=rfcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = rfcv_results['test_score']
anova_result['Random Forest'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by ANOVA F-value for the random forest: drop
# features lowest F-value first (the highest-scoring feature is never
# dropped) and record the mean 10-fold CV accuracy after each removal, keyed
# by the F-value of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
rfclf = RandomForestClassifier(
    criterion='gini',
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=50,
    random_state=67
    )
rfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=309)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, anova_val in linear_corr.sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    rfcv_results = cross_validate(
        rfclf,
        scaled_X,
        y['category'],
        cv=rfcv,
        scoring='accuracy',
        n_jobs=-1
        )

    anova_result['Random Forest'][anova_val] = sum(rfcv_results['test_score'])/len(rfcv_results['test_score'])

## Support Vector Machine

In [None]:
# Working copy of the rejected features and an empty result row for the
# support vector machine.
cols_removed = list(to_remove_features)
anova_result['Support Vector Machine'] = dict()

In [None]:
# Baseline for the ANOVA sweep: polynomial-kernel SVM with only the columns
# in cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

svccv = StratifiedKFold(n_splits=10, shuffle=True, random_state=911)
svc = SVC(
    kernel='poly', degree=2, coef0=0.3, gamma='scale',
    C=0.1, shrinking=True, random_state=98
)

svccv_results = cross_validate(svc, scaled_X, y['category'], scoring='accuracy', cv=svccv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = svccv_results['test_score']
anova_result['Support Vector Machine'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by ANOVA F-value for the SVM: drop features
# lowest F-value first (the highest-scoring feature is never dropped) and
# record the mean 10-fold CV accuracy after each removal, keyed by the
# F-value of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
svc = SVC(
    C=0.1,
    coef0=0.3,
    degree=2,
    gamma='scale',
    kernel='poly',
    random_state=98,
    shrinking=True
    )
svccv = StratifiedKFold(n_splits=10, shuffle=True, random_state=911)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, anova_val in linear_corr.sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    svccv_results = cross_validate(
        svc,
        scaled_X,
        y['category'],
        cv=svccv,
        scoring='accuracy',
        n_jobs=-1
        )

    anova_result['Support Vector Machine'][anova_val] = sum(svccv_results['test_score'])/len(svccv_results['test_score'])

## Multi-layer Perceptron

In [None]:
# Working copy of the rejected features and an empty result row for the
# multi-layer perceptron.
cols_removed = list(to_remove_features)
anova_result['Mutli-layer Perceptron'] = dict()

In [None]:
# Baseline for the ANOVA sweep: multi-layer perceptron with only the columns
# in cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

mlpclfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=496)
mlpclf = MLPClassifier(
    hidden_layer_sizes=(32, 32, 32), activation='relu', solver='lbfgs',
    batch_size=8, max_iter=500, early_stopping=False, random_state=377
)

mlpclfcv_results = cross_validate(mlpclf, scaled_X, y['category'], scoring='accuracy', cv=mlpclfcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = mlpclfcv_results['test_score']
anova_result['Mutli-layer Perceptron'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by ANOVA F-value for the multi-layer
# perceptron: drop features lowest F-value first (the highest-scoring feature
# is never dropped) and record the mean 10-fold CV accuracy after each
# removal, keyed by the F-value of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
mlpclf = MLPClassifier(
    batch_size=8,
    activation='relu',
    early_stopping=False,
    hidden_layer_sizes=(32, 32, 32),
    max_iter=500,
    solver='lbfgs',
    random_state=377
    )
mlpclfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=496)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, anova_val in linear_corr.sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    mlpclfcv_results = cross_validate(
        mlpclf,
        scaled_X,
        y['category'],
        cv=mlpclfcv,
        scoring='accuracy',
        n_jobs=-1
        )

    anova_result['Mutli-layer Perceptron'][anova_val] = sum(mlpclfcv_results['test_score'])/len(mlpclfcv_results['test_score'])

## Result

In [None]:
# One row per classifier, one column per ANOVA F-value cutoff (0 = baseline).
pd.DataFrame(list(anova_result.values()), index=list(anova_result.keys()))

Unnamed: 0,0.000000,1.386249,1.842203,2.095414,4.877729,5.837231,6.290820,6.677791,8.920381,15.299046,43.959767
Logistic Regresssion,0.760714,0.789286,0.789286,0.775,0.735714,0.735714,0.708929,0.708929,0.708929,0.6125,0.6125
k-nearest neighbors,0.764286,0.778571,0.766071,0.791071,0.7875,0.75,0.75,0.698214,0.7375,0.6875,0.5875
Gaussian naive Bayes,0.696429,0.708929,0.723214,0.75,0.721429,0.746429,0.773214,0.735714,0.723214,0.671429,0.671429
Decision Tree,0.625,0.707143,0.616071,0.555357,0.676786,0.65,0.642857,0.625,0.641071,0.625,0.555357
Random Forest,0.760714,0.735714,0.708929,0.775,0.746429,0.746429,0.760714,0.732143,0.721429,0.691071,0.578571
Support Vector Machine,0.721429,0.735714,0.7625,0.773214,0.733929,0.764286,0.735714,0.735714,0.725,0.657143,0.6625
Mutli-layer Perceptron,0.582143,0.707143,0.748214,0.666071,0.639286,0.516071,0.598214,0.544643,0.639286,0.569643,0.589286


We conclude that an ANOVA F-value cutoff of 1.84 gives the best results, so the features 'GGT' and 'AST' are removed.

In [None]:
# Add the features rejected by the ANOVA sweep. Only names not already in the
# list are appended (in place), so re-running this cell does not create
# duplicate entries.
newly_rejected = [feat for feat in ['GGT', 'AST'] if feat not in to_remove_features]
to_remove_features.extend(newly_rejected)
to_remove_features

['m', 'GGT', 'AST']

# Kendall's τ coefficient

In [None]:
# Kendall's tau between each remaining feature (everything not listed in
# to_remove_features) and the class label.
kendall_columns = X.drop(to_remove_features, axis=1).columns
kendall_corr = [kendalltau(X[col], y).correlation for col in kendall_columns]

non_linear_corr = pd.Series(kendall_corr, index=kendall_columns)
non_linear_corr

Age     0.377397
ALB    -0.597778
ALP     0.410298
ALT    -0.217547
BIL     0.350095
CHE    -0.580503
CHOL   -0.300940
CREA    0.003535
PROT   -0.191451
dtype: float64

In [None]:
# Maps classifier name -> {|Kendall tau| cutoff -> mean cross-validated accuracy}.
kendall_result = dict()

## Logistic Regression

In [None]:
# Working copy of the rejected features (the shared list stays untouched)
# and an empty result row for logistic regression.
cols_removed = list(to_remove_features)
kendall_result['Logistic Regresssion'] = dict()

In [None]:
# Baseline for the Kendall sweep: logistic regression with only the columns
# in cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

lrcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
lr = LogisticRegression(
    penalty='elasticnet', solver='saga', l1_ratio=0.5, C=1.25,
    fit_intercept=True, max_iter=50, random_state=221
)

lrcv_results = cross_validate(lr, scaled_X, y['category'], scoring='accuracy', cv=lrcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = lrcv_results['test_score']
kendall_result['Logistic Regresssion'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by |Kendall's tau|: drop features one at a
# time, smallest absolute correlation first (the most correlated feature is
# never dropped), and record the mean 10-fold CV accuracy after each removal,
# keyed by the |tau| of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
lr = LogisticRegression(
    C=1.25,
    fit_intercept=True,
    l1_ratio=0.5,
    max_iter=50,
    penalty='elasticnet',
    random_state=221,
    solver='saga'
    )
lrcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, kendall_val in non_linear_corr.abs().sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    lrcv_results = cross_validate(
        lr,
        scaled_X,
        y['category'],
        cv=lrcv,
        scoring='accuracy',
        n_jobs=-1
        )

    kendall_result['Logistic Regresssion'][kendall_val] = sum(lrcv_results['test_score'])/len(lrcv_results['test_score'])

## k-nearest neighbors

In [None]:
# Working copy of the rejected features and an empty result row for
# k-nearest neighbors.
cols_removed = list(to_remove_features)
kendall_result['k-nearest neighbors'] = dict()

In [None]:
# Baseline for the Kendall sweep: k-nearest neighbors with only the columns
# in cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

kncv = StratifiedKFold(n_splits=10, shuffle=True, random_state=182)
knclf = KNeighborsClassifier(n_neighbors=10, p=4, algorithm='ball_tree')

kncv_results = cross_validate(knclf, scaled_X, y['category'], scoring='accuracy', cv=kncv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = kncv_results['test_score']
kendall_result['k-nearest neighbors'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by |Kendall's tau| for k-nearest neighbors:
# drop features smallest absolute correlation first (the most correlated
# feature is never dropped) and record the mean 10-fold CV accuracy after
# each removal, keyed by the |tau| of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
knclf = KNeighborsClassifier(
    n_neighbors=10,
    algorithm='ball_tree',
    p=4
    )
kncv = StratifiedKFold(n_splits=10, shuffle=True, random_state=182)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, kendall_val in non_linear_corr.abs().sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    kncv_results = cross_validate(
        knclf,
        scaled_X,
        y['category'],
        cv=kncv,
        scoring='accuracy',
        n_jobs=-1
        )

    kendall_result['k-nearest neighbors'][kendall_val] = sum(kncv_results['test_score'])/len(kncv_results['test_score'])

## Gaussian naive Bayes

In [None]:
# Working copy of the rejected features and an empty result row for
# Gaussian naive Bayes.
cols_removed = list(to_remove_features)
kendall_result['Gaussian naive Bayes'] = dict()

In [None]:
# Baseline for the Kendall sweep: Gaussian naive Bayes with only the columns
# in cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

gnbcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=302)
gnb = GaussianNB(var_smoothing=1e-10)

gnbcv_results = cross_validate(gnb, scaled_X, y['category'], scoring='accuracy', cv=gnbcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = gnbcv_results['test_score']
kendall_result['Gaussian naive Bayes'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by |Kendall's tau| for Gaussian naive Bayes:
# drop features smallest absolute correlation first (the most correlated
# feature is never dropped) and record the mean 10-fold CV accuracy after
# each removal, keyed by the |tau| of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
gnb = GaussianNB(var_smoothing=1e-10)
gnbcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=302)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, kendall_val in non_linear_corr.abs().sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    gnbcv_results = cross_validate(
        gnb,
        scaled_X,
        y['category'],
        cv=gnbcv,
        scoring='accuracy',
        n_jobs=-1
        )

    kendall_result['Gaussian naive Bayes'][kendall_val] = sum(gnbcv_results['test_score'])/len(gnbcv_results['test_score'])

## Decision Tree

In [None]:
# Working copy of the rejected features and an empty result row for the
# decision tree.
cols_removed = list(to_remove_features)
kendall_result['Decision Tree'] = dict()

In [None]:
# Baseline for the Kendall sweep: decision tree with only the columns in
# cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

dtcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=985)
dtclf = DecisionTreeClassifier(
    criterion='entropy', splitter='random', max_depth=7, max_features=None,
    min_samples_split=0.4, min_samples_leaf=3, random_state=559
)

dtcv_results = cross_validate(dtclf, scaled_X, y['category'], scoring='accuracy', cv=dtcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = dtcv_results['test_score']
kendall_result['Decision Tree'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by |Kendall's tau| for the decision tree: drop
# features smallest absolute correlation first (the most correlated feature
# is never dropped) and record the mean 10-fold CV accuracy after each
# removal, keyed by the |tau| of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
dtclf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=7,
    max_features=None,
    min_samples_leaf=3,
    min_samples_split=0.4,
    random_state=559,
    splitter='random'
    )
dtcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=985)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, kendall_val in non_linear_corr.abs().sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    dtcv_results = cross_validate(
        dtclf,
        scaled_X,
        y['category'],
        cv=dtcv,
        scoring='accuracy',
        n_jobs=-1
        )

    kendall_result['Decision Tree'][kendall_val] = sum(dtcv_results['test_score'])/len(dtcv_results['test_score'])

## Random Forest

In [None]:
# Working copy of the rejected features and an empty result row for the
# random forest.
cols_removed = list(to_remove_features)
kendall_result['Random Forest'] = dict()

In [None]:
# Baseline for the Kendall sweep: random forest with only the columns in
# cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

rfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=309)
rfclf = RandomForestClassifier(
    n_estimators=50, criterion='gini', max_features='sqrt',
    min_samples_split=5, min_samples_leaf=2, random_state=67
)

rfcv_results = cross_validate(rfclf, scaled_X, y['category'], scoring='accuracy', cv=rfcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = rfcv_results['test_score']
kendall_result['Random Forest'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by |Kendall's tau| for the random forest: drop
# features smallest absolute correlation first (the most correlated feature
# is never dropped) and record the mean 10-fold CV accuracy after each
# removal, keyed by the |tau| of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
rfclf = RandomForestClassifier(
    criterion='gini',
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=50,
    random_state=67
    )
rfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=309)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, kendall_val in non_linear_corr.abs().sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    rfcv_results = cross_validate(
        rfclf,
        scaled_X,
        y['category'],
        cv=rfcv,
        scoring='accuracy',
        n_jobs=-1
        )

    kendall_result['Random Forest'][kendall_val] = sum(rfcv_results['test_score'])/len(rfcv_results['test_score'])

## Support Vector Machine

In [None]:
# Working copy of the rejected features and an empty result row for the
# support vector machine.
cols_removed = list(to_remove_features)
kendall_result['Support Vector Machine'] = dict()

In [None]:
# Baseline for the Kendall sweep: polynomial-kernel SVM with only the columns
# in cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

svccv = StratifiedKFold(n_splits=10, shuffle=True, random_state=911)
svc = SVC(
    kernel='poly', degree=2, coef0=0.3, gamma='scale',
    C=0.1, shrinking=True, random_state=98
)

svccv_results = cross_validate(svc, scaled_X, y['category'], scoring='accuracy', cv=svccv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = svccv_results['test_score']
kendall_result['Support Vector Machine'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by |Kendall's tau| for the SVM: drop features
# smallest absolute correlation first (the most correlated feature is never
# dropped) and record the mean 10-fold CV accuracy after each removal, keyed
# by the |tau| of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
svc = SVC(
    C=0.1,
    coef0=0.3,
    degree=2,
    gamma='scale',
    kernel='poly',
    random_state=98,
    shrinking=True
    )
svccv = StratifiedKFold(n_splits=10, shuffle=True, random_state=911)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, kendall_val in non_linear_corr.abs().sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    svccv_results = cross_validate(
        svc,
        scaled_X,
        y['category'],
        cv=svccv,
        scoring='accuracy',
        n_jobs=-1
        )

    kendall_result['Support Vector Machine'][kendall_val] = sum(svccv_results['test_score'])/len(svccv_results['test_score'])

## Multi-layer Perceptron

In [None]:
# Working copy of the rejected features and an empty result row for the
# multi-layer perceptron.
cols_removed = list(to_remove_features)
kendall_result['Mutli-layer Perceptron'] = dict()

In [None]:
# Baseline for the Kendall sweep: multi-layer perceptron with only the
# columns in cols_removed dropped. Stored under key 0.
X_dropped = X.drop(columns=cols_removed)
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X_dropped)

mlpclfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=496)
mlpclf = MLPClassifier(
    hidden_layer_sizes=(32, 32, 32), activation='relu', solver='lbfgs',
    batch_size=8, max_iter=500, early_stopping=False, random_state=377
)

mlpclfcv_results = cross_validate(mlpclf, scaled_X, y['category'], scoring='accuracy', cv=mlpclfcv, n_jobs=-1)

# Store the mean test accuracy over the 10 folds.
fold_scores = mlpclfcv_results['test_score']
kendall_result['Mutli-layer Perceptron'][0] = sum(fold_scores)/len(fold_scores)

In [None]:
# Backward elimination ranked by |Kendall's tau| for the multi-layer
# perceptron: drop features smallest absolute correlation first (the most
# correlated feature is never dropped) and record the mean 10-fold CV
# accuracy after each removal, keyed by the |tau| of the feature removed last.

# Rebuild the removal list here so re-running this cell does not keep
# appending to a list left over from a previous run.
cols_removed = to_remove_features.copy()

# Loop-invariant setup: cross_validate clones the estimator for every fold,
# and the integer random_state makes the splitter yield the same folds on
# every call.
mlpclf = MLPClassifier(
    batch_size=8,
    activation='relu',
    early_stopping=False,
    hidden_layer_sizes=(32, 32, 32),
    max_iter=500,
    random_state=377,
    solver='lbfgs'
    )
mlpclfcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=496)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for col, kendall_val in non_linear_corr.abs().sort_values().iloc[:-1].items():
    cols_removed.append(col)
    X_dropped = X.drop(cols_removed, axis=1)
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X_dropped)

    mlpclfcv_results = cross_validate(
        mlpclf,
        scaled_X,
        y['category'],
        cv=mlpclfcv,
        scoring='accuracy',
        n_jobs=-1
        )

    kendall_result['Mutli-layer Perceptron'][kendall_val] = sum(mlpclfcv_results['test_score'])/len(mlpclfcv_results['test_score'])

## Result

In [None]:
# One row per classifier, one column per |Kendall tau| cutoff (0 = baseline).
pd.DataFrame(list(kendall_result.values()), index=list(kendall_result.keys()))

Unnamed: 0,0.000000,0.003535,0.191451,0.217547,0.300940,0.350095,0.377397,0.410298,0.580503
Logistic Regresssion,0.789286,0.775,0.803571,0.735714,0.708929,0.708929,0.6125,0.6125,0.65
k-nearest neighbors,0.766071,0.791071,0.816071,0.75,0.75,0.748214,0.664286,0.6875,0.653571
Gaussian naive Bayes,0.723214,0.75,0.7875,0.746429,0.773214,0.733929,0.639286,0.671429,0.621429
Decision Tree,0.616071,0.555357,0.748214,0.65,0.642857,0.625,0.619643,0.625,0.401786
Random Forest,0.708929,0.775,0.8,0.746429,0.760714,0.733929,0.65,0.691071,0.623214
Support Vector Machine,0.7625,0.773214,0.817857,0.764286,0.735714,0.733929,0.682143,0.657143,0.614286
Mutli-layer Perceptron,0.748214,0.666071,0.867857,0.516071,0.598214,0.707143,0.605357,0.569643,0.626786


We conclude that a Kendall's τ cutoff of 0.191451 gives the best results, so the features 'CREA' and 'PROT' are removed.

In [None]:
# Add the features rejected by the Kendall sweep. Only names not already in
# the list are appended (in place), so re-running this cell does not create
# duplicate entries.
newly_rejected = [feat for feat in ['CREA', 'PROT'] if feat not in to_remove_features]
to_remove_features.extend(newly_rejected)
to_remove_features

['m', 'GGT', 'AST', 'CREA', 'PROT']