In [None]:
import pickle
import os
import pandas as pd
import seaborn as sn
import numpy as np
np.random.seed(42)
from sklearn.tree import export_graphviz
from sklearn import utils
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.model_selection import train_test_split as splt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef as mcc 
from sklearn.metrics import accuracy_score as acc
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from scipy import sparse
import matplotlib.pyplot as plt

In [None]:
# Load the spreadsheet into a DataFrame; its "SCORE" column is the source of the
# target labels built in the next cell.
dfx = pd.read_excel("Milano_Cleaned_wSCORE.xlsx")

In [None]:
# Bin SCORE/100 into four ordinal classes:
#   1: < 0.01    2: [0.01, 0.02)    3: [0.02, 0.05)    4: everything else
y = dfx["SCORE"].values / 100
y_cat = np.select(
    [y < 0.01, y < 0.02, y < 0.05],  # first matching condition wins, as in an if/elif chain
    [1, 2, 3],
    default=4,
).astype(float)  # float labels, matching an np.zeros-initialised buffer

In [None]:
# Class balance: map each label present in y_cat to its number of samples.
unique, counts = np.unique(y_cat, return_counts=True)
{label: n_samples for label, n_samples in zip(unique, counts)}

In [None]:
# Feature table: keep only non-object / non-datetime columns, drop the target
# and the other excluded columns, then flag every missing cell with -1.
dfx = (
    dfx.select_dtypes(exclude=['object', 'datetime64'])
    .drop(columns=['SCORE', 'smoking', 'smoking_recod', 'LDL_calc', 'glucose', 'Unnamed: 0', 'Unnamed: 0.1'])
    .fillna(-1)
)

In [None]:
# Build the feature matrix: round to integers, median-impute the cells that
# were flagged as missing with -1, then scale each column so its maximum is 1.
X = np.around(dfx.values).astype(int)
imputer = SimpleImputer(missing_values=-1, strategy='median')
# fit_transform returns a NEW array and leaves its input untouched, so the
# result must be assigned back; otherwise the -1 placeholders are never replaced.
# NOTE(review): the imputer is fit on all rows before the cross-validation
# split, so test-fold values influence the medians; consider fitting per fold.
X = imputer.fit_transform(X)
print(X.shape)  # shape only; printing the whole matrix is not informative
X = X + 1
X = X[:, X.max(axis=0) > 0]  # drop columns whose maximum is not positive (avoids dividing by zero below)
X = X / X.max(axis=0)
X.max(axis=0)

In [None]:
def train_rfc(num_iter, X_tr, X_ts, y_tr, y_ts):
    """Fit a random forest on one split, save its confusion matrix, return the scores.

    Parameters
    ----------
    num_iter
        Identifier of the run; only used as the suffix of the saved PNG name.
    X_tr, X_ts
        Training / test feature matrices handed to ``fit`` / ``predict``.
    y_tr, y_ts
        Training / test labels. They must be the codes 1..4, because each
        label minus one is used as an index into the 4x4 confusion matrix.

    Returns
    -------
    tuple
        ``(accuracy, matthews_corrcoef)`` measured on the test split.

    Side effects: prints the four input shapes and writes
    ``Graphs/RFC_IMPUTED_CONFUSION_MATRIX_<num_iter>.png``.
    """
    clf = rfc(n_estimators = 1000, max_depth = 100, random_state = 42, class_weight="balanced_subsample")
    clf.fit(X_tr, y_tr)
    print(X_tr.shape)
    print(X_ts.shape)
    print(y_tr.shape)
    print(y_ts.shape)
    y_ts_pred = clf.predict(X_ts)
    mcc1 = mcc(y_ts, y_ts_pred)
    acc1 = acc(y_ts, y_ts_pred)

    # Rows are indexed by the PREDICTED class and columns by the ACTUAL class
    # (the transpose of sklearn.metrics.confusion_matrix).
    cmatrix = np.zeros((4,4))
    # zip pairs predictions and truths by position, independent of how y_ts is indexed.
    for predicted, actual in zip(y_ts_pred, y_ts):
        cmatrix[int(predicted)-1][int(actual)-1] += 1
    class_names = ["No", "Low", "Med", "High"]
    df_cm = pd.DataFrame(cmatrix, index = class_names, columns = class_names)

    fig, ax = plt.subplots(figsize = (10,7))
    sn.heatmap(df_cm, annot=True,  fmt='g', cmap="Blues", ax=ax)
    # Label the axes so the orientation of the matrix is unambiguous in the saved image.
    ax.set_xlabel("Actual class")
    ax.set_ylabel("Predicted class")
    # savefig does not create missing directories; make sure the target exists.
    os.makedirs("Graphs", exist_ok=True)
    fig.savefig("Graphs/RFC_IMPUTED_CONFUSION_MATRIX_" + str(num_iter) + ".png")
    return acc1, mcc1

In [None]:
# 8-fold stratified cross-validation: one random forest is trained and scored per fold.
# random_state is deliberately not passed: with shuffle=False it has no effect
# on the folds, and recent scikit-learn releases raise a ValueError when a
# random_state is combined with shuffle=False.
skf = StratifiedKFold(n_splits=8, shuffle=False)
metricss = []  # one (accuracy, mcc) tuple per fold, in fold order
for c, (train_index, test_index) in enumerate(skf.split(X, y_cat)):
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y_cat[train_index], y_cat[test_index]
    # The features are handed to the classifier as CSR sparse matrices.
    X_tr_sparse = sparse.csr_matrix(X_tr)
    X_ts_sparse = sparse.csr_matrix(X_ts)
    acc1, mcc1 = train_rfc(c, X_tr_sparse, X_ts_sparse, y_tr, y_ts)
    metricss.append((acc1, mcc1))

In [None]:
# Show the per-fold (accuracy, mcc) tuples collected by the cross-validation loop.
metricss

In [None]:
# Split the per-fold (accuracy, mcc) tuples into two parallel lists.
temp1 = [fold_acc for fold_acc, _ in metricss]  # accuracy of each fold
temp2 = [fold_mcc for _, fold_mcc in metricss]  # Matthews correlation coefficient of each fold

In [None]:
# Display both lists side by side: temp1 is built from the first element of each
# metricss tuple, temp2 from the second.
temp1, temp2