## !!! Baseline ML models !!!

In [1]:
import numpy as np
import pandas as pd

### Loading data
* Using normalized data from the pre-processing step.
* Target classes are encoded using lable encoding

In [2]:
d_f = pd.read_csv("processed-data/data-normlized.csv.gz")
d_f = d_f.drop(["Unnamed: 0"], axis=1)

In [3]:
d_f.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,label,p_id,...,State_PR,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO
0,0,0,0,0.35,0.25,0.596637,0.553054,0.297873,N180,37378,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0.3375,0.25,0.596637,0.553054,0.297873,N180,37378,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0.3625,0.25,0.596637,0.553054,0.297873,N180,37378,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0.325,0.25,0.596637,0.553054,0.297873,N180,37378,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0.35,0.25,0.596637,0.553054,0.297873,N180,37378,...,1,0,0,0,0,0,0,0,0,0


In [4]:
import warnings
warnings.filterwarnings("ignore")

d_f["label"] = d_f["label"].astype('category')
d_f["label_cat"] = d_f["label"].cat.codes

In [5]:
d_f.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,label,p_id,...,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO,label_cat
0,0,0,0,0.35,0.25,0.596637,0.553054,0.297873,N180,37378,...,0,0,0,0,0,0,0,0,0,7
1,0,0,0,0.3375,0.25,0.596637,0.553054,0.297873,N180,37378,...,0,0,0,0,0,0,0,0,0,7
2,0,0,0,0.3625,0.25,0.596637,0.553054,0.297873,N180,37378,...,0,0,0,0,0,0,0,0,0,7
3,0,0,0,0.325,0.25,0.596637,0.553054,0.297873,N180,37378,...,0,0,0,0,0,0,0,0,0,7
4,0,0,0,0.35,0.25,0.596637,0.553054,0.297873,N180,37378,...,0,0,0,0,0,0,0,0,0,7


In [6]:
df_s = d_f.sample(frac=0.1)
print(len(df_s))
print(len(d_f))
Y = df_s["label_cat"]
X = df_s.drop(['label', 'label_cat', 'p_id'], axis=1)

339338
3393385


In [7]:
X.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,Procedure_305010026,Procedure_305010069,...,State_PR,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO
329291,0,0,0,0.9125,0.25,0.596637,0.553054,0.297873,0,0,...,0,0,0,0,0,0,0,0,0,0
74341,1,0,0,0.5375,0.25,0.596638,0.553097,0.297873,0,0,...,0,0,0,0,0,0,0,0,0,0
1115207,0,0,0,0.925,0.25,0.596637,0.553068,0.297873,0,0,...,1,0,0,0,0,0,0,0,0,0
882626,0,0,0,0.5875,0.25,0.596637,0.553054,0.297875,0,0,...,1,0,0,0,0,0,0,0,0,0
3304160,0,0,0,0.75,0.125,0.596637,0.553054,0.297874,0,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
num_classes = len(Y.unique())
print("number of classes:", num_classes)

number of classes: 10


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [10]:
num_train_classes = len(y_train.unique())
num_test_classes = len(y_test.unique())
print("number of train classes:", num_train_classes)
print("number of test classes:", num_test_classes)

number of train classes: 10
number of test classes: 10


### Evaluation metrics

In [11]:
from sklearn.metrics import accuracy_score, \
precision_recall_fscore_support, recall_score, f1_score
def print_metrics(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {p:.4f}")
    print(f"Recall: {r:.4f}")
    print(f"F1 Score: {f1:.4f}")

### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [13]:
y_pred_lr = lr_clf.predict(X_test)

In [14]:
print_metrics(y_test, y_pred_lr)

Accuracy: 0.9825
Precision: 0.9653
Recall: 0.9825
F1 Score: 0.9738


### Naive Bayes

In [15]:
from sklearn.naive_bayes import BernoulliNB
nb_clf = BernoulliNB().fit(X_train, y_train)

In [16]:
y_pred_nb = nb_clf.predict(X_test)

In [17]:
print_metrics(y_test, y_pred_nb)

Accuracy: 0.9825
Precision: 0.9653
Recall: 0.9825
F1 Score: 0.9738


### KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

KNeighborsClassifier()

In [19]:
y_pred_knn = knn_clf.predict(X_test)

In [20]:
print_metrics(y_test, y_pred_knn)

Accuracy: 0.9817
Precision: 0.9696
Recall: 0.9817
F1 Score: 0.9742


### Decision tree

In [21]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=99)
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=99)

In [22]:
y_pred_dt = dt_clf.predict(X_test)

In [23]:
print_metrics(y_test, y_pred_dt)

Accuracy: 0.9762
Precision: 0.9717
Recall: 0.9762
F1 Score: 0.9737
