In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("data-1649721068217.csv.gz")

In [3]:
data.rename(columns={"an_hcv": "HIC_antibodies",
                    "an_hiv": "HIV",
                    "an_hbsag": "HbsAg",
                    "ap_nuidade": "Age", 
                    "ap_coduni": "Hc_u",
                    "ap_pripal": "Procedure",
                    "ap_motsai": "r_f_d",
                    "estado": "State",
                    "an_tru": "u_red_r",
                    "an_intfis": "v_f_amount",
                    "an_diures": "Vlm",
                    "ap_cidpri": "label"}, inplace=True)

In [4]:
data["label"] = data["label"].str.strip()

In [5]:
def to_bool(x):
    if x == 'N':
        x = 0
    else:
        x = 1
    return x

In [6]:
def trim(x):
    x = x.strip(' ')
    x = x.lstrip('0')
    if len(x) == 0:
        x = 0
    else:
        if ',' in x:
            #print(x)
            tokens = x.strip(',')
            if len(tokens) > 0:
                x = tokens[0]
            else:
                x = 0
        if x == '-':
            x = 0
        try:
            x = int(x)
        except:
            #print(x)
            x = 0
    return(x)

In [7]:
data["HIC_antibodies"] = data["HIC_antibodies"].apply(to_bool)
data["HIV"] = data["HIV"].apply(to_bool)
data["HbsAg"] = data["HbsAg"].apply(to_bool)

In [8]:
labels = ["E102", "E142", "I10", "I120", "N039", "N083", "N088", "N180", "N188", "N189"]

In [9]:
data_f = data[data["label"].isin(labels)]

In [10]:
data_f['Vlm'] = data_f['Vlm'].apply(trim)
data_f['v_f_amount'] = data_f['v_f_amount'].apply(trim)
data_f['u_red_r'] = data_f['u_red_r'].apply(trim)

data_f['Vlm'] = data_f['Vlm'].astype(int)
data_f['v_f_amount'] = data_f['v_f_amount'].astype(int)
data_f['u_red_r'] = data_f['u_red_r'].astype(int)

In [11]:
d_f = data_f[(data_f["Vlm"] != 0) | (data_f["v_f_amount"] != 0) | (data_f["u_red_r"] != 0)]

In [12]:
import warnings
warnings.filterwarnings("ignore")

d_f["label"] = d_f["label"].astype('category')
d_f["label_cat"] = d_f["label"].cat.codes
d_f = pd.get_dummies(d_f, columns=["Hc_u"])
d_f = pd.get_dummies(d_f, columns=["Procedure"])
d_f = pd.get_dummies(d_f, columns=["State"])
d_f.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,label,label_cat,...,State_PR,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO
1,0,0,0,24,21,72,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,40,21,71,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,65,21,63,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,39,21,59,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,49,21,63,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df_s = d_f.sample(frac=0.1)
print(len(df_s))
print(len(d_f))
Y = df_s["label_cat"]
X = df_s.drop(['label', 'label_cat'], axis=1)

339342
3393421


In [14]:
X.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,Hc_u_004a13c3db6768e6,Hc_u_014bfb51b0b9c3e3,...,State_PR,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO
705234,0,0,0,27,21,70,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
135141,0,0,0,66,21,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
555226,0,0,0,44,21,63,1,800,0,0,...,0,0,0,0,0,0,0,0,1,0
4351546,0,0,0,60,21,182,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
952851,0,0,0,29,21,67,0,180,0,0,...,0,0,0,0,0,0,0,0,1,0


In [15]:
num_classes = len(Y.unique())
print("number of classes:", num_classes)

number of classes: 10


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [17]:
num_train_classes = len(y_train.unique())
num_test_classes = len(y_test.unique())
print("number of train classes:", num_train_classes)
print("number of test classes:", num_test_classes)

number of train classes: 10
number of test classes: 10


In [18]:
%%time
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(min_samples_split=20, random_state=99)
dt_clf.fit(X_train, y_train)

CPU times: user 1min 54s, sys: 935 ms, total: 1min 55s
Wall time: 1min 55s


DecisionTreeClassifier(min_samples_split=20, random_state=99)

In [19]:
y_pred_dt = dt_clf.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score, \
precision_recall_fscore_support, recall_score, f1_score
def print_metrics(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {p:.4f}")
    print(f"Recall: {r:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [21]:
print_metrics(y_test, y_pred_dt)

Accuracy: 0.9815
Precision: 0.4102
Recall: 0.2738
F1 Score: 0.3213


In [22]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

KNeighborsClassifier()

In [23]:
y_pred_knn = knn_clf.predict(X_test)

In [24]:
print_metrics(y_test, y_pred_knn)

Accuracy: 0.9821
Precision: 0.3053
Recall: 0.1264
F1 Score: 0.1452
