In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw export into a DataFrame.
DATA_PATH = "data-1649721068217.csv.gz"
data = pd.read_csv(DATA_PATH)

In [3]:
# Map the raw source column names onto the names used in the rest of the notebook.
column_renames = {
    "an_hcv": "HIC_antibodies",
    "an_hiv": "HIV",
    "an_hbsag": "HbsAg",
    "ap_nuidade": "Age",
    "ap_coduni": "Hc_u",
    "ap_pripal": "Procedure",
    "ap_motsai": "r_f_d",
    "estado": "State",
    "an_tru": "u_red_r",
    "an_intfis": "v_f_amount",
    "an_diures": "Vlm",
    "ap_cidpri": "label",
}
data = data.rename(columns=column_renames)

In [4]:
# Remove leading/trailing whitespace from every label value.
stripped_label = data["label"].str.strip()
data["label"] = stripped_label

In [5]:
def to_bool(x):
    """Encode a flag value as an integer.

    Returns 0 when ``x`` equals ``'N'`` and 1 for every other value
    (the comparison is exact: no case folding or whitespace stripping).
    """
    return 0 if x == 'N' else 1

In [6]:
def trim(x):
    """Parse a free-text numeric cell into an int, falling back to 0.

    Strings are stripped of surrounding spaces and leading zeros; when the
    text contains a comma, only the part before the first comma is kept
    (e.g. ``"12,5"`` -> ``12``). Anything that still cannot be parsed as an
    integer (empty text, ``"-"``, letters, ...) yields 0.

    Non-string inputs (e.g. NaN or values that are already numeric) are
    converted with ``int()`` when possible and yield 0 otherwise.

    Returns:
        int: the parsed value, or 0 when the cell holds no usable integer.
    """
    if not isinstance(x, str):
        # NaN raises ValueError, inf raises OverflowError, None raises TypeError.
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return 0

    x = x.strip(' ').lstrip('0')
    if ',' in x:
        # Keep the text before the first comma. The previous code used
        # str.strip(',') here and then took the first *character*.
        x = x.split(',')[0]
    try:
        return int(x)
    except ValueError:
        # Covers '', '-', and any other non-integer text.
        return 0

In [7]:
# Encode the three flag columns as 0/1 integers via to_bool.
# NOTE: to_bool returns 1 for anything other than 'N', so running this cell a
# second time on the already-encoded columns would turn every value into 1.
for flag_col in ("HIC_antibodies", "HIV", "HbsAg"):
    data[flag_col] = data[flag_col].apply(to_bool)

In [8]:
# Label values kept for the analysis; rows with any other label are filtered out.
# NOTE(review): these look like ICD-10 codes written without the dot - confirm.
labels = [
    "E102",
    "E142",
    "I10",
    "I120",
    "N039",
    "N083",
    "N088",
    "N180",
    "N188",
    "N189",
]

In [9]:
# Keep only the rows whose label is one of the selected values.
label_mask = data["label"].isin(labels)
data_f = data.loc[label_mask]

In [10]:
# Parse the three free-text measurement columns to integers in one pass.
# trim() yields an int for each string cell; astype(int) pins the column dtype.
# assign() builds a new frame instead of writing columns into `data_f`, which
# is a filtered selection of `data` (column assignment on such a selection
# raises pandas' SettingWithCopyWarning).
measurement_cols = ["Vlm", "v_f_amount", "u_red_r"]
data_f = data_f.assign(
    **{col: data_f[col].apply(trim).astype(int) for col in measurement_cols}
)

In [11]:
# Keep rows where at least one of Vlm, v_f_amount, u_red_r is non-zero,
# then print how many such rows each label has.
nonzero_any = data_f[["Vlm", "v_f_amount", "u_red_r"]].ne(0).any(axis=1)
d_f = data_f[nonzero_any]
for l in labels:
    d_f_l = d_f.loc[d_f["label"].eq(l)]
    print(f"{l}:{len(d_f_l)}")

E102:968
E142:1809
I10:4224
I120:14365
N039:5310
N083:4480
N088:1381
N180:3333766
N188:3824
N189:23294


In [12]:
for l in labels:
    d_f = data_f[data_f["label"] == l]
    d_f = d_f[(d_f["Vlm"] != 0) | (d_f["v_f_amount"] != 0) | (d_f["u_red_r"] != 0)]
    d_f = d_f[(d_f["Vlm"] != 0) | (d_f["u_red_r"] != 0)]
    print(f"{l}:{len(d_f)}")

E102:940
E142:1701
I10:3453
I120:13385
N039:5007
N083:4025
N088:592
N180:3169738
N188:3673
N189:19063


In [13]:
# Rows of data_f where at least one of Vlm, v_f_amount, u_red_r is non-zero.
has_any_measurement = data_f[["Vlm", "v_f_amount", "u_red_r"]].ne(0).any(axis=1)
d_f = data_f[has_any_measurement]

In [14]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the session.
# The assignments below no longer write into a filtered selection, so they do
# not need it - consider removing the filter or narrowing it to a category.
warnings.filterwarnings("ignore")

# Turn the label into a categorical and store its integer codes as the target
# column. assign() returns a new frame, so nothing is written into the
# selection of `data_f` that `d_f` came from; the callable sees the frame with
# `label` already converted.
d_f = d_f.assign(
    label=d_f["label"].astype('category'),
    label_cat=lambda frame: frame["label"].cat.codes,
)
# One-hot encode the three categorical feature columns in a single call; the
# dummy columns are appended in the listed order (Hc_u, Procedure, State).
d_f = pd.get_dummies(d_f, columns=["Hc_u", "Procedure", "State"])
d_f.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,label,label_cat,...,State_PR,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO
1,0,0,0,24,21,72,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,40,21,71,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,65,21,63,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,39,21,59,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,49,21,63,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Draw a 10% subsample of d_f. A fixed seed makes the subsample (and anything
# computed from it) identical on every run; without it each execution picks
# different rows.
SAMPLE_SEED = 42
df_s = d_f.sample(frac=0.1, random_state=SAMPLE_SEED)
print(len(df_s))
print(len(d_f))
# Target: the integer label codes. Features: every other column except the
# label itself.
Y = df_s["label_cat"]
X = df_s.drop(columns=['label', 'label_cat'])

339342
3393421


In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)