In [24]:
import modin.pandas as pd
import os
os.environ["MODIN_ENGINE"] = "ray"
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.tsa.seasonal import seasonal_decompose
import random
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error as mae
import pandas.util.testing as tm
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from matplotlib import pyplot

In [39]:
cols = ['FEBRE','TOSSE','GARGANTA','DISPNEIA','DESC_RESP','SATURACAO','DIARREIA','VOMITO',
                     'DOR_ABD','FADIGA','PERD_OLFT','PERD_PALA','TOMO_RES', 'VACINA_COV','HOSPITAL',
                     'UTI','SUPORT_VEN','RAIOX_RES','AMOSTRA','CLASSI_FIN','VACINA','NU_IDADE_N']
df_train = pd.read_csv('train.csv', usecols=cols)

In [40]:
from dtype_diet import report_on_dataframe, optimize_dtypes
optimized_df = report_on_dataframe(df_train,unit = "MB")
df_train = optimize_dtypes(df_train,optimized_df)

In [41]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300980 entries, 0 to 1300979
Data columns (total 22 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   NU_IDADE_N  1300980 non-null  int16  
 1   FEBRE       1069610 non-null  float16
 2   TOSSE       1128895 non-null  float16
 3   GARGANTA    904415 non-null   float16
 4   DISPNEIA    1124930 non-null  float16
 5   DESC_RESP   1039725 non-null  float16
 6   SATURACAO   1077358 non-null  float16
 7   DIARREIA    886757 non-null   float16
 8   VOMITO      878227 non-null   float16
 9   VACINA      916185 non-null   float16
 10  HOSPITAL    1274497 non-null  float16
 11  UTI         1138453 non-null  float16
 12  SUPORT_VEN  1136018 non-null  float16
 13  RAIOX_RES   768899 non-null   float16
 14  AMOSTRA     1256816 non-null  float16
 15  CLASSI_FIN  1300980 non-null  int8   
 16  DOR_ABD     861941 non-null   float16
 17  FADIGA      910992 non-null   float16
 18  PERD_OLFT   866856 non

In [42]:
# Drop the values with nan in Tomo_RES and SATURACAO
df = df_train.dropna(subset=['TOMO_RES','SATURACAO']).reset_index(drop=True)

In [43]:
df.groupby("TOMO_RES").count()

Unnamed: 0_level_0,NU_IDADE_N,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,DIARREIA,VOMITO,VACINA,...,UTI,SUPORT_VEN,RAIOX_RES,AMOSTRA,CLASSI_FIN,DOR_ABD,FADIGA,PERD_OLFT,PERD_PALA,VACINA_COV
TOMO_RES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,321009,288700,296190,258319,304082,290404,321009,256628,252878,263100,...,307979,307848,203849,311254,321009,251411,266593,254353,254038,253806
2.0,29385,26165,27152,24183,28023,26862,29385,24247,24177,24466,...,28499,28501,21138,29118,29385,24042,24934,24042,23980,24932
3.0,21790,19628,20269,18424,20887,20086,21790,18446,18390,17601,...,21028,21097,15488,21599,21790,18332,18790,18280,18274,19082
4.0,4862,4402,4500,4137,4571,4401,4862,4094,4109,4023,...,4722,4730,3578,4836,4862,4081,4178,4058,4054,4214
5.0,39417,34945,36402,32253,37283,35535,39417,32484,32372,31814,...,38334,38476,28389,39173,39417,32163,33073,32145,32230,33635
6.0,266668,251773,256759,239416,259776,255753,266668,239735,239224,219532,...,253384,262870,240677,264463,266668,238066,242465,238333,238080,236846
9.0,65076,60268,61629,56870,62943,61591,65076,56966,56677,57500,...,62493,64270,59147,64498,65076,56415,57527,56533,56470,57166


In [15]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300980 entries, 0 to 1300979
Data columns (total 22 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   FEBRE       1069610 non-null  float64
 1   TOSSE       1128895 non-null  float64
 2   GARGANTA    904415 non-null   float64
 3   DISPNEIA    1124930 non-null  float64
 4   DESC_RESP   1039725 non-null  float64
 5   SATURACAO   1077358 non-null  float64
 6   DIARREIA    886757 non-null   float64
 7   VOMITO      878227 non-null   float64
 8   DOR_ABD     861941 non-null   float64
 9   FADIGA      910992 non-null   float64
 10  PERD_OLFT   866856 non-null   float64
 11  PERD_PALA   866336 non-null   float64
 12  TOMO_RES    845370 non-null   float64
 13  VACINA_COV  1051601 non-null  float64
 14  HOSPITAL    1274497 non-null  float64
 15  UTI         1138453 non-null  float64
 16  SUPORT_VEN  1136018 non-null  float64
 17  RAIOX_RES   768899 non-null   float64
 18  AMOSTRA     1256816 no

In [26]:
import scipy.stats as ss

def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

In [27]:
cat_cols = ['FEBRE','TOSSE','GARGANTA','DISPNEIA','DESC_RESP','SATURACAO','DIARREIA','VOMITO',
            'DOR_ABD','FADIGA','PERD_OLFT','PERD_PALA','TOMO_RES', 'VACINA_COV','HOSPITAL',
            'UTI','SUPORT_VEN','RAIOX_RES','AMOSTRA','TP_AMOSTRA','VACINA']
corr_matrix = pd.DataFrame(index=cat_cols, columns=['CLASSI_FIN'])
for col in cat_cols:
    corr_matrix.loc[col, 'CLASSI_FIN'] = cramers_v(df[col], df['CLASSI_FIN'])
    
# View the correlation matrix
print(corr_matrix)

  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))


           CLASSI_FIN
FEBRE        0.067212
TOSSE         0.06042
GARGANTA     0.064383
DISPNEIA     0.048882
DESC_RESP    0.033999
SATURACAO     0.07093
DIARREIA     0.052892
VOMITO       0.031453
DOR_ABD      0.016782
FADIGA        0.08206
PERD_OLFT    0.101555
PERD_PALA    0.103704
TOMO_RES     0.213127
VACINA_COV   0.109211
HOSPITAL          NaN
UTI          0.067592
SUPORT_VEN    0.09006
RAIOX_RES    0.096925
AMOSTRA           0.0
TP_AMOSTRA   0.030514
VACINA       0.046326


In [None]:
for i in range(0,len(cols)):
    print(
        str(cols[i]) + ": " + str(np.corrcoef(cols[i], df["CLASSI_FIN"])[0][1] * 100) + " %"
    )

In [33]:
df = pd.get_dummies(df, columns=cols)

# Compute the correlation matrix
corr = df.corr()

# Plot the correlation matrix as a heatmap
sns.heatmap(corr, cmap='coolwarm', center=0, annot=True, fmt='.2f')

<AxesSubplot:>

In [14]:
df_train.groupby("TOMO_RES").count()

Unnamed: 0_level_0,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,DIARREIA,VOMITO,DOR_ABD,FADIGA,...,PERD_PALA,VACINA_COV,HOSPITAL,UTI,SUPORT_VEN,RAIOX_RES,AMOSTRA,TP_AMOSTRA,CLASSI_FIN,VACINA
TOMO_RES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,312685,325016,264814,328431,303033,321009,261868,256426,253949,274534,...,258091,284344,363662,347152,346030,219246,353052,325722,365356,287597
2.0,28074,29763,24759,30200,28284,29385,24683,24568,24347,25777,...,24188,28530,33736,32555,32497,23000,33513,32198,33853,26817
3.0,20843,21903,18782,22353,20873,21790,18721,18686,18542,19241,...,18419,21631,24754,23784,23795,16650,24594,23574,24856,19059
4.0,4911,5029,4315,4936,4560,4862,4226,4235,4178,4318,...,4098,5096,5901,5702,5680,4025,5886,5626,5924,4683
5.0,37372,39782,32885,39955,36710,39417,33054,32987,32616,33784,...,32599,38664,45164,43655,43852,31176,44995,43074,45323,34492
6.0,266572,274670,244038,272139,263163,266668,242950,242901,240365,246124,...,239792,261983,294393,279885,290396,262811,292974,277135,295660,237115
9.0,64608,67013,58330,67032,63847,65076,57881,57563,57079,58643,...,57054,65234,74115,71152,73312,66299,73656,69177,74398,64323


In [12]:
df_train.groupby('CLASSI_FIN').count()

Unnamed: 0_level_0,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,DIARREIA,VOMITO,DOR_ABD,FADIGA,...,PERD_PALA,TOMO_RES,VACINA_COV,HOSPITAL,UTI,SUPORT_VEN,RAIOX_RES,AMOSTRA,TP_AMOSTRA,VACINA
CLASSI_FIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,12956,13836,10297,12274,11247,11503,9477,9562,9303,9755,...,9206,8522,15498,15004,12512,12570,8384,15029,14600,9603
2,27579,29568,21935,27032,26901,26060,21916,22408,21468,21849,...,21244,17891,29471,31263,29767,29470,24052,31417,30911,16694
3,4556,4821,3800,4746,4454,4451,3730,3762,3693,3820,...,3636,3564,5245,5588,4773,5044,3460,5550,5153,3066
4,309841,329570,265104,326747,304638,307657,260950,262502,256393,264701,...,252981,221608,328030,371651,320672,321864,235831,370883,346570,251061
5,714678,751100,603279,754131,692485,727687,590684,579993,571084,610867,...,579269,593785,673357,850991,770729,767070,497172,833937,785697,635761


In [13]:
df_train[df_train['CLASSI_FIN'] == 5].groupby("TOMO_RES").count()

Unnamed: 0_level_0,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,DIARREIA,VOMITO,DOR_ABD,FADIGA,...,PERD_PALA,VACINA_COV,HOSPITAL,UTI,SUPORT_VEN,RAIOX_RES,AMOSTRA,TP_AMOSTRA,CLASSI_FIN,VACINA
TOMO_RES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,292937,304222,247406,307255,283244,300793,244708,239365,237010,256689,...,241225,265080,340026,324890,323775,204239,329792,303974,341583,268769
2.0,12830,13531,11126,13625,12662,13317,11081,10992,10851,11585,...,10894,12519,15423,14884,14829,10325,15294,14796,15464,12251
3.0,7611,7939,6829,7897,7444,7782,6797,6767,6719,6992,...,6693,7639,8916,8594,8546,5780,8823,8482,8952,7022
4.0,1888,1910,1597,1817,1680,1802,1566,1565,1531,1587,...,1505,2022,2320,2249,2226,1448,2316,2235,2330,1791
5.0,21176,22627,18623,22534,20513,22439,18744,18649,18410,19095,...,18556,21092,25744,24770,24998,17331,25643,24698,25819,19302
6.0,138682,142549,128041,142954,137124,140617,127065,126215,125348,129245,...,125838,132384,153671,146386,151841,136264,152677,145477,154506,131474
9.0,38855,40269,34866,40504,38450,39475,34590,34231,33956,35171,...,34142,38557,44954,43380,44462,39930,44680,42378,45131,40341


In [3]:
df_train[df_train['CLASSI_FIN'] == 5].groupby("RAIOX_RES").count()


Unnamed: 0_level_0,SEM_NOT,SEM_PRI,SG_UF_NOT,ID_REGIONA,CO_REGIONA,ID_MUNICIP,CO_MUN_NOT,CS_SEXO,NU_IDADE_N,TP_IDADE,...,PERD_PALA,TOMO_RES,TOMO_OUT,VACINA_COV,DOSE_1_COV,DOSE_2_COV,DOSE_REF,FNT_IN_COV,DELTA_UTI,ID
RAIOX_RES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,15134,15134,15134,13780,13780,15134,15134,15134,15134,15134,...,11707,10797,378,12793,15134,15134,15134,12794,15134,15134
2.0,96959,96959,96959,91709,91709,96959,96959,96959,96959,96959,...,75106,63627,1008,78080,96959,96959,96959,78091,96959,96959
3.0,11354,11354,11354,10363,10363,11354,11354,11354,11354,11354,...,8618,8646,270,9280,11354,11354,11354,9281,11354,11354
4.0,12343,12343,12343,11818,11818,12343,12343,12343,12343,12343,...,9872,9346,148,10241,12343,12343,12343,10241,12343,12343
5.0,31585,31585,31585,28640,28640,31585,31585,31585,31585,31585,...,24597,25658,2525,26190,31585,31585,31585,26195,31585,31585
6.0,249388,249388,249388,223692,223692,249388,249388,249388,249388,249388,...,205065,227457,9066,206929,249388,249388,249388,206945,249388,249388
9.0,80409,80409,80409,63976,63976,80409,80409,80409,80409,80409,...,63431,69786,1298,68354,80409,80409,80409,68361,80409,80409


In [21]:
from dtype_diet import report_on_dataframe, optimize_dtypes
optimized_df = report_on_dataframe(df_train,unit = "MB")
df_train_optimized = optimize_dtypes(df_train,optimized_df)
print(f'Original df memory:{df_train.memory_usage(deep=True).sum()/1024/1024} MB')
print(f'Original df memory:{df_train_optimized.memory_usage(deep=True).sum()/1024/1024} MB')


Original df memory:218.36529541015625 MB
Original df memory:53.350704193115234 MB


In [18]:
341583/(15464+8952+2333+26000+341583)

0.8662320075469402

In [12]:
df_train["CLASSI_FIN"==5].groupby('SATURACAO').count()

KeyError: False

In [7]:
df_train.groupby('SATURACAO').count()

Unnamed: 0_level_0,SEM_NOT,SEM_PRI,SG_UF_NOT,ID_REGIONA,CO_REGIONA,ID_MUNICIP,CO_MUN_NOT,CS_SEXO,NU_IDADE_N,TP_IDADE,...,PERD_PALA,TOMO_RES,TOMO_OUT,VACINA_COV,DOSE_1_COV,DOSE_2_COV,DOSE_REF,FNT_IN_COV,DELTA_UTI,ID
SATURACAO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,788268,788268,788268,715154,715154,788268,788268,788268,788268,788268,...,568671,545909,24675,639666,788268,788268,788268,639868,788268,788268
2.0,274740,274740,274740,244362,244362,274740,274740,274740,274740,274740,...,271042,192917,7705,234820,274740,274740,274740,234852,274740,274740
9.0,14350,14350,14350,11762,11762,14350,14350,14350,14350,14350,...,14087,9381,362,12242,14350,14350,14350,12244,14350,14350


In [7]:
print(df_train.columns)

Index(['SEM_NOT', 'SEM_PRI', 'SG_UF_NOT', 'ID_REGIONA', 'CO_REGIONA',
       'ID_MUNICIP', 'CO_MUN_NOT', 'CS_SEXO', 'NU_IDADE_N', 'TP_IDADE',
       'COD_IDADE', 'CS_GESTANT', 'CS_RACA', 'CS_ESCOL_N', 'SG_UF', 'CS_ZONA',
       'SURTO_SG', 'NOSOCOMIAL', 'AVE_SUINO', 'FEBRE', 'TOSSE', 'GARGANTA',
       'DISPNEIA', 'DESC_RESP', 'SATURACAO', 'DIARREIA', 'VOMITO', 'OUTRO_SIN',
       'OUTRO_DES', 'PUERPERA', 'FATOR_RISC', 'CARDIOPATI', 'HEMATOLOGI',
       'SIND_DOWN', 'HEPATICA', 'ASMA', 'DIABETES', 'NEUROLOGIC', 'PNEUMOPATI',
       'IMUNODEPRE', 'RENAL', 'OBESIDADE', 'OBES_IMC', 'OUT_MORBI',
       'MORB_DESC', 'VACINA', 'MAE_VAC', 'M_AMAMENTA', 'ANTIVIRAL',
       'TP_ANTIVIR', 'HOSPITAL', 'UTI', 'SUPORT_VEN', 'RAIOX_RES', 'RAIOX_OUT',
       'AMOSTRA', 'TP_AMOSTRA', 'OUT_AMOST', 'CLASSI_FIN', 'HISTO_VGM',
       'PAC_COCBO', 'PAC_DSCBO', 'OUT_ANIM', 'DOR_ABD', 'FADIGA', 'PERD_OLFT',
       'PERD_PALA', 'TOMO_RES', 'TOMO_OUT', 'VACINA_COV', 'DOSE_1_COV',
       'DOSE_2_COV', 'DOSE_REF