# ***Informacion del dataset***

In [1]:

%%html
<style> table { display: inline-block  }</style>

https://www.kaggle.com/datasets/riteshahlawat/covid19-mexico-patient-health-dataset



|col|0|1|97-98|
|---|---|---|---|
|sex|F|M|?|
|hospitalized|no|si|?|
|intubated|no|si|?|
|pneumonia|no|si|?|
|age|no|si|?|
|pregnant|no|si|?|
|diabetes|no|si|?|
|copd|no|si|?|
|asthma|no|si|?|
|immunosuppression|no|si|?|
|hypertension|no|si|?|
|other_diseases|no|si|?|
|cardiovascular|no|si|?|
|obesity|no|si|?|
|chronic_kidney_failure|no|si|?|
|smoker|no|si|?|
|another_case|no|si|?|
|outcome|no|si|?|
|icu|no|si|?|
|dead|no|si|?|

# **1. Importación de *modules***

In [7]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config Completer.use_jedi = False
import warnings
warnings.filterwarnings("ignore")

import funciones as f

# **2. Importación del dataset *Patient.csv***


In [8]:
data = pd.read_csv("../data/patient.csv", sep = ",", low_memory=False) 
f._get_info(data)

   sex  patient_type  intubated  pneumonia  age  pregnant  diabetes  copd  \
0    2             1         97          2   42        97         2     2   
1    1             1         97          2   51         2         2     2   
2    2             2          2          2   51        97         1     2   

   asthma  immunosuppression  hypertension  other_diseases  cardiovascular  \
0       1                  2             2               2               2   
1       2                  2             2               2               2   
2       2                  2             1               2               2   

   obesity  chronic_kidney_failure  smoker  outcome  icu  death_date  
0        2                       2       2        1   97  9999-99-99  
1        2                       2       2        1   97  9999-99-99  
2        1                       2       2        1    2  9999-99-99  
(95252, 19)


# **3. Tratamiento de los datos**
Limpieza e imputaciones

### 3.1. Dropeo de columnas no informativas

In [9]:
#Dropeo icu, intubated, death_date
data=data.drop(['icu','intubated','death_date'], axis=1)
print(data.columns)
#f._get_info(data)
data.shape

Index(['sex', 'patient_type', 'pneumonia', 'age', 'pregnant', 'diabetes',
       'copd', 'asthma', 'immunosuppression', 'hypertension', 'other_diseases',
       'cardiovascular', 'obesity', 'chronic_kidney_failure', 'smoker',
       'outcome'],
      dtype='object')


(95252, 16)

In [10]:
#99 a null y dropeo
cols=data.columns.difference(['age'])

df=(data[cols] == 99)
ix=df[df.any(axis=1)].index
print(ix)
data.drop(df.index[ix], axis=0, inplace=True)
data.shape


Int64Index([161, 174, 592, 609, 644, 13322, 15671, 27316], dtype='int64')


(95244, 16)

### 3.2. Renombre de columnas ambiguas
Por ejemplo, patient_type a inpatient, outcome a covid, etc

In [11]:
data.rename(columns={"patient_type": "inpatient", "outcome": "covid"},inplace=True)

Matriz de correlación para regresión logística

### 3.3 Conversión de datos

In [12]:
#Para predictivas: 1 -> Y, 2 -> N, 
cols = data.loc[:, data.columns.difference(['age', 'sex','inpatient'])].columns
#print(cols) 
data[cols] = data[cols].replace([1,2],['Y','N'])
#f._get_info(data)

#en sex: 1 -> F y 2-> M
data['sex']= data['sex'].replace([1,2],['F','M'])
#f._get_info(data)

#en pregnant : 98 -> N, 97 -> NA
data['pregnant'] = data['pregnant'].replace([98,97],['N','NA'])
#f._get_info(data,5)

# outcome: 3 -> NA
data['covid'] = data['covid'].replace([3],['NA'])
#f._get_info(data)

##Para target (inpatient): 1 -> 0, 2 -> 1
data['sex']= data['sex'].replace([1,2],[0,1])
#f._get_info(data)
data.head(5)

Unnamed: 0,sex,inpatient,pneumonia,age,pregnant,diabetes,copd,asthma,immunosuppression,hypertension,other_diseases,cardiovascular,obesity,chronic_kidney_failure,smoker,covid
0,M,1,N,42,,N,N,Y,N,N,N,N,N,N,N,Y
1,F,1,N,51,N,N,N,N,N,N,N,N,N,N,N,Y
2,M,2,N,51,,Y,N,N,N,Y,N,N,Y,N,N,Y
3,M,2,N,57,,Y,N,N,N,N,N,N,N,N,N,Y
4,F,2,N,44,N,Y,N,N,N,N,N,N,N,N,N,N


In [13]:
data.groupby(["sex","pregnant"])["pregnant"].count()

pregnant
N     45914
NA    48354
Y       976
Name: sex, dtype: int64

### 3.4. Tabla de counts

### 3.5. Tabla pivot por inpatient

### 3.6. Creación de dummies

# **4. Exportación del dataset resultante a un nuevo *.csv***

In [18]:
# para exportar resultados
data.to_csv(r'../data/patient_covid.csv', index = False, header=True)
data.shape

(95244, 16)