## Tabla de Contenidos

1. <a href='#section_Importacion_de_modules'>Importación de modules</a>
2. <a href='#section_Importacion_de_dataset'>Importación del dataset patient_covid.csv</a>
3. <a href='#section_Modelos'>Modelos</a> 
</br>3.1 <a href='#section_KNN'>KNN</a>
    </br>3.1.1. <a href='#section_preparacion'>Preparar la matriz de _features_ y el vector _target_</a>
</br>3.1.1. <a href='#section_optimizando'>Optimizando el valor de _k_</a>
</br>3.1.2. <a href="#section_confusion">Matriz de confusión</a>
</br>3.1.3. <a href="#section_conclusion">Conclusión</a>


# **1. Importación de *modules***
<a id="section_Importacion_de_modules"></a>

In [25]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score ,f1_score ,classification_report , confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize, StandardScaler


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config Completer.use_jedi = False
import warnings
warnings.filterwarnings("ignore")

import funciones as f

# **2. Importación del dataset patient_covid.csv**
<a id="section_Importacion_de_dataset"></a>

In [26]:
data_covid = pd.read_csv("../data/patient_covid.csv", sep = ",", low_memory=False) 
f._get_info(data_covid)

data_covid_knn = pd.read_csv("../data/patient_covid_knn.csv", sep = ",", low_memory=False) 
f._get_info(data_covid_knn)

   inpatient  agegroup_10-19  agegroup_20-29  agegroup_30-39  agegroup_40-49  \
0          0               0               0               0               1   
1          0               0               0               0               0   
2          1               0               0               0               0   

   agegroup_50-59  agegroup_60-69  agegroup_70-79  agegroup_80-89  \
0               0               0               0               0   
1               1               0               0               0   
2               1               0               0               0   

   agegroup_90-99  ...  diabetes_Y  hypertension_Y  immunosuppression_Y  \
0               0  ...           0               0                    0   
1               0  ...           0               0                    0   
2               0  ...           1               1                    0   

   obesity_Y  other_diseases_N  other_diseases_Y  pneumonia_Y  pregnant_Y  \
0          0            

# **3.Modelos**
<a id="section_Modelos"></a>

## 3.1<a id="section_Modelos"></a>. KNN  (Marce y Dani)
<a id="section_KNN"></a>

#### Preparar la matriz de _features_ y el vector _target_
<a id="section_preparacion"></a>


In [27]:
data_covid_knn['inpatient'].value_counts(normalize=True)

0    0.734629
1    0.265371
Name: inpatient, dtype: float64

In [28]:
cols_features=data_covid_knn.columns.difference(['inpatient','agegroup'])
X = data_covid_knn[cols_features]
y = data_covid_knn['inpatient']
X.head(3) 
X.columns

Unnamed: 0,age,asthma,cardiovascular,chronic_kidney_failure,copd,covid,diabetes,hypertension,immunosuppression,obesity,other_diseases,pneumonia,pregnant,sex,smoker
0,42,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,51,0,0,0,0,1,0,0,0,0,0,0,0,1,0
2,51,0,0,0,0,1,1,1,0,1,0,0,0,0,0


Index(['age', 'asthma', 'cardiovascular', 'chronic_kidney_failure', 'copd',
       'covid', 'diabetes', 'hypertension', 'immunosuppression', 'obesity',
       'other_diseases', 'pneumonia', 'pregnant', 'sex', 'smoker'],
      dtype='object')

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,stratify=data_covid['inpatient'])
display(y_train.value_counts(normalize=True).round(2))
display(y_test.value_counts(normalize=True).round(2))

0    0.73
1    0.27
Name: inpatient, dtype: float64

0    0.73
1    0.27
Name: inpatient, dtype: float64

In [30]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

#### ***Modelo Base***

In [31]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred).round(2)

KNeighborsClassifier()

0.57

#### ***Optimizando el valor de k***

In [None]:
# Definimos la estrategia de validación cruzada
kf = KFold(n_splits=5, shuffle=True, random_state=12)
scores_para_df_standard = []

for i in range(10, 31):
    model = KNeighborsClassifier(n_neighbors=i)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
    dict_row_score = {'score_medio':np.mean(cv_scores),
                      'score_std':np.std(cv_scores), 'n_neighbors':i}
    scores_para_df_standard.append(dict_row_score)

In [None]:
# Creamos el DataFrame a partir de la lista de diccionarios
df_scores_standard = pd.DataFrame(scores_para_df_standard)
df_scores_standard.head()

Graficando la búsqueda del mejor hiperparámetro

In [None]:
# Generamos los límites inferior y superior
df_scores_standard['limite_superior'] = df_scores_standard['score_medio'] + df_scores_standard['score_std']
df_scores_standard['limite_inferior'] = df_scores_standard['score_medio'] - df_scores_standard['score_std']
df_scores_standard.head()

In [None]:
# Graficamos los resultados
plt.plot(df_scores_standard['n_neighbors'], df_scores_standard['limite_inferior'], color='r')
plt.plot(df_scores_standard['n_neighbors'], df_scores_standard['score_medio'], color='b')
plt.plot(df_scores_standard['n_neighbors'], df_scores_standard['limite_superior'], color='r');

In [None]:
# Identificamos el score máximo
df_scores_standard.loc[df_scores_standard.score_medio == df_scores_standard.score_medio.max()]

In [None]:
# Asignamos el valor del k óptimo a una variable
best_k = df_scores_standard.loc[df_scores_standard.score_medio == df_scores_standard.score_medio.max(), 'n_neighbors'].values[0]
print("best k",best_k)

# Elegimos el modelo óptimo de acuerdo a las pruebas de cross validation
model = KNeighborsClassifier(n_neighbors=best_k)

# Lo ajustamos sobre los datos de entrenamiento
model.fit(X_train, y_train)

# Evaluamos qué accuracy obtenemos en train
print("accuracy_score en train",accuracy_score(y_train, model.predict(X_train)).round(2))

In [None]:
# Lo utilizamos para predecir en test
X_test = scaler.transform(X_test) # ¡Importantísimo estandarizar también los datos de test con las medias y desvíos aprendidos en train!
y_pred = model.predict(X_test)

# Evaluamos el accuracy del modelo en test
print("accuracy_score en test",accuracy_score(y_test, y_pred).round(2))



<a id="section_confusion"></a>
## Matriz de confusión

In [None]:
# Obtenemos la matriz de confusión
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

#### Graficar la matriz de confusión

In [None]:
# Graficamos la matriz de confusión para visualizarla mejor
sns.heatmap(cm, annot=True)
plt.ylabel('Etiquetas reales')
plt.xlabel('Etiquetas predichas');

In [None]:
# Podemos calcular manualmente la exactitud de nuestro modelo...
((cm[0,0] + cm[1,1]) / len(y_pred)).round(2)

In [None]:
# ... o computarla utilizando el método del accuracy score
accuracy_score(y_test, y_pred).round(2)

<a id="section_conclusion"></a>
## Conclusión

## 4.1. Regresión logística  (Mae)

### 4.2.1. Búsqueda y selección del hiperparámetro

### 4.1. Naive Bayes multinomial (Enzo)


In [None]:
## 4.1.1. Categorización de edad (o no :/)