# Preparación ambiente

In [None]:
# Optional Google Colab setup, left disabled: uncomment both lines to mount
# Google Drive at /content/drive.
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Importación dataset

In [None]:
# Load the raw patient records; the relative path is resolved against the
# current working directory.
csv_path = "patient.csv"
old_data = pd.read_csv(csv_path)

In [None]:
# Display the column labels of the raw dataset.
old_data.columns

In [None]:
# Drop icu, intubated and death_date, storing the result in a separate frame
# (old_data itself is not modified by this cell).
unused_columns = ['icu', 'intubated', 'death_date']
data = old_data.drop(columns=unused_columns)
print(data.columns)
data.shape

In [None]:
# Treat the code 99 as "missing" and drop every row that contains it in any
# column other than age (presumably excluded because 99 can be a real age).
cols = data.columns.difference(['age'])

# Boolean frame: True wherever a non-age cell equals 99.
df = (data[cols] == 99)
# Index labels of the rows that have at least one 99.
ix = df[df.any(axis=1)].index
print(ix)
# ix holds index *labels*, so drop by label. Indexing df.index with them would
# treat the labels as positions, which is only equivalent for a default
# 0..n-1 index.
data.drop(index=ix, inplace=True)
data.shape


In [None]:
# Rename the two coded columns to more descriptive names (modifies data in place).
column_aliases = {"patient_type": "inpatient", "outcome": "covid"}
data.rename(columns=column_aliases, inplace=True)

In [None]:
# Sum the sex column within each distinct value of pregnant and display the
# result (exploratory check on how the two columns relate).
data.groupby("pregnant")["sex"].sum()

In [None]:
# Recode the numeric category codes into readable labels.

# Predictor columns (everything except age, sex and inpatient): 1 -> Y, 2 -> N
cols = data.columns.difference(['age', 'sex', 'inpatient'])
data[cols] = data[cols].replace([1, 2], ['Y', 'N'])

# sex: 1 -> M, 2 -> F
data['sex'] = data['sex'].replace([1, 2], ['M', 'F'])

# pregnant: 98 -> N, 97 -> NA
data['pregnant'] = data['pregnant'].replace([98, 97], ['N', 'NA'])

# covid: 3 -> NA
data['covid'] = data['covid'].replace([3], ['NA'])

# Target (inpatient): 1 -> 0, 2 -> 1.
# This recode must be applied to 'inpatient': 'sex' already holds M/F at this
# point, so applying it to 'sex' would change nothing and leave the target
# with its raw 1/2 codes.
data['inpatient'] = data['inpatient'].replace([1, 2], [0, 1])
data.head(5)

In [None]:
# Remove icu, intubated and dead from the frame used for modelling.
# NOTE(review): an earlier cell drops 'death_date' rather than 'dead' — confirm
# which of the two columns the CSV actually contains.
old_data = old_data.drop(columns=["icu", "intubated", "dead"])

In [None]:
# Number of distinct values per column, followed by the (rows, columns) shape.
distinct_counts = old_data.nunique()
print(distinct_counts)
print(old_data.shape)

In [None]:
# Print the per-column dtype / non-null summary of the frame.
old_data.info()

In [None]:
# Bucket age into decade-wide groups: [0, 10) -> "0-9", ..., [90, 100) -> "90-99".
# Bins are closed on the left (right=False); ages outside [0, 100) become NaN.
age_edges = list(range(0, 101, 10))
age_labels = [f"{lower}-{lower + 9}" for lower in age_edges[:-1]]
old_data = old_data.assign(
    agegroup=pd.cut(old_data['age'], bins=age_edges, right=False, labels=age_labels)
)

# Dataset casi completo

## Creación variables X e Y

Eliminamos las columnas death_date, intubated y icu (por su correlación lineal con intubated)

In [None]:
# Features: every column except the target.
# Target: the hospitalized column, kept as a one-column DataFrame.
target_column = "hospitalized"
x = old_data.drop(columns=[target_column])
y = old_data[target_column].to_frame()
y

In [None]:
# One-hot encode every feature column; drop_first removes the first level of
# each so the remaining indicators are not redundant.
feature_names = x.columns.tolist()
x = pd.get_dummies(x, columns=feature_names, drop_first=True)

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

# Report the number of rows in each split.
for split_name, split in (
    ("X_train:", x_train),
    ("y_train:", y_train),
    ("X_test:", x_test),
    ("y_test:", y_test),
):
    print(split_name, split.shape[0])

In [None]:
# Wrap y_train in a DataFrame and display it as the cell output.
y_train=pd.DataFrame(y_train)
y_train

In [None]:
# Cast both target frames to float.
y_train, y_test = (target.astype('float') for target in (y_train, y_test))

## Machine Learning Algorithms

In [None]:
# Fit two baseline classifiers and report their accuracy on the test split.
#
# fit() expects a 1-D target of shape (n_samples,). y_train is a one-column
# DataFrame, so flatten it explicitly instead of relying on scikit-learn's
# implicit conversion (which emits a DataConversionWarning that the global
# warnings filter hides).
y_train_1d = np.ravel(y_train)

# Naive Bayes
nb = MultinomialNB()
nb.fit(x_train, y_train_1d)
score_nb = nb.score(x_test, y_test)

# KNN with k = 3
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train_1d)
score_knn = knn.score(x_test, y_test)

# score() returns the mean accuracy on the given test data.
print("KNN Acc Score:", score_knn)
print("NB Score:", score_nb)

In [None]:
# Hard class predictions of both fitted models on the test features.
y_pred_nb, y_pred_knn = (model.predict(x_test) for model in (nb, knn))

In [None]:
# Accuracy of each model's test-set predictions.
for heading, predictions in (("AS NB:\n", y_pred_nb), ("AS KNN:\n", y_pred_knn)):
    print(heading, accuracy_score(y_test, predictions))

## Confusion matrix

In [None]:
# Confusion matrix (rows: true class, columns: predicted class) per model.
cm_nb = confusion_matrix(y_test, y_pred_nb)
print("CM NB:\n", cm_nb)
cm_knn = confusion_matrix(y_test, y_pred_knn)
print("CM KNN:\n", cm_knn)

## Recall, Precision and F1 Score


* Recall    =  TP / (TP + FN)
* Precision = TP / (TP + FP) 
* F-Measure = (2 * Precision * Recall) / (Precision + Recall)

In [None]:
# Per-class precision / recall / F1 text report for each model.
for heading, predictions in (("NB:\n", y_pred_nb), ("KNN:\n", y_pred_knn)):
    print(heading, classification_report(y_test, predictions))

In [None]:
# F1 score of each model on the test split.
f1_nb = f1_score(y_test, y_pred_nb)
print("F1 Score NB:", f1_nb)
f1_knn = f1_score(y_test, y_pred_knn)
print("F1 Score KNN:", f1_knn)

## ROC & AUC

In [None]:
# ROC curves and AUC for both models.
#
# roc_curve needs continuous scores to sweep over thresholds, so it is given
# the predicted probability from column 1 of predict_proba (the second entry
# of classes_) rather than the hard class labels; with hard labels the curve
# collapses to a single operating point.

# NB
probs_nb = nb.predict_proba(x_test)
preds_nb = probs_nb[:, 1]
fpr_nb, tpr_nb, threshold_nb = metrics.roc_curve(y_test, preds_nb)
roc_auc_nb = metrics.auc(fpr_nb, tpr_nb)

# KNN
probs_knn = knn.predict_proba(x_test)
preds_knn = probs_knn[:, 1]
fpr_knn, tpr_knn, threshold_knn = metrics.roc_curve(y_test, preds_knn)
roc_auc_knn = metrics.auc(fpr_knn, tpr_knn)

In [None]:
# Plot both ROC curves on one set of axes.
plt.title("ROC")

# Each legend entry shows that model's own AUC. The format string needs a
# conversion specifier; "%"-formatting a string without one raises TypeError.
plt.plot(fpr_nb, tpr_nb, 'g', label="NB (AUC = %0.2f)" % roc_auc_nb)
plt.plot(fpr_knn, tpr_knn, 'r', label="KNN (AUC = %0.2f)" % roc_auc_knn)

# The legend is built before the diagonal is drawn, so it lists only the models.
plt.legend(loc="lower right")
# Dashed diagonal: reference line where TPR equals FPR.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## GridSearch

In [None]:
# Candidate neighbour counts for the KNN grid search: 10 through 20 inclusive.
k_range = [*range(10, 21)]
param_grid = {"n_neighbors": k_range}
print(param_grid)

In [None]:
# 10 shuffled, class-stratified folds; random_state makes the shuffle reproducible.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=19)

In [None]:
# Grid search over n_neighbors for the KNN model, scored by accuracy on the
# stratified folds.
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=folds)

In [None]:
# Run the cross-validated search on the training split.
grid.fit(x_train, y_train)

In [None]:
# Show the search's cv_results_ as a table.
pd.DataFrame(grid.cv_results_)

In [None]:
# Estimator chosen by the search.
grid.best_estimator_

In [None]:
# Mean cross-validated score (accuracy, per the scoring argument) of the best candidate.
grid.best_score_

In [None]:
# Parameter setting that gave the best cross-validated score.
grid.best_params_

In [None]:
# Predict on the held-out test features with the best estimator found by the
# search. The features live in x_test; no variable named xd_test is defined in
# this notebook, so using it would raise NameError.
y_pred_grid = grid.predict(x_test)

In [None]:
# Confusion matrix of the tuned model's test-set predictions.
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred_grid)

In [None]:
# Display the confusion matrix computed for the tuned model.
confusion

In [None]:
# Per-class precision / recall / F1 report for the tuned model.
grid_report = classification_report(y_test, y_pred_grid)
print(grid_report)