# Preparación ambiente

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Importación dataset

In [3]:
data=pd.read_csv("../Data/patient.csv")


In [4]:
print(data.nunique())
print(data.shape)

sex                         2
patient_type                2
intubated                   4
pneumonia                   3
age                       104
pregnant                    4
diabetes                    3
copd                        2
asthma                      2
immunosuppression           2
hypertension                2
other_diseases              3
cardiovascular              2
obesity                     2
chronic_kidney_failure      3
smoker                      2
outcome                     3
icu                         4
death_date                 54
dtype: int64
(95252, 19)


# Preparación Datos

In [5]:
#Dropeo icu, intubated, death_date
data=data.drop(['icu','intubated','death_date'], axis=1)
print(data.columns)
data.shape

Index(['sex', 'patient_type', 'pneumonia', 'age', 'pregnant', 'diabetes',
       'copd', 'asthma', 'immunosuppression', 'hypertension', 'other_diseases',
       'cardiovascular', 'obesity', 'chronic_kidney_failure', 'smoker',
       'outcome'],
      dtype='object')


(95252, 16)

In [6]:
#99 a null y dropeo
cols=data.columns.difference(['age'])

df=(data[cols] == 99)
ix=df[df.any(axis=1)].index
print(ix)
data.drop(df.index[ix], axis=0, inplace=True)
data.shape

Int64Index([161, 174, 592, 609, 644, 13322, 15671, 27316], dtype='int64')


(95244, 16)

In [7]:
#Renombro columnas
data.rename(columns={"patient_type": "inpatient", "outcome": "covid"},inplace=True)

In [8]:
#Para predictivas: 1 -> Y, 2 -> N, 
cols = data.loc[:, data.columns.difference(['age', 'sex','inpatient'])].columns
#print(cols) 
data[cols] = data[cols].replace([1,2],['Y','N'])

#en sex: 1 -> F y 2-> M
data['sex']= data['sex'].replace([1,2],['F','M'])

#en pregnant : 98 -> N, 97 -> NA
data['pregnant'] = data['pregnant'].replace([98,97],['N','NA'])

# outcome: 3 -> NA
data['covid'] = data['covid'].replace([3],['NA'])

##Para target (inpatient): 1 -> 0, 2 -> 1
data['inpatient']= data['inpatient'].replace([1,2],[0,1])
data.head(5)

Unnamed: 0,sex,inpatient,pneumonia,age,pregnant,diabetes,copd,asthma,immunosuppression,hypertension,other_diseases,cardiovascular,obesity,chronic_kidney_failure,smoker,covid
0,M,1,N,42,,N,N,Y,N,N,N,N,N,N,N,Y
1,F,1,N,51,N,N,N,N,N,N,N,N,N,N,N,Y
2,M,2,N,51,,Y,N,N,N,Y,N,N,Y,N,N,Y
3,M,2,N,57,,Y,N,N,N,N,N,N,N,N,N,Y
4,F,2,N,44,N,Y,N,N,N,N,N,N,N,N,N,N


In [9]:
data.groupby(["sex","pregnant"])["pregnant"].count()

sex  pregnant
F    N           45914
     Y             976
M    NA          48354
Name: pregnant, dtype: int64

In [10]:
#creo categorias para edad por décadas (puede ser por etapa biologica niños-jovenes-adultos-ancianos)
data = data.assign(agegroup=pd.cut(data['age'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], right=False, labels=["0-9","10-19","20-29","30-39","40-49","50-59","60-69","70-79","80-89","90-99"]))

In [11]:
data.columns

Index(['sex', 'inpatient', 'pneumonia', 'age', 'pregnant', 'diabetes', 'copd',
       'asthma', 'immunosuppression', 'hypertension', 'other_diseases',
       'cardiovascular', 'obesity', 'chronic_kidney_failure', 'smoker',
       'covid', 'agegroup'],
      dtype='object')

In [14]:
#elimino age para no tener dummies
data.drop(["age"], axis=1, inplace=True)

In [15]:
data.columns

Index(['sex', 'inpatient', 'pneumonia', 'pregnant', 'diabetes', 'copd',
       'asthma', 'immunosuppression', 'hypertension', 'other_diseases',
       'cardiovascular', 'obesity', 'chronic_kidney_failure', 'smoker',
       'covid', 'agegroup'],
      dtype='object')

In [16]:
#creo variables dummies de todas las columnas menos la target
data = pd.get_dummies(data=data, columns= data.columns.difference(["inpatient"]),drop_first = True)

In [17]:
data.columns

Index(['inpatient', 'agegroup_10-19', 'agegroup_20-29', 'agegroup_30-39',
       'agegroup_40-49', 'agegroup_50-59', 'agegroup_60-69', 'agegroup_70-79',
       'agegroup_80-89', 'agegroup_90-99', 'asthma_Y', 'cardiovascular_Y',
       'chronic_kidney_failure_N', 'chronic_kidney_failure_Y', 'copd_Y',
       'covid_NA', 'covid_Y', 'diabetes_N', 'diabetes_Y', 'hypertension_Y',
       'immunosuppression_Y', 'obesity_Y', 'other_diseases_N',
       'other_diseases_Y', 'pneumonia_Y', 'pregnant_NA', 'pregnant_Y', 'sex_M',
       'smoker_Y'],
      dtype='object')

## Creación variables X e Y

Eliminamos las columnas death_date, intubated y icu (por su correlación lineal con intubated)

In [19]:
x=data.drop(["inpatient"],axis=1)
y=data["inpatient"]
y=pd.DataFrame(y)
y

Unnamed: 0,inpatient
0,1
1,1
2,2
3,2
4,2
...,...
95247,2
95248,2
95249,2
95250,1


In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)
print ("X_train:" , x_train.shape[0])
print ("y_train:" , y_train.shape[0])
print ("X_test:" , x_test.shape[0])
print ("y_test:" , y_test.shape[0])

In [None]:
y_train=pd.DataFrame(y_train)
y_train

In [None]:
y_train=y_train.astype('float')
y_test=y_test.astype('float')

## Machine Learning Algorithms

In [None]:
#Naive Bayes
nb=GaussianNB()
nb.fit(x_train, y_train)
score_nb=nb.score(x_test,y_test)

#KNN
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train,y_train)
score_knn=knn.score(x_test,y_test)

#Logistic regression
scaler = StandardScaler()
x_trainstd = scaler.fit_transform(x_train)
x_teststd = scaler.transform(x_test)
lr = LogisticRegression()
lr.fit(x_trainstd, y_train)
score_lr= lr.score(x_teststd, y_test)

#Score
print ("Naive Bayes Acc Score:",score_nb)
print ("KNN Acc Score:",score_knn)
print ("Logistic Regression Score:", score_lr)

In [None]:
y_pred_nb = nb.predict(x_test)
y_pred_knn = knn.predict(x_test)
y_pred_lr = lr.predict(x_teststd)

In [None]:
print("AS NB:\n", accuracy_score(y_test, y_pred_nb))
print("AS KNN:\n", accuracy_score(y_test, y_pred_knn))
print("AS LR:\n", accuracy_score(y_test, y_pred_lr))

## Confusion_matrix

In [None]:
print("CM NB:\n", confusion_matrix(y_test, y_pred_nb))
print("CM KNN:\n", confusion_matrix(y_test, y_pred_knn))
print("CM LR:\n", confusion_matrix(y_test, y_pred_lr))

## Recall,Precision and F-1 Score


* Recall    =  TP / (TP + FN)
* Precision = TP / (TP + FP) 
* F-Measure = (2 * Precision * Recall) / (Precision + Recall)

In [None]:
print("NB:\n", classification_report(y_test,y_pred_nb))
print("KNN:\n", classification_report(y_test,y_pred_knn))
print("LR:\n", classification_report(y_test,y_pred_lr))

In [None]:
print ("F1 Score NB:", f1_score(y_test,y_pred_nb))
print ("F1 Score KNN:", f1_score(y_test,y_pred_knn))
print ("F1 Score LR:", f1_score(y_test,y_pred_lr))

## roc & auc

In [None]:
#NB
probs_nb=nb.predict_proba(x_test)
preds_nb=probs_nb[:,1]
fpr_nb,tpr_nb, threshold_nb =metrics.roc_curve(y_test, y_pred_nb)
roc_auc_nb=metrics.auc(fpr_nb,tpr_nb)

#KNN
probs_knn=knn.predict_proba(x_test)
preds_knn=probs_knn[:,1]
fpr_knn,tpr_knn, threshold_knn =metrics.roc_curve(y_test, y_pred_knn)
roc_auc_knn=metrics.auc(fpr_knn,tpr_knn)

#LR
probs_lr=lr.predict_proba(x_teststd)
preds_lr=probs_lr[:,1]
fpr_lr,tpr_lr, threshold_lr =metrics.roc_curve(y_test, y_pred_lr)
roc_auc_lr=metrics.auc(fpr_lr,tpr_lr)

In [None]:
plt.title("ROC")

plt.plot(fpr_nb, tpr_nb, 'g', label="NB" % roc_auc_nb)
plt.plot(fpr_knn, tpr_knn, 'r', label="KNN" % roc_auc_nb)
plt.plot(fpr_lr, tpr_lr, 'b', label="LR" % roc_auc_nb)

plt.legend(loc="lower right")
plt.plot([0,1],[0,1],'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Pozitive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Dataset con dummies

In [None]:
d_data=data

In [None]:
d_data.head()

In [None]:
d_data = d_data.assign(agegroup=pd.cut(d_data['age'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], right=False, labels=["0-9","10-19","20-29","30-39","40-49","50-59","60-69","70-79","80-89","90-99"]))

In [None]:
d_data.columns

In [None]:
d_data = d_data.drop(["icu","age","intubated"],axis=1)

In [None]:
d_data.columns

In [None]:
xd=d_data.drop(["patient_type"],axis=1)
yd=d_data["patient_type"]
yd=pd.DataFrame(y)
yd

In [None]:
xd

In [None]:
xd = pd.get_dummies(data=xd, columns= xd.columns,drop_first = True)

In [None]:
xd

In [None]:
xd_train,xd_test,yd_train,yd_test=train_test_split(xd,yd,test_size=0.2,random_state=1)
print ("X_train:" , xd_train.shape[0])
print ("y_train:" , yd_train.shape[0])
print ("X_test:" , xd_test.shape[0])
print ("y_test:" , yd_test.shape[0])

In [None]:
yd_train=pd.DataFrame(yd_train)
yd_train

In [None]:
yd_train=yd_train.astype('float')
yd_test=yd_test.astype('float')

## Machine Learning Algorithms

In [None]:
#Navie Bayes
nb=GaussianNB()
nb.fit(xd_train, yd_train)
score_nbd=nb.score(xd_test,yd_test)

#KNN
knn=KNeighborsClassifier(n_neighbors=15)
knn.fit(xd_train,yd_train)
score_knnd=knn.score(xd_test,yd_test)

#Logistic regression
scaler = StandardScaler()
xd_trainstd = scaler.fit_transform(xd_train)
xd_teststd = scaler.transform(xd_test)
lr = LogisticRegression()
lr.fit(xd_trainstd, yd_train)
score_lrd= lr.score(xd_teststd, yd_test)

#Score
print ("Navie Bayes Acc Score:",score_nbd)
print ("KNN Acc Score:",score_knnd)
print("Logistic Regression Score:", score_lrd)

In [None]:
yd_pred_nb = nb.predict(xd_test)
yd_pred_knn = knn.predict(xd_test)
yd_pred_lr = lr.predict(xd_teststd)

## Confusion_matrix

In [None]:
print("CM NB:\n", confusion_matrix(yd_test, yd_pred_nb))
print("CM KNN:\n", confusion_matrix(yd_test, yd_pred_knn))
print("CM LR:\n", confusion_matrix(yd_test, yd_pred_lr))

## Recall,Precision and F-1 Score


* Recall    =  TP / (TP + FN)
* Precision = TP / (TP + FP) 
* F-Measure = (2 * Precision * Recall) / (Precision + Recall)

In [None]:
print("NB:\n", classification_report(yd_test,yd_pred_nb))
print("KNN:\n", classification_report(yd_test,yd_pred_knn))
print("LR:\n", classification_report(yd_test,yd_pred_lr))

In [None]:
print ("F1 Score NB:", f1_score(yd_test,yd_pred_nb))
print ("F1 Score KNN:", f1_score(yd_test,yd_pred_knn))
print ("F1 Score LR:", f1_score(yd_test,yd_pred_lr))

## roc & auc

In [None]:
#NB
probs_nbd=nb.predict_proba(xd_test)
preds_nbd=probs_nbd[:,1]
fpr_nbd,tpr_nbd, threshold_nbd =metrics.roc_curve(yd_test, yd_pred_nb)
roc_auc_nbd=metrics.auc(fpr_nbd,tpr_nbd)

#KNN
probs_knnd=knn.predict_proba(xd_test)
preds_knnd=probs_knnd[:,1]
fpr_knnd,tpr_knnd, threshold_knnd =metrics.roc_curve(yd_test, yd_pred_knn)
roc_auc_knnd=metrics.auc(fpr_knnd,tpr_knnd)

#LR
probs_lrd=lr.predict_proba(xd_teststd)
preds_lrd=probs_lrd[:,1]
fpr_lrd,tpr_lrd, threshold_lrd =metrics.roc_curve(yd_test, yd_pred_lr)
roc_auc_lrd=metrics.auc(fpr_lrd,tpr_lrd)

In [None]:
plt.title("ROC")

plt.plot(fpr_nbd, tpr_nbd, 'g', label="NB" % roc_auc_nbd)
plt.plot(fpr_knnd, tpr_knnd, 'r', label="KNN" % roc_auc_nbd)
plt.plot(fpr_lrd, tpr_lrd, 'b', label="LR" % roc_auc_nbd)

plt.legend(loc="lower right")
plt.plot([0,1],[0,1],'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Pozitive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [None]:
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)
print(param_grid)

In [None]:
folds=StratifiedKFold(n_splits=10, random_state=19, shuffle=True)

In [None]:
grid = GridSearchCV(knn, param_grid, cv=folds, scoring='accuracy')

In [None]:
grid.fit(xd_train, yd_train)

In [None]:
pd.DataFrame(grid.cv_results_)

In [None]:
grid.best_estimator_

In [None]:
grid.best_score_

In [None]:
grid.best_params_

In [None]:
y_pred_grid = grid.predict(xd_test)

In [None]:
confusion = confusion_matrix(y_test, y_pred_grid)

In [None]:
confusion

In [None]:
print (classification_report(y_test, y_pred_grid))