# Preparación ambiente

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Importación dataset

In [3]:
data=pd.read_csv("../Data/patient.csv")

In [4]:
print(data.nunique())
print(data.shape)

sex                         2
patient_type                2
intubated                   4
pneumonia                   3
age                       104
pregnant                    4
diabetes                    3
copd                        2
asthma                      2
immunosuppression           2
hypertension                2
other_diseases              3
cardiovascular              2
obesity                     2
chronic_kidney_failure      3
smoker                      2
outcome                     3
icu                         4
death_date                 54
dtype: int64
(95252, 19)


# Preparación Datos

In [5]:
#Dropeo icu, intubated, death_date
data=data.drop(['icu','intubated','death_date'], axis=1)
print(data.columns)
data.shape

Index(['sex', 'patient_type', 'pneumonia', 'age', 'pregnant', 'diabetes',
       'copd', 'asthma', 'immunosuppression', 'hypertension', 'other_diseases',
       'cardiovascular', 'obesity', 'chronic_kidney_failure', 'smoker',
       'outcome'],
      dtype='object')


(95252, 16)

In [6]:
#99 a null y dropeo
cols=data.columns.difference(['age'])

df=(data[cols] == 99)
ix=df[df.any(axis=1)].index
print(ix)
data.drop(df.index[ix], axis=0, inplace=True)
data.shape

Int64Index([161, 174, 592, 609, 644, 13322, 15671, 27316], dtype='int64')


(95244, 16)

In [7]:
#Renombro columnas
data.rename(columns={"patient_type": "inpatient", "outcome": "covid"},inplace=True)

In [8]:
#Para predictivas: 1 -> Y, 2 -> N, 
cols = data.loc[:, data.columns.difference(['age', 'sex','inpatient'])].columns
#print(cols) 
data[cols] = data[cols].replace([1,2],['Y','N'])

#en sex: 1 -> F y 2-> M
data['sex']= data['sex'].replace([1,2],['F','M'])

#en pregnant : 98 -> N, 97 -> NA
data['pregnant'] = data['pregnant'].replace([98,97],['N','NA'])

# outcome: 3 -> NA
data['covid'] = data['covid'].replace([3],['NA'])

##Para target (inpatient): 1 -> 0, 2 -> 1
data['inpatient']= data['inpatient'].replace([1,2],[0,1])
data.head(5)

Unnamed: 0,sex,inpatient,pneumonia,age,pregnant,diabetes,copd,asthma,immunosuppression,hypertension,other_diseases,cardiovascular,obesity,chronic_kidney_failure,smoker,covid
0,M,0,N,42,,N,N,Y,N,N,N,N,N,N,N,Y
1,F,0,N,51,N,N,N,N,N,N,N,N,N,N,N,Y
2,M,1,N,51,,Y,N,N,N,Y,N,N,Y,N,N,Y
3,M,1,N,57,,Y,N,N,N,N,N,N,N,N,N,Y
4,F,1,N,44,N,Y,N,N,N,N,N,N,N,N,N,N


In [9]:
data.groupby(["sex","pregnant"])["pregnant"].count()

sex  pregnant
F    N           45914
     Y             976
M    NA          48354
Name: pregnant, dtype: int64

In [10]:
#creo categorias para edad por décadas (puede ser por etapa biologica niños-jovenes-adultos-ancianos)
data = data.assign(agegroup=pd.cut(data['age'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], right=False, labels=["0-9","10-19","20-29","30-39","40-49","50-59","60-69","70-79","80-89","90-99"]))

In [11]:
data.columns

Index(['sex', 'inpatient', 'pneumonia', 'age', 'pregnant', 'diabetes', 'copd',
       'asthma', 'immunosuppression', 'hypertension', 'other_diseases',
       'cardiovascular', 'obesity', 'chronic_kidney_failure', 'smoker',
       'covid', 'agegroup'],
      dtype='object')

In [12]:
#elimino age para no tener dummies
data.drop(["age"], axis=1, inplace=True)

In [13]:
data.columns

Index(['sex', 'inpatient', 'pneumonia', 'pregnant', 'diabetes', 'copd',
       'asthma', 'immunosuppression', 'hypertension', 'other_diseases',
       'cardiovascular', 'obesity', 'chronic_kidney_failure', 'smoker',
       'covid', 'agegroup'],
      dtype='object')

In [14]:
#creo variables dummies de todas las columnas menos la target
data = pd.get_dummies(data=data, columns= data.columns.difference(["inpatient"]),drop_first = True)

In [15]:
data.columns

Index(['inpatient', 'agegroup_10-19', 'agegroup_20-29', 'agegroup_30-39',
       'agegroup_40-49', 'agegroup_50-59', 'agegroup_60-69', 'agegroup_70-79',
       'agegroup_80-89', 'agegroup_90-99', 'asthma_Y', 'cardiovascular_Y',
       'chronic_kidney_failure_N', 'chronic_kidney_failure_Y', 'copd_Y',
       'covid_NA', 'covid_Y', 'diabetes_N', 'diabetes_Y', 'hypertension_Y',
       'immunosuppression_Y', 'obesity_Y', 'other_diseases_N',
       'other_diseases_Y', 'pneumonia_Y', 'pregnant_NA', 'pregnant_Y', 'sex_M',
       'smoker_Y'],
      dtype='object')

## Creación variables X e Y

Eliminamos las columnas death_date, intubated y icu (por su correlación lineal con intubated)

In [16]:
x=data.drop(["inpatient"],axis=1)
y=data["inpatient"]
y=pd.DataFrame(y)
y

Unnamed: 0,inpatient
0,0
1,0
2,1
3,1
4,1
...,...
95247,1
95248,1
95249,1
95250,0


In [17]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)
print ("X_train:" , x_train.shape[0])
print ("y_train:" , y_train.shape[0])
print ("X_test:" , x_test.shape[0])
print ("y_test:" , y_test.shape[0])

X_train: 76195
y_train: 76195
X_test: 19049
y_test: 19049


In [18]:
y_train=pd.DataFrame(y_train)
y_train

Unnamed: 0,inpatient
17690,0
34279,0
32502,1
41594,1
11450,0
...,...
21447,1
73357,1
50065,0
5197,0


In [19]:
#y_train=y_train.astype('float')
#y_test=y_test.astype('float')

## GridSearch Para KNN

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [24]:
knn=KNeighborsClassifier()
k_range = list(range(1, 21))
param_grid = dict(n_neighbors=k_range)
print(param_grid)

{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]}


In [25]:
folds=StratifiedKFold(n_splits=10, random_state=19, shuffle=True)

In [26]:
grid = GridSearchCV(knn, param_grid, cv=folds, scoring='accuracy')

In [28]:
#esto tarda una hora más o menos en correr
grid.fit(x_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=19, shuffle=True),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20]},
             scoring='accuracy')

In [29]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,6.58051,0.497899,7.622878,0.340655,1,{'n_neighbors': 1},0.834908,0.8,0.809711,0.787008,0.808399,0.771099,0.803518,0.803518,0.766767,0.830686,0.801561,0.021057,20
1,10.640939,6.227204,10.651514,3.805309,2,{'n_neighbors': 2},0.84252,0.821654,0.825984,0.829003,0.808793,0.836855,0.82688,0.83738,0.831211,0.836068,0.829635,0.009173,19
2,19.540323,0.99312,16.706692,0.387092,3,{'n_neighbors': 3},0.855118,0.831627,0.83937,0.851969,0.821391,0.849718,0.859037,0.859168,0.853655,0.842368,0.846342,0.011844,18
3,19.899997,0.787663,16.919122,0.18048,4,{'n_neighbors': 4},0.859449,0.854199,0.850656,0.854331,0.856693,0.858118,0.86048,0.855624,0.85418,0.846437,0.855017,0.003959,17
4,19.25669,2.574614,17.21291,0.719417,5,{'n_neighbors': 5},0.865223,0.858661,0.859055,0.859318,0.857218,0.864155,0.866912,0.85838,0.859824,0.851949,0.86007,0.004131,16
5,20.698802,0.613463,18.223562,0.366608,6,{'n_neighbors': 6},0.866798,0.860892,0.857874,0.860761,0.861155,0.865205,0.868487,0.864418,0.863237,0.863368,0.863219,0.003004,15
6,20.546808,3.599424,18.594004,1.929485,7,{'n_neighbors': 7},0.86916,0.863123,0.857743,0.865486,0.864961,0.867962,0.869274,0.859824,0.866649,0.858905,0.864309,0.004033,14
7,20.040434,3.711347,17.806604,2.427368,8,{'n_neighbors': 8},0.869423,0.863123,0.864304,0.864304,0.867979,0.867305,0.868618,0.862055,0.86783,0.868749,0.866369,0.002516,13
8,25.165586,1.690103,21.833525,2.18257,9,{'n_neighbors': 9},0.868898,0.862598,0.865354,0.865354,0.868504,0.871374,0.870324,0.861268,0.868224,0.864812,0.866671,0.003145,12
9,21.683385,1.54847,19.47497,2.377699,10,{'n_neighbors': 10},0.870079,0.865092,0.866667,0.86811,0.867192,0.867305,0.871637,0.861268,0.867437,0.868224,0.867301,0.002641,11


In [30]:
grid.best_estimator_

KNeighborsClassifier(n_neighbors=19)

In [31]:
grid.best_score_

0.8700702898093902

In [32]:
grid.best_params_

{'n_neighbors': 19}

In [33]:
y_pred_grid = grid.predict(x_test)

In [None]:
confusion = confusion_matrix(y_test, y_pred_grid)

In [None]:
confusion

In [None]:
print (classification_report(y_test, y_pred_grid))

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def busquedaGridsearch(params_):
    folds=StratifiedKFold(n_splits=10, random_state=19, shuffle=True)
    gs = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params_, scoring='accuracy', cv=folds, n_jobs=4)
    fit = gs.fit(X_train, y_train)
    return gs    

In [None]:
def busquedaRandomSearch(params_,iter_):
    folds=StratifiedKFold(n_splits=10, random_state=19, shuffle=True)
    gs = RandomizedSearchCV(estimator=KNeighborsClassifier(), param_distributions=params_, scoring='accuracy', cv=folds, n_jobs=4, n_iter = iter_ )
    fit = gs.fit(X_train, y_train)
    return gs    

In [None]:
param_dist = {
    'n_neighbors': range(1,200),
    'weights' : ['uniform', 'distance'],
    'p' : [1, 2, 3]
}


In [None]:
from sklearn.metrics import accuracy_score

def obtener_performance(estimator):
    y_pred = estimator.predict(X_test)
    return accuracy_score(y_pred,y_test, normalize = True)

In [None]:
obtener_performance(gs_grid_search.best_estimator_)

In [None]:
obtener_performance(gs_random_search.best_estimator_)

## Machine Learning Algorithms

In [None]:
#Naive Bayes
nb=MultinomialNB()
nb.fit(x_train, y_train)
score_nb=nb.score(x_test,y_test)

#KNN (poner en n_neighbors el best_params_ de GridSearch)
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train,y_train)
score_knn=knn.score(x_test,y_test)

#Logistic regression
scaler = StandardScaler()
x_trainstd = scaler.fit_transform(x_train)
x_teststd = scaler.transform(x_test)
lr = LogisticRegression()
lr.fit(x_trainstd, y_train)
score_lr= lr.score(x_teststd, y_test)

#Score
print ("Naive Bayes Acc Score:",score_nb)
print ("KNN Acc Score:",score_knn)
print ("Logistic Regression Score:", score_lr)

In [None]:
y_pred_nb = nb.predict(x_test)
y_pred_knn = knn.predict(x_test)
y_pred_lr = lr.predict(x_teststd)

In [None]:
print("AS NB:\n", accuracy_score(y_test, y_pred_nb))
print("AS KNN:\n", accuracy_score(y_test, y_pred_knn))
print("AS LR:\n", accuracy_score(y_test, y_pred_lr))

In [None]:
#usar cross validation Stratified K-fold porque el target está desbalanceado
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.linear_model import LogisticRegression
iris=load_iris()
X=iris.data
Y=iris.target
logreg=LogisticRegression()
stratifiedkf=StratifiedKFold(n_splits=5)
score=cross_val_score(logreg,X,Y,cv=stratifiedkf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

## Confusion_matrix

In [None]:
print("CM NB:\n", confusion_matrix(y_test, y_pred_nb))
print("CM KNN:\n", confusion_matrix(y_test, y_pred_knn))
print("CM LR:\n", confusion_matrix(y_test, y_pred_lr))

## Recall,Precision and F-1 Score


* Recall    =  TP / (TP + FN)
* Precision = TP / (TP + FP) 
* F-Measure = (2 * Precision * Recall) / (Precision + Recall)

In [None]:
print("NB:\n", classification_report(y_test,y_pred_nb))
print("KNN:\n", classification_report(y_test,y_pred_knn))
print("LR:\n", classification_report(y_test,y_pred_lr))

In [None]:
print ("F1 Score NB:", f1_score(y_test,y_pred_nb))
print ("F1 Score KNN:", f1_score(y_test,y_pred_knn))
print ("F1 Score LR:", f1_score(y_test,y_pred_lr))

## roc & auc

In [None]:
#NB
probs_nb=nb.predict_proba(x_test)
preds_nb=probs_nb[:,1]
fpr_nb,tpr_nb, threshold_nb =metrics.roc_curve(y_test, y_pred_nb)
roc_auc_nb=metrics.auc(fpr_nb,tpr_nb)

#KNN
probs_knn=knn.predict_proba(x_test)
preds_knn=probs_knn[:,1]
fpr_knn,tpr_knn, threshold_knn =metrics.roc_curve(y_test, y_pred_knn)
roc_auc_knn=metrics.auc(fpr_knn,tpr_knn)

#LR
probs_lr=lr.predict_proba(x_teststd)
preds_lr=probs_lr[:,1]
fpr_lr,tpr_lr, threshold_lr =metrics.roc_curve(y_test, y_pred_lr)
roc_auc_lr=metrics.auc(fpr_lr,tpr_lr)

In [None]:
plt.title("ROC")

plt.plot(fpr_nb, tpr_nb, 'g', label="NB" % roc_auc_nb)
plt.plot(fpr_knn, tpr_knn, 'r', label="KNN" % roc_auc_nb)
plt.plot(fpr_lr, tpr_lr, 'b', label="LR" % roc_auc_nb)

plt.legend(loc="lower right")
plt.plot([0,1],[0,1],'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Pozitive Rate')
plt.xlabel('False Positive Rate')
plt.show()