In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Part 5: Modeling

## Sub-sampling and set splitting

In [2]:
df_adm_dis_sum=pd.read_csv('df_adm_dis_sum.csv')

In [3]:
# Split dataset into training set and test set
from sklearn.model_selection import train_test_split
df_train, df_test=train_test_split(df_adm_dis_sum,test_size=0.2, random_state=42)

In [4]:
# sub-sampling the negatives (non-readmitted) on the training set
df_train_readm=df_train[df_train.READMISSION_STATUS=='Readmitted']
df_train_non_readm=df_train[df_train.READMISSION_STATUS=='Non-readmitted']
df_train_sub = pd.concat([df_train_readm, df_train_non_readm.sample(n = len(df_train_readm), random_state = 42)],axis = 0)

In [5]:
X_train=df_train_sub['TEXT_AGG']
y_train=df_train_sub['READMISSION_STATUS']
X_test=df_test['TEXT_AGG']
y_test=df_test['READMISSION_STATUS']

## BoW with ML models

### For this section, I'm testing on the combination of simple BoW with 5 machine learning models, including logistic regression, decision tree, linear svc, KNN and naive bayes. I'm using a combination of cross validation and BayesSearchCV on the training set to find the best hyper-parameters to maximize the ROC-AUC score. 

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [7]:
model_BoW=[]
Roc_auc_cv_BoW=[]
Roc_auc_test_BoW=[]

In [8]:
# BoW vs Logistic regression
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

param= dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring='roc_auc'

logistic_clf_BoW = Pipeline([('BoW', CountVectorizer()),
                     ('clf', BayesSearchCV(estimator=LogisticRegression(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv))])

In [9]:
logistic_clf_BoW.fit(X_train,y_train)



Pipeline(steps=[('BoW', CountVectorizer()),
                ('clf',
                 BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                               estimator=LogisticRegression(), n_jobs=-1,
                               scoring='roc_auc',
                               search_spaces={'C': [100, 10, 1.0, 0.1, 0.01],
                                              'penalty': ['l2'],
                                              'solver': ['newton-cg', 'lbfgs',
                                                         'liblinear']}))])

In [10]:
logistic_clf_BoW_best=logistic_clf_BoW.named_steps['clf'].best_score_

In [11]:
logistic_clf_BoW_best

0.6938577840122846

In [13]:
y_prob_logistic_clf_BoW=logistic_clf_BoW.predict_proba(X_test)
logistic_clf_BoW_roc_auc=roc_auc_score(y_test, y_prob_logistic_clf_BoW[:,1])

In [19]:
logistic_clf_BoW_roc_auc

0.696960891804266

In [46]:
model_BoW.append('Logistic Regression')
Roc_auc_cv_BoW.append(logistic_clf_BoW_best)
Roc_auc_test_BoW.append(logistic_clf_BoW_roc_auc)

In [38]:
# BoW vs Naive Bayes
nb_clf_BoW = Pipeline([('BoW', CountVectorizer()),
                     ('clf', MultinomialNB()),])

In [40]:
from sklearn.model_selection import cross_val_score
nb_clf_BoW_best=(cross_val_score(nb_clf_BoW, X_train, y_train, scoring="roc_auc", cv = 10)).mean()

In [41]:
nb_clf_BoW_best

0.6776770592876539

In [None]:
model_BoW.append('Naive Bayes')
Roc_auc_BoW.append(nb_clf_BoW_best)

In [44]:
nb_clf_BoW.fit(X_train, y_train)
y_prob_nb_clf_BoW=nb_clf_BoW.predict_proba(X_test)
nb_clf_BoW_roc_auc=roc_auc_score(y_test, y_prob_nb_clf_BoW[:,1])

In [45]:
nb_clf_BoW_roc_auc

0.6950238534426914

In [47]:
model_BoW.append('Naive Bayes')
Roc_auc_cv_BoW.append(nb_clf_BoW_best)
Roc_auc_test_BoW.append(nb_clf_BoW_roc_auc)

In [25]:
# BoW vs Decision Tree
max_depth=[2, 3, 5, 10, 20]
min_samples_leaf=[5, 10, 20, 50, 100]
criterion=["gini", "entropy"]

param= dict(max_depth=max_depth,min_samples_leaf=min_samples_leaf,criterion=criterion)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring='roc_auc'

dec_tree_clf_BoW = Pipeline([('BoW', CountVectorizer()),
                     ('clf', BayesSearchCV(estimator=DecisionTreeClassifier(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv))])

In [26]:
dec_tree_clf_BoW.fit(X_train,y_train)



Pipeline(steps=[('BoW', CountVectorizer()),
                ('clf',
                 BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                               estimator=DecisionTreeClassifier(), n_jobs=-1,
                               scoring='roc_auc',
                               search_spaces={'criterion': ['gini', 'entropy'],
                                              'max_depth': [2, 3, 5, 10, 20],
                                              'min_samples_leaf': [5, 10, 20,
                                                                   50,
                                                                   100]}))])

In [27]:
dec_tree_clf_BoW_best=dec_tree_clf_BoW.named_steps['clf'].best_score_

In [28]:
dec_tree_clf_BoW_best

0.6490281103659524

In [30]:
y_prob_dec_tree_clf_BoW=dec_tree_clf_BoW.predict_proba(X_test)
dec_tree_clf_BoW_roc_auc=roc_auc_score(y_test, y_prob_dec_tree_clf_BoW[:,1])

In [31]:
dec_tree_clf_BoW_roc_auc

0.657790346943481

In [48]:
model_BoW.append('Decision Tree')
Roc_auc_cv_BoW.append(dec_tree_clf_BoW_best)
Roc_auc_test_BoW.append(dec_tree_clf_BoW_roc_auc)

In [15]:
# BoW vs Linear SVM
c_values=[100, 10, 1.0, 0.1, 0.01]

param= dict(C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring='roc_auc'

lsvc_clf_BoW = Pipeline([('BoW', CountVectorizer()),
                     ('clf', BayesSearchCV(estimator=LinearSVC(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv))])

In [16]:
lsvc_clf_BoW.fit(X_train,y_train)



Pipeline(steps=[('BoW', CountVectorizer()),
                ('clf',
                 BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                               estimator=LinearSVC(), n_jobs=-1,
                               scoring='roc_auc',
                               search_spaces={'C': [100, 10, 1.0, 0.1,
                                                    0.01]}))])

In [17]:
lsvc_clf_BoW_best=lsvc_clf_BoW.named_steps['clf'].best_score_

In [18]:
lsvc_clf_BoW_best

0.6585008473739113

In [22]:
y_des_lsvc_clf_BoW=lsvc_clf_BoW.decision_function(X_test)
lsvc_clf_BoW_roc_auc=roc_auc_score(y_test, y_des_lsvc_clf_BoW)

In [24]:
lsvc_clf_BoW_roc_auc

0.6624324158992626

In [49]:
model_BoW.append('Linear SVM')
Roc_auc_cv_BoW.append(lsvc_clf_BoW_best)
Roc_auc_test_BoW.append(lsvc_clf_BoW_roc_auc)

In [32]:
# BoW vs KNN
k_range = list(range(1, 31))
param= dict(n_neighbors=k_range)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring='roc_auc'

knn_clf_BoW = Pipeline([('BoW', CountVectorizer()),
                     ('clf', BayesSearchCV(estimator=KNeighborsClassifier(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv))])

In [33]:
knn_clf_BoW.fit(X_train,y_train)



Pipeline(steps=[('BoW', CountVectorizer()),
                ('clf',
                 BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                               estimator=KNeighborsClassifier(), n_jobs=-1,
                               scoring='roc_auc',
                               search_spaces={'n_neighbors': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9, 10, 11,
                                                              12, 13, 14, 15,
                                                              16, 17, 18, 19,
                                                              20, 21, 22, 23,
                                                              24, 25, 26, 27,
                                                              28, 29, 30]}))])

In [34]:
knn_clf_BoW_best=knn_clf_BoW.named_steps['clf'].best_score_

In [35]:
knn_clf_BoW_best

0.6697715586424492

In [36]:
y_knn_clf_BoW=knn_clf_BoW.predict_proba(X_test)
knn_clf_BoW_roc_auc=roc_auc_score(y_test, y_knn_clf_BoW[:,1])

In [37]:
knn_clf_BoW_roc_auc

0.6651928352618428

In [50]:
model_BoW.append('KNN')
Roc_auc_cv_BoW.append(knn_clf_BoW_best)
Roc_auc_test_BoW.append(knn_clf_BoW_roc_auc)

In [51]:
result_BoW=pd.DataFrame({'model_BoW': model_BoW, 'Roc_auc_cross_val': Roc_auc_cv_BoW,'Roc_auc_test':Roc_auc_test_BoW})
result_BoW=result_BoW.sort_values('Roc_auc_test')
result_BoW.reset_index(drop = True)

Unnamed: 0,model_BoW,Roc_auc_cross_val,Roc_auc_test
0,Decision Tree,0.649028,0.65779
1,Linear SVM,0.658501,0.662432
2,KNN,0.669772,0.665193
3,Naive Bayes,0.677677,0.695024
4,Logistic Regression,0.693858,0.696961
