In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [4]:
df_adm_dis_sum=pd.read_csv('df_adm_dis_sum.csv')

In [5]:
# Split dataset into training set and test set
from sklearn.model_selection import train_test_split
df_train, df_test=train_test_split(df_adm_dis_sum,test_size=0.2, random_state=42)

In [6]:
# sub-sampling the negatives (non-readmitted) on the training set
df_train_readm=df_train[df_train.READMISSION_STATUS=='Readmitted']
df_train_non_readm=df_train[df_train.READMISSION_STATUS=='Non-readmitted']
df_train_sub = pd.concat([df_train_readm, df_train_non_readm.sample(n = len(df_train_readm), random_state = 42)],axis = 0)

In [7]:
X_train=df_train_sub['TEXT_AGG']
y_train=df_train_sub['READMISSION_STATUS']
X_test=df_test['TEXT_AGG']
y_test=df_test['READMISSION_STATUS']

# Part 5: Modeling-Continue

## BoW-TF-IDF with ML models

### For this section, I'm testing on the combination of TF-IDF with 5 machine learning models, including logistic regression, decision tree, linear svc, KNN and naive bayes. I'm using a combination of cross validation and BayesSearchCV on the training set to find the best hyper-parameters to maximize the ROC-AUC score. 

In [45]:
model_TF_IDF=[]
Roc_auc_cv=[]
Roc_auc_test=[]

In [8]:
# TF-IDF vs Logistic regression
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

param= dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring='roc_auc'

logistic_clf_tfidf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BayesSearchCV(estimator=LogisticRegression(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv))])

In [9]:
logistic_clf_tfidf.fit(X_train,y_train)



Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                               estimator=LogisticRegression(), n_jobs=-1,
                               scoring='roc_auc',
                               search_spaces={'C': [100, 10, 1.0, 0.1, 0.01],
                                              'penalty': ['l2'],
                                              'solver': ['newton-cg', 'lbfgs',
                                                         'liblinear']}))])

In [15]:
logistic_clf_tfidf_best=logistic_clf_tfidf.named_steps['clf'].best_score_

In [16]:
logistic_clf_tfidf_best

0.7355901096508857

In [11]:
y_prob_logistic_clf_tfidf = logistic_clf_tfidf.predict_proba(X_test)
roc_auc_logistic_clf_tfidf=roc_auc_score(y_test, y_prob_logistic_clf_tfidf[:,1])

In [12]:
roc_auc_logistic_clf_tfidf

0.7471870348323789

In [46]:
model_TF_IDF.append('Logistic Regression')
Roc_auc_cv.append(logistic_clf_tfidf_best)
Roc_auc_test.append(roc_auc_logistic_clf_tfidf)

In [13]:
# TF-IDF vs Naive Bayes
nb_clf_tfidf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),])

In [17]:
from sklearn.model_selection import cross_val_score
nb_clf_tfidf_best=(cross_val_score(nb_clf_tfidf, X_train, y_train, scoring="roc_auc", cv = 10)).mean()

In [18]:
nb_clf_tfidf_best

0.6968579085567572

In [22]:
nb_clf_tfidf.fit(X_train, y_train)
y_prob_nb_clf_tfidf=nb_clf_tfidf.predict_proba(X_test)
roc_auc_nb_clf_tfidf=roc_auc_score(y_test, y_prob_nb_clf_tfidf[:,1])

In [23]:
roc_auc_nb_clf_tfidf

0.7173202345268289

In [47]:
model_TF_IDF.append('Naive Bayes')
Roc_auc_cv.append(nb_clf_tfidf_best)
Roc_auc_test.append(roc_auc_nb_clf_tfidf)

In [24]:
# TF-IDF vs Decision Tree
from sklearn.tree import DecisionTreeClassifier

max_depth=[2, 3, 5, 10, 20]
min_samples_leaf=[5, 10, 20, 50, 100]
criterion=["gini", "entropy"]

param= dict(max_depth=max_depth,min_samples_leaf=min_samples_leaf,criterion=criterion)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring='roc_auc'

dec_tree_clf_tfidf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BayesSearchCV(estimator=DecisionTreeClassifier(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv))])

In [25]:
dec_tree_clf_tfidf.fit(X_train,y_train)



Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                               estimator=DecisionTreeClassifier(), n_jobs=-1,
                               scoring='roc_auc',
                               search_spaces={'criterion': ['gini', 'entropy'],
                                              'max_depth': [2, 3, 5, 10, 20],
                                              'min_samples_leaf': [5, 10, 20,
                                                                   50,
                                                                   100]}))])

In [26]:
dec_tree_clf_tfidf_best=dec_tree_clf_tfidf.named_steps['clf'].best_score_

In [27]:
dec_tree_clf_tfidf_best

0.6486519728338949

In [28]:
y_prob_dec_tree_clf_tfidf = dec_tree_clf_tfidf.predict_proba(X_test)
roc_auc_dec_tree_clf_tfidf=roc_auc_score(y_test, y_prob_dec_tree_clf_tfidf[:,1])

In [30]:
roc_auc_dec_tree_clf_tfidf

0.6519165886322249

In [48]:
model_TF_IDF.append('Decision Tree')
Roc_auc_cv.append(dec_tree_clf_tfidf_best)
Roc_auc_test.append(roc_auc_dec_tree_clf_tfidf)

In [31]:
# TF-IDF vs Linear SVM
c_values=[100, 10, 1.0, 0.1, 0.01]

param= dict(C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring='roc_auc'

lsvc_clf_tfidf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BayesSearchCV(estimator=LinearSVC(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv))])

In [32]:
lsvc_clf_tfidf.fit(X_train,y_train)



Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                               estimator=LinearSVC(), n_jobs=-1,
                               scoring='roc_auc',
                               search_spaces={'C': [100, 10, 1.0, 0.1,
                                                    0.01]}))])

In [33]:
lsvc_clf_tfidf_best=lsvc_clf_tfidf.named_steps['clf'].best_score_

In [34]:
lsvc_clf_tfidf_best

0.7349436916384661

In [35]:
y_dec_func_lsvc_clf_tfidf=lsvc_clf_tfidf.decision_function(X_test)
roc_auc_lsvc_clf_tfidf=roc_auc_score(y_test, y_dec_func_lsvc_clf_tfidf)

In [37]:
roc_auc_lsvc_clf_tfidf

0.7471360575588317

In [49]:
model_TF_IDF.append('Linear SVC')
Roc_auc_cv.append(lsvc_clf_tfidf_best)
Roc_auc_test.append(roc_auc_lsvc_clf_tfidf)

In [38]:
# TF-IDF vs KNN
k_range = list(range(1, 31))
param= dict(n_neighbors=k_range)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring='roc_auc'

knn_clf_tfidf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BayesSearchCV(estimator=KNeighborsClassifier(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv))])

In [39]:
knn_clf_tfidf.fit(X_train, y_train)



Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                               estimator=KNeighborsClassifier(), n_jobs=-1,
                               scoring='roc_auc',
                               search_spaces={'n_neighbors': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9, 10, 11,
                                                              12, 13, 14, 15,
                                                              16, 17, 18, 19,
                                                              20, 21, 22, 23,
                                                              24, 25, 26, 27,
                                                              28, 29, 30]}))])

In [40]:
knn_clf_tfidf_best=knn_clf_tfidf.named_steps['clf'].best_score_

In [41]:
knn_clf_tfidf_best

0.6726982933727277

In [42]:
y_prob_knn_clf_tfidf = knn_clf_tfidf.predict_proba(X_test)
roc_auc_knn_clf_tfidf=roc_auc_score(y_test, y_prob_knn_clf_tfidf[:,1])

In [43]:
roc_auc_knn_clf_tfidf

0.6823505111059872

In [50]:
model_TF_IDF.append('KNN')
Roc_auc_cv.append(knn_clf_tfidf_best)
Roc_auc_test.append(roc_auc_knn_clf_tfidf)

In [52]:
result_TF_IDF=pd.DataFrame({'model_TF_IDF': model_TF_IDF, 'Roc_auc_cross_val': Roc_auc_cv,'Roc_auc_test':Roc_auc_test})
result_TF_IDF=result_TF_IDF.sort_values('Roc_auc_test')
result_TF_IDF.reset_index(drop = True)

Unnamed: 0,model_TF_IDF,Roc_auc_cross_val,Roc_auc_test
0,Decision Tree,0.648652,0.651917
1,KNN,0.672698,0.682351
2,Naive Bayes,0.696858,0.71732
3,Linear SVC,0.734944,0.747136
4,Logistic Regression,0.73559,0.747187
