In [None]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import matplotlib
from sklearn.metrics import confusion_matrix

plt.style.use('ggplot')

## Import training and testing dataset

In [173]:
train = pd.read_csv('./dataset_diabetes/new_train.csv', index_col='encounter_id')
test = pd.read_csv('./dataset_diabetes/new_test.csv', index_col='encounter_id')

In [175]:
train.drop('Unnamed: 0', axis=1, inplace=True)
test.drop('Unnamed: 0', axis=1, inplace=True)

In [176]:
pd.options.display.max_columns = 55

In [177]:
train.readmitted_Yes.shape

(80084,)

## Split into X and y train and test

In [178]:
Xtrain = train.drop('readmitted_Yes', axis=1)
ytrain = train['readmitted_Yes']

In [179]:
Xtest = test.drop('readmitted_Yes', axis=1)
ytest = test['readmitted_Yes']

In [180]:
X = Xtrain.append(Xtest)
y = ytrain.append(ytest)

## Model

In [202]:
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection as ms

train_error =[]
test_error = []
s = []
logit = LogisticRegression()
for i in range(5):
    x_train, x_test, y_train, y_test = ms.train_test_split(Xtrain, ytrain, test_size=0.2, random_state=i)
    logit.fit(x_train, y_train)
    train_error.append(1 - logit.score(x_train, y_train))
    test_error.append(1 - logit.score(x_test, y_test))
    s.append(logit.score(x_train, y_train))

In [244]:
logit = LogisticRegression()
ms_k5 = ms.KFold(n_splits=5, shuffle=True)
trainscores = []
testscores = []
act_testscores = []
y_predicts = []
y_probs = []
for train_idx, test_idx in ms_k5.split(Xtrain, ytrain):
    x_train = Xtrain.iloc[train_idx]
    y_train = ytrain.iloc[train_idx]
    x_test = Xtrain.iloc[test_idx]
    y_test = ytrain.iloc[test_idx]
    logit.fit(x_train, y_train)
    trainscores.append(logit.score(x_train, y_train))
    testscores.append(logit.score(x_test, y_test))
    act_testscores.append(logit.score(Xtest, ytest))
    
    # predict for test data set
    y_predicts.append(logit.predict(Xtest))
    y_probs.append(logit.predict_proba(Xtest))

In [231]:
trainscores

[0.8868372172881515,
 0.8863689574976197,
 0.886259696879829,
 0.8867903913090983,
 0.8863551226821502]

In [232]:
testscores

[0.8848723231566461,
 0.8871199350689892,
 0.8881188736966973,
 0.8854966597989636,
 0.8866758241758241]

In [233]:
act_testscores

[0.8863704443334998,
 0.8864203694458312,
 0.8863205192211683,
 0.8863704443334998,
 0.8862206689965052]

In [None]:
for y_predict in y_predicts:
    print(confusion_matrix(ytest, y_predict))

In [245]:
for y_prob in y_probs:
    print(roc_auc_score(ytest, y_prob[:,1]))

0.6645438415396275
0.6662952373563468
0.6648991292958074
0.6655352453810052
0.6650979605133946


## Logistic Regression Model with Gradient Descent

In [64]:
from sklearn.linear_model import SGDClassifier

In [72]:
sgd = SGDClassifier(loss='log', max_iter=1e4, l1_ratio=0, alpha=1e-4, shuffle=True)

In [246]:
ms_k5 = ms.KFold(n_splits=5, shuffle=True)
sdg_tr_scores = []
sdg_te_scores = []
sdg_act_te = []
sdg_y_predicts = []
sdg_y_probs = []
for train_idx, test_idx in ms_k5.split(Xtrain, ytrain):
    x_train = Xtrain.iloc[train_idx]
    y_train = ytrain.iloc[train_idx]
    x_test = Xtrain.iloc[test_idx]
    y_test = ytrain.iloc[test_idx]
    sgd.fit(x_train, y_train)
    sdg_tr_scores.append(sgd.score(x_train, y_train))
    sdg_te_scores.append(sgd.score(x_test, y_test))
    sdg_act_te.append(sgd.score(Xtest, ytest))
    
    # predict for test data set
    sdg_y_predicts.append(sgd.predict(Xtest))
    sdg_y_probs.append(sgd.predict_proba(Xtest))

In [247]:
sdg_tr_scores

[0.8860099583248786,
 0.8854168292568717,
 0.8864782181154104,
 0.8863845661573041,
 0.8870887182368733]

In [248]:
sdg_te_scores

[0.8873072360616845,
 0.8893675469813324,
 0.887182368733221,
 0.8841855528500968,
 0.8834915084915085]

In [249]:
sdg_act_te

[0.8861208187718422,
 0.8860209685471793,
 0.8862705941088368,
 0.8856714927608587,
 0.8863205192211683]

In [250]:
for y_probs in sdg_y_probs:
    print(roc_auc_score(ytest, y_prob[:,1]))

0.6650979605133946
0.6650979605133946
0.6650979605133946
0.6650979605133946
0.6650979605133946


## Logistic Regression with Selected Features from Random Forest

In [219]:
import pickle

In [220]:
sel_features = pickle.load(open('./pickled/feaure_list_dt.list', 'rb'))

In [223]:
newXtrain = Xtrain[sel_features]
newXtest = Xtest[sel_features]

In [241]:
new_logit = LogisticRegression()
ms_k5 = ms.KFold(n_splits=5, shuffle=True)
new_trainscores = []
new_testscores = []
new_acttestscores = []
new_y_predicts = []
new_y_predicts_probs = []
for train_idx, test_idx in ms_k5.split(newXtrain, ytrain):
    x_train = newXtrain.iloc[train_idx]
    y_train = ytrain.iloc[train_idx]
    x_test = newXtrain.iloc[test_idx]
    y_test = ytrain.iloc[test_idx]
    new_logit.fit(x_train, y_train)
    new_trainscores.append(new_logit.score(x_train, y_train))
    new_testscores.append(new_logit.score(x_test, y_test))
    new_acttestscores.append(new_logit.score(newXtest, ytest))
    
    # predict for test data set
    new_y_predicts.append(new_logit.predict(newXtest))
    new_y_predicts_probs.append(new_logit.predict_proba(newXtest))

In [236]:
new_trainscores

[0.886899651926889,
 0.8858694803877191,
 0.885229525340659,
 0.888007866764481,
 0.8860897796091652]

In [237]:
new_testscores

[0.8844977211712556,
 0.8885559093463196,
 0.8913654242367485,
 0.8803146656677281,
 0.8876748251748252]

In [238]:
new_acttestscores

[0.8863704443334998,
 0.8864702945581627,
 0.8861208187718422,
 0.8863205192211683,
 0.8864702945581627]

In [243]:
for y_predict_prob in new_y_predicts_probs:
    print(roc_auc_score(ytest, y_predict_prob[:,1]))

0.6610210899198871
0.6607329222667135
0.6619486218050974
0.6631482541743832
0.6628049920000127


In [251]:
for y_predict in new_y_predicts:
    print(confusion_matrix(ytest, y_predict))

[[17719    40]
 [ 2238    33]]
[[17719    40]
 [ 2234    37]]
[[17717    42]
 [ 2236    35]]
[[17710    49]
 [ 2227    44]]
[[17718    41]
 [ 2232    39]]
