<a href="https://colab.research.google.com/github/thotran2015/6.871/blob/master/ICU_Mortality_Skeleton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import auth
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

## Loading Data


In [0]:
# Run the interactive Colab authentication flow for the current user.
# NOTE(review): presumably required so the `gsutil cp` cells can read the
# gs://hst-956 bucket — confirm bucket permissions.
auth.authenticate_user()

#### Lab Data

In [0]:
# Copy the gzipped ICU table from the gs://hst-956 bucket into the working directory.
!gsutil cp gs://hst-956/adult_icu.gz ./


In [0]:
# Load the downloaded ICU table; pandas infers gzip compression from the `.gz` suffix.
lab_df = pd.read_csv('adult_icu.gz')

#### Note Data

In [0]:
# Copy the gzipped notes table from the gs://hst-956 bucket into the working directory.
!gsutil cp gs://hst-956/adult_notes.gz ./


In [0]:
# Load the downloaded notes table; pandas infers gzip compression from the `.gz` suffix.
note_df = pd.read_csv('adult_notes.gz')

In [0]:
def get_best_model(model, parameters, X_val, y_val):
  """Grid-search `parameters` for `model` and return the winning estimator.

  The search uses GridSearchCV's default cross-validation and scoring, run on
  the data passed in as `X_val` / `y_val`.

  Args:
    model: Unfitted estimator to tune.
    parameters: Parameter grid (name -> list of candidate values).
    X_val: Feature matrix the search is run on.
    y_val: Labels aligned with `X_val`.

  Returns:
    The `best_estimator_` of the fitted search, i.e. the estimator configured
    with the best-scoring parameter combination.
  """
  search = GridSearchCV(estimator=model, param_grid=parameters)
  search.fit(X_val, y_val)
  return search.best_estimator_

## Question 2

Predicting hospital mortality from lab values



In [0]:
# Preview the first rows of the lab table to see which columns are available.
lab_df.head()

In [0]:
# NOTE: despite the `note` in these names, this cell processes the *lab* table.
# The names are kept because later cells reference them.
# Drop identifiers, outcome labels and split flags so only model inputs remain.
fea_note_df = lab_df.drop(columns=['subject_id', 'hadm_id', 'icustay_id', 'mort_hosp', 'mort_icu', 'mort_oneyr', 'train', 'test', 'valid'])

# Learn the standardization statistics from training rows only, then apply them
# to every row. Scaling with statistics from the whole table would leak
# validation/test information into preprocessing.
scaler = preprocessing.StandardScaler().fit(fea_note_df[lab_df['train'] == 1])
scaled_note_df = pd.DataFrame(
    scaler.transform(fea_note_df),
    columns=fea_note_df.columns,
    index=fea_note_df.index,  # keep lab_df's index so the assignment below aligns row-for-row
)

# Re-attach the split flags and the prediction target (left unscaled).
scaled_note_df[['train', 'test', 'valid', 'mort_hosp']] = lab_df[['train', 'test', 'valid', 'mort_hosp']]

In [0]:
# Partition the scaled lab table: a row belongs to a split when that split's flag equals 1.
df_train = scaled_note_df.loc[scaled_note_df['train'] == 1]
df_val = scaled_note_df.loc[scaled_note_df['valid'] == 1]
df_test = scaled_note_df.loc[scaled_note_df['test'] == 1]

# Feature matrix / label vector per split; features are the columns of fea_note_df.
X_train, y_train = df_train[fea_note_df.columns], df_train['mort_hosp']
X_val, y_val = df_val[fea_note_df.columns], df_val['mort_hosp']
X_test, y_test = df_test[fea_note_df.columns], df_test['mort_hosp']

# Candidate regularization strengths and penalty types for the grid search.
parameters = {
    'C': [0.1, 0.25, 0.5, 1.],
    'penalty': ['l1', 'l2'],
}
# liblinear supports both the l1 and l2 penalties listed in the grid.
model = LogisticRegression(solver='liblinear')

In [0]:
# Choose hyperparameters by grid search run on the validation split ...
best_model = get_best_model(model, parameters, X_val, y_val)
# ... then refit the selected estimator on the training split (this replaces
# the fit that the search performed on the validation data).
best_model.fit(X_train, y_train)

In [0]:
# Mean accuracy of the lab-value model on the held-out test split.
test_acc = best_model.score(X_test,y_test)
print('Test accuracy: ', test_acc)

In [0]:
# Column 1 of predict_proba is the probability of the second class in
# best_model.classes_ (presumably mort_hosp == 1 — verify the label encoding).
y_score = best_model.predict_proba(X_test)[:, 1]
# Area under the ROC curve on the test split.
auc = roc_auc_score(y_test, y_score)
print('AUC score: ', auc)

In [0]:
# Hard class predictions of the lab-value model on the test split.
y_pred = best_model.predict(X_test)
print('Confusion Matrix')
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)

## Question 3

Predicting hospital mortality from clinical notes


In [0]:
# Preview the first rows of the notes table to see which columns are available.
note_df.head()

In [0]:
# Note text for every row, regardless of split.
all_text = note_df['chartext']

# Partition the notes table: a row belongs to a split when that split's flag equals 1.
note_train = note_df[note_df['train'] == 1]
note_test = note_df[note_df['test'] == 1]
note_val = note_df[note_df['valid'] == 1]

In [0]:
# Bag-of-words token counter with default settings.
vectorizer = CountVectorizer()
# Learn the vocabulary from training notes only, so validation/test text does
# not influence the feature space.
vectorizer.fit(note_train['chartext'])

In [0]:
# Turn each split's note text into token-count features using the fitted
# vocabulary, and pull out the matching mortality labels.
note_Xtrain = vectorizer.transform(note_train['chartext'])
note_ytrain = note_train['mort_hosp']

note_Xtest = vectorizer.transform(note_test['chartext'])
note_ytest = note_test['mort_hosp']

note_Xval = vectorizer.transform(note_val['chartext'])
note_yval = note_val['mort_hosp']

In [0]:
# Candidate regularization strengths and penalty types for the grid search.
# This rebinds the notebook-level `parameters` name.
parameters = {
    'C': [0.1, 0.25, 0.5, 1.],
    'penalty': ['l1', 'l2'],
}
# liblinear supports both the l1 and l2 penalties listed in the grid.
model_note = LogisticRegression(solver='liblinear')
# Choose hyperparameters by grid search run on the notes validation split.
best_model_note = get_best_model(model_note, parameters, note_Xval, note_yval)


In [0]:
# Refit the selected notes estimator on the training split (this replaces the
# fit that the grid search performed on the validation data).
best_model_note.fit(note_Xtrain, note_ytrain)

In [0]:
# Mean accuracy of the notes model on the held-out test split.
# This overwrites the `test_acc` computed for the lab-value model.
test_acc = best_model_note.score(note_Xtest, note_ytest)
print('Test accuracy: ', test_acc)

In [0]:
# Column 1 of predict_proba is the probability of the second class in
# best_model_note.classes_ (presumably mort_hosp == 1 — verify the label encoding).
note_yscore = best_model_note.predict_proba(note_Xtest)[:, 1]
# Area under the ROC curve on the notes test split; overwrites the lab model's `auc`.
auc = roc_auc_score(note_ytest, note_yscore)
print('AUC score: ', auc)

In [0]:
# Hard class predictions of the notes model on the *notes* test split.
# The notes model was trained on bag-of-words features, so it must be evaluated
# on note_Xtest / note_ytest, not on the lab-value X_test / y_test.
y_pred = best_model_note.predict(note_Xtest)
print('Confusion Matrix')
# Rows are true labels, columns are predicted labels.
confusion_matrix(note_ytest, y_pred)

## Question 4

Analysis of data and results