In [1]:
import numpy as np
import pandas as pd
import random as random
import cPickle as pickle
import re
import matplotlib.pyplot as plt
%matplotlib inline

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from scipy.sparse import csr_matrix


In [2]:
# Read the discharge-summary notes into a DataFrame and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer='mimic/NOTES_DISCHARGE_SUMMARY_REPORT.csv',
    sep=',',
)
data.head(10)

Unnamed: 0,subject_id,hadm_id,notes
0,17,161087,Admission Date: [**2135-5-9**] D...
1,17,194023,Admission Date: [**2134-12-27**] ...
2,23,124321,Admission Date: [**2157-10-18**] ...
3,23,152223,Admission Date: [**2153-9-3**] D...
4,34,144319,Admission Date: [**2191-2-23**] ...
5,36,122659,Admission Date: [**2131-5-12**] ...
6,36,165660,Admission Date: [**2134-5-10**] ...
7,36,182104,Admission Date: [**2131-4-30**] ...
8,68,108329,Admission Date: [**2174-1-4**] D...
9,68,170467,Admission Date: [**2173-12-15**] ...


In [3]:
# Text wrapped in [** ... **] holds dates or other uninformative placeholders;
# remove it before topic modeling.
def eliminateDate(s):
    """Strip every ``[** ... **]`` placeholder from a note.

    Only bracketed spans whose content starts and ends with ``**`` are
    removed. Any other square brackets are left untouched, so ordinary
    bracketed text keeps its brackets instead of being fused with the
    neighbouring words.

    Parameters
    ----------
    s : str
        Raw note text.

    Returns
    -------
    str
        The note with all placeholders removed; surrounding whitespace is
        kept as-is.
    """
    # The character class excludes '[' and ']' so a match never leaves its own
    # bracket pair (it may still span newlines); two separate placeholders are
    # therefore never merged into a single deletion.
    return re.sub(r'\[\*\*[^\[\]]*\*\*\]', '', s)

def eliminateStar(s):
    """Return '' when ``s`` begins and ends with ``**``; otherwise return ``s``."""
    # The length guard keeps a lone '*' from being treated as a wrapped segment.
    wrapped_in_stars = len(s) > 1 and s.startswith('**') and s.endswith('**')
    return '' if wrapped_in_stars else s

# Clean every note. Series.map returns a real Series on both Python 2 and
# Python 3 (the builtin map() is a lazy iterator on Python 3), and item
# assignment avoids writing a column through attribute access.
data['notes'] = data['notes'].map(eliminateDate)
data.head(10)

Unnamed: 0,subject_id,hadm_id,notes
0,17,161087,Admission Date: Discharge Date:...
1,17,194023,Admission Date: Discharge Date:...
2,23,124321,Admission Date: Discharge Date:...
3,23,152223,Admission Date: Discharge Date:...
4,34,144319,Admission Date: Discharge Date:...
5,36,122659,Admission Date: Discharge Date:...
6,36,165660,Admission Date: Discharge Date:...
7,36,182104,Admission Date: Discharge Date:...
8,68,108329,Admission Date: Discharge Date:...
9,68,170467,Admission Date: Discharge Date:...


In [4]:
# Use every note in the frame as a training sample.
n_samples = len(data.index)
samples = data['notes'][:n_samples]

In [5]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, an iterable of per-topic
        weight vectors that support ``argsort()``.
    feature_names : sequence of str
        Vocabulary; ``feature_names[i]`` is the word for weight index ``i``.
    n_top_words : int
        Number of words printed per topic, in decreasing weight order.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk the last n_top_words indices backwards.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic #%d:" % topic_idx)
        print(" ".join(feature_names[i] for i in top_indices))
    # print("") emits a blank line under both the Python 2 print statement and
    # the Python 3 print function; a bare print() shows "()" on Python 2.
    print("")

In [14]:
# LDA is fitted on tf (raw term count) features.
n_features, n_topics, n_top_words = 1000, 200, 50

print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
# Time only the fit/transform step.
t0 = time()
tf = tf_vectorizer.fit_transform(samples)
print("done in %0.3fs." % (time() - t0))


Extracting tf features for LDA...
done in 42.068s.


In [15]:
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(
    n_topics=n_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)
# Time only the fit itself.
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

# Show the top words of each fitted topic.
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features, n_samples=16535 and n_features=1000...
done in 476.913s.

Topics in LDA model:
Topic #0:
left status post blood times artery time disease chest coronary follow normal pressure started pulmonary dr care carotid right 10 100 tablet sodium sig 00 respiratory following discontinued cardiac white rate air drip po improved heparin room given tube sputum edema 30 extubated use mid showed vancomycin positive internal felt
Topic #1:
line catheter placement placed central tip right venous femoral removed internal vein place site procedure blood removal left time cath using used 10 daily new pressure use po given stable performed bleeding atrium tablet 30 normal dr continued cm previous phone need impression level prior received noted heparin high drip
Topic #2:
mass cancer biopsy cell right lesion ct lung disease lesions left dr scan treatment cm new resection lobe seen consistent tissue mri diagnosed underwent diagnosis showed cells large ca therapy upper ed

In [8]:
# Persist the fitted vectorizer together with the LDA model.
# 'wb': pickle streams are binary, and 'wr' is not a valid mode on Python 3.
# The with-block closes the file even if dump() raises.
with open('data/topic_model_'+ str(n_topics), 'wb') as note_file:
    pickle.dump((tf_vectorizer, lda), note_file)


In [9]:
# Run the term-count matrix through the fitted LDA model; the result is later
# stored row by row in data['topics'].
topics = lda.transform(tf)

In [16]:
print("Extracting tf-idf features for NMF...")
# NOTE: this rebinds n_features (1000 for the LDA term counts); the tf-idf
# pickle file name written later in the notebook is built from this value.
n_features = 400

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(samples)
print("done in %0.3fs." % (time() - t0))

# fit_transform already yields a scipy sparse matrix, so densify it directly
# instead of wrapping it in csr_matrix first.
tfidf = tfidf.toarray()
# Parenthesised form matches every other print in this notebook and works with
# both the Python 2 print statement and the Python 3 print function.
print(tfidf.shape)

Extracting tf-idf features for NMF...
done in 41.283s.
(16535, 400)


In [17]:
# Attach each matrix to the frame, one row (as a single cell value) per note.
data['topics'], data['tfidf'] = list(topics), list(tfidf)

data[['subject_id', 'hadm_id', 'topics', 'tfidf']].head(10)

Unnamed: 0,subject_id,hadm_id,topics,tfidf
0,17,161087,"[6.49350649351e-05, 6.49350649351e-05, 6.49350...","[0.0, 0.0518171067629, 0.0, 0.0, 0.0, 0.0, 0.0..."
1,17,194023,"[6.41025641026e-05, 6.41025641026e-05, 6.41025...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.236914986213,..."
2,23,124321,"[2.64900662252e-05, 0.127997346957, 0.01576299...","[0.0, 0.0, 0.0, 0.0, 0.177505298038, 0.0, 0.0,..."
3,23,152223,"[5.52486187845e-05, 5.52486187845e-05, 5.52486...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.160003723101, 0.0,..."
4,34,144319,"[2.95857988166e-05, 0.0652929059682, 2.9585798...","[0.0, 0.0, 0.0, 0.0, 0.0301439881524, 0.0, 0.0..."
5,36,122659,"[2.5974025974e-05, 2.5974025974e-05, 2.5974025...","[0.0247462243313, 0.0266583425072, 0.0, 0.0549..."
6,36,165660,"[0.00726900923191, 0.0201788107699, 0.00481127...","[0.0, 0.0221658366338, 0.0422658435003, 0.0914..."
7,36,182104,"[1.95694716243e-05, 0.0359782472654, 1.9569471...","[0.0, 0.0, 0.0177238367186, 0.0, 0.0, 0.0, 0.0..."
8,68,108329,"[4.44444444445e-05, 0.246976607057, 0.00873314...","[0.0, 0.0, 0.0438295819949, 0.0, 0.0, 0.0, 0.0..."
9,68,170467,"[1.85185185185e-05, 0.22560982218, 1.851851851...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [18]:
def roundTo(l,digit=4):
    """Round every value in ``l`` to ``digit`` decimal places.

    Parameters
    ----------
    l : iterable of numbers (list, tuple, 1-D numpy array, iterator, ...)
        Values to round.
    digit : int, optional
        Number of decimal places to keep (default 4).

    Returns
    -------
    list
        A concrete list of rounded values. Returning a list rather than a
        ``map`` object keeps the result reusable and displayable on Python 3
        as well as Python 2.
    """
    # Materialise plain iterators so numpy receives a sequence; arrays pass
    # through untouched.
    if not isinstance(l, np.ndarray):
        l = list(l)
    # One vectorised np.round call replaces one call per element.
    return list(np.round(l, digit))

# Round each note's topic weights to 6 decimals. Series.map yields a real
# Series on Python 2 and 3, unlike the builtin map(), which is lazy on Python 3.
data['topics'] = data['topics'].map(lambda weights: roundTo(weights, 6))
data[['subject_id','hadm_id','topics','tfidf']].head(10)

Unnamed: 0,subject_id,hadm_id,topics,tfidf
0,17,161087,"[6.5e-05, 6.5e-05, 6.5e-05, 6.5e-05, 0.11635, ...","[0.0, 0.0518171067629, 0.0, 0.0, 0.0, 0.0, 0.0..."
1,17,194023,"[6.4e-05, 6.4e-05, 6.4e-05, 0.166471, 0.11499,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.236914986213,..."
2,23,124321,"[2.6e-05, 0.127997, 0.015763, 2.6e-05, 0.01063...","[0.0, 0.0, 0.0, 0.0, 0.177505298038, 0.0, 0.0,..."
3,23,152223,"[5.5e-05, 5.5e-05, 5.5e-05, 0.116725, 0.153052...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.160003723101, 0.0,..."
4,34,144319,"[3e-05, 0.065293, 3e-05, 3e-05, 0.136083, 3e-0...","[0.0, 0.0, 0.0, 0.0, 0.0301439881524, 0.0, 0.0..."
5,36,122659,"[2.6e-05, 2.6e-05, 2.6e-05, 0.008467, 2.6e-05,...","[0.0247462243313, 0.0266583425072, 0.0, 0.0549..."
6,36,165660,"[0.007269, 0.020179, 0.004811, 0.008065, 0.043...","[0.0, 0.0221658366338, 0.0422658435003, 0.0914..."
7,36,182104,"[2e-05, 0.035978, 2e-05, 2e-05, 0.068407, 2e-0...","[0.0, 0.0, 0.0177238367186, 0.0, 0.0, 0.0, 0.0..."
8,68,108329,"[4.4e-05, 0.246977, 0.008733, 4.4e-05, 4.4e-05...","[0.0, 0.0, 0.0438295819949, 0.0, 0.0, 0.0, 0.0..."
9,68,170467,"[1.9e-05, 0.22561, 1.9e-05, 1.9e-05, 1.9e-05, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [13]:
# Persist the per-note topic weights.
# 'wb': pickle streams are binary, and 'wr' is not a valid mode on Python 3.
# The with-block closes the file even if dump() raises.
with open('data/note_topic_' + str(n_topics), 'wb') as note_file:
    pickle.dump(data[['subject_id','hadm_id','topics']], note_file)

In [19]:
# Persist the per-note tf-idf vectors.
# 'wb': pickle streams are binary, and 'wr' is not a valid mode on Python 3.
# The with-block closes the file even if dump() raises.
with open('data/note_tfidf_' + str(n_features), 'wb') as note_file:
    pickle.dump(data[['subject_id','hadm_id','tfidf']], note_file)