# Preprocessing

Warning: this notebook takes a very, very long time to run.

In [1]:
import pandas as pd
import numpy as np
import spacy
import nltk
import re
import collections
from nltk import word_tokenize, sent_tokenize, text
from collections import Counter
pd.set_option("display.max_columns", None)
from sklearn import feature_extraction
import torch
import pickle

In [2]:
def preprocess(text):
    '''
    Normalize a raw note before it is tokenized.

    Steps, in order:
    - Remove every "*" character already present in the text, so the only
      stars left afterwards are the digit placeholders added below.
    - Convert alphabetic characters to lower case (Hello --> hello).
    - Replace each digit with a "*" character (2011 --> ****).

    @text is the string to normalize. Returns the normalized string.
    '''
    text = text.replace('*', '').lower()
    # Raw string: in a plain literal '\d' is an invalid escape sequence.
    text = re.sub(r'\d', '*', text)
    return text

def assign_numbers(vocab_list):
    '''
    Build a word -> index lookup for a vocabulary.
    Indices are handed out in iteration order starting at 2, because
    "0" is reserved for padding and "1" for unknown words.
    If a word occurs more than once, it keeps the index of its last occurrence.
    '''
    return {word: position for position, word in enumerate(vocab_list, start=2)}

def create_vocabulary(counter, threshold):
    '''
    Keep the tokens that are frequent enough and give each one an index.

    @counter maps each token to the number of times it appears in the corpus.
    @threshold = minimum count (inclusive) a token needs to enter the vocabulary.

    Returns the dictionary produced by assign_numbers for the kept tokens,
    in the order in which @counter yields them.
    '''
    frequent_tokens = [token for token, count in counter.items() if count >= threshold]
    return assign_numbers(frequent_tokens)

def token_to_index(tokens, token_indexer, list_format=False):
    """
    Convert the tokens of one document into a flat list of indices.

    @param tokens: list of tokens from one document
    @param token_indexer: dictionary that maps each token in the vocabulary to an index
    @param list_format: if true, the values of token_indexer are lists of indices
        (byte encoder case) and are spliced into the result; otherwise each value
        is a single index and is appended as-is.

    Tokens missing from token_indexer are represented by the single index 1
    (the "unknown" index), in both modes.
    """
    # Please DO NOT assign any ngram in the vocabulary to index 0
    unknown_index = 1
    document = []
    for token in tokens:
        # Only a missing key means "unknown token"; any other error is a real
        # problem with the inputs and is allowed to surface.
        try:
            entry = token_indexer[token]
        except KeyError:
            document.append(unknown_index)
            continue
        if list_format:
            document.extend(entry)
        else:
            document.append(entry)
    return document


def sentence_to_indexes(sentences, token_indexer, list_format=False):
    """
    Convert a document given as a list of sentence strings into a list of
    index lists, one inner list per sentence.

    @param sentences: list of sentences from one document
    @param token_indexer: dictionary that maps each token in the vocabulary to an index
    @param list_format: forwarded to token_to_index; set it when the values of
        token_indexer are lists (byte encoder case).
    """
    # Please DO NOT assign any ngram in the vocabulary to index 0
    # Each sentence is split into unigrams, then indexed on its own.
    return [
        token_to_index(word_tokenize(sentence), token_indexer, list_format=list_format)
        for sentence in sentences
    ]


#-------------------------- For Byte-Pair Encoding method

def split_tokens_into_characters(counter):
    '''Rewrite every token of @counter (token -> corpus count) as its characters
    separated by single spaces, followed by the end-of-word marker " </w>".
    The counts are carried over unchanged, e.g. {'cake': 3} -> {'c a k e </w>': 3}.'''
    return {
        " ".join(token) + ' </w>': count  # add ending character
        for token, count in dict(counter).items()
    }

def get_stats(vocab):
    '''Count adjacent symbol pairs across the vocabulary.

    @vocab maps space-separated symbol sequences to their frequency. Every pair
    of neighbouring symbols inside a sequence is credited with that frequency.
    For example {'b a k e': 2, 'a k': 1} gives (b,a):2, (a,k):3, (k,e):2.

    Returns a defaultdict(int) keyed by (left_symbol, right_symbol).
    '''
    pair_counts = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        # zip against the shifted list walks over neighbouring symbols
        for left, right in zip(symbols, symbols[1:]):
            pair_counts[left, right] += freq
    return pair_counts

def merge_vocab(pair, v_in):
    '''Merge one symbol pair everywhere it occurs in the vocabulary.

    @pair is a 2-tuple of symbols (left, right).
    @v_in maps space-separated symbol sequences to their frequency.

    Returns a new dictionary in which every occurrence of "left right" as two
    whole, neighbouring symbols is replaced by the single symbol "leftright".
    Frequencies are carried over unchanged.
    '''
    merged_symbol = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds make sure we only match whole symbols, never a suffix
    # or prefix of a longer symbol.
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement is inserted verbatim. A plain string would be
        # parsed as a template, so a symbol containing a backslash would be
        # corrupted (e.g. "\n" turned into a newline) or raise "bad escape".
        w_out = pattern.sub(lambda match: merged_symbol, word)
        v_out[w_out] = freq
    return v_out

def convert_tokens_into_frequent_character_combinations(counter, num_merges, ongoing = False):
    '''This function takes a Counter object (counts per token) and converts it so that
    the tokens (keys) become split into frequent character combinations.
    For example, {'cake': 3, 'baking': 1} might become {'c ak e': 3, 'b ak ing': 1}

    @num_merges is the number of "rounds" for merging character blocks. For each round, the
    pair of neighbouring blocks that co-occurs the most is combined.
    @num_merges is a very important hyperparameter.

    @ongoing = False if you are starting from 0 merges: @counter is then split into single
    characters first. With @ongoing = True, @counter must already be the output of a previous
    call; a copy of it is merged further. For efficiency reasons it often makes sense to do
    10,000 merges, then a subsequent 10,000 merges later on, rather than all 20,000 at once.

    Prints the round number every 1000 rounds as a progress indicator.
    Stops early if no pair is left to merge.
    '''
    if ongoing:
        char_dict = counter.copy()
    else:
        char_dict = split_tokens_into_characters(counter)

    # Pairs the original code excluded from merging (it marked them "#bugs").
    # Presumably the backslash broke the substitution in merge_vocab -- TODO confirm.
    excluded_pairs = (('\\**', '\\'), ('*', '\\'))

    for i in range(num_merges):
        if i % 1000 == 0:
            print(i)
        pairs = get_stats(char_dict)
        # Drop each excluded pair independently: one of them being absent
        # must not prevent the other from being removed.
        for excluded in excluded_pairs:
            pairs.pop(excluded, None)
        if not pairs:
            # Nothing left to merge; max() on an empty table would raise.
            break
        best = max(pairs, key=pairs.get)
        char_dict = merge_vocab(best, char_dict)
    return char_dict

def create_vocabulary_of_character_combos(char_dict):
    '''Build an indexed vocabulary of the character blocks used in @char_dict.

    The keys of @char_dict are unigrams written as space-separated blocks,
    e.g. {'c ak e': 3, 'b ak ing': 1}. Every distinct block ('c', 'ak', 'e',
    'b', 'ing') receives its own index through assign_numbers, with the blocks
    taken in the sorted order returned by np.unique.

    Eventually we want to map each unigram to a series of character blocks.
    '''
    # Collect the blocks of every unigram, repeats included
    all_blocks = [block for word in char_dict.keys() for block in word.split(' ')]
    # np.unique removes the repeats
    distinct_blocks = list(np.unique(all_blocks))
    return assign_numbers(distinct_blocks)

def map_unigram_to_character_indices(char_dict, Vocab_Index):
    '''Map each unigram to the list of indices of its character blocks.

    For example, if Vocab_Index = {'b': 2, 'ak': 3, 'ing</w>': 5}, the key
    'b ak ing</w>' of @char_dict yields the entry 'baking': [2, 3, 5].

    @char_dict is the counter-like object whose keys are unigrams split into
    character blocks; only its keys are used here.
    @Vocab_Index maps each character block to its index.
    '''
    unigrams = {}
    for split_form in char_dict.keys():
        # e.g. revert "b ak ing</w>" to "baking"
        plain_word = split_form.replace(' ', '').replace('</w>', '')
        unigrams[plain_word] = token_to_index(split_form.split(' '), Vocab_Index)
    return unigrams


In [3]:
def save_as_pkl(vocab, filename):
    '''Pickle the object @vocab and write it to the path @filename
    (the file is created or overwritten).'''
    with open(filename, 'wb') as pkl_file:
        pickle.dump(vocab, pkl_file)

def load_pkl(filename):
    '''Read the pickle stored at the path @filename and return the object it holds.'''
    # NOTE: unpickling can execute arbitrary code; only load files you created yourself.
    with open(filename, 'rb') as pkl_file:
        return pickle.load(pkl_file)

--------

# MIMIC II 

### We will only train the model on records that appear in MIMIC II, due to time and space constraints. 

### Determine which SUBJ_IDs should be in the training vs. test sets
Most studies with MIMIC text data have used the training/test split employed by Perotte et al. (2014) for MIMIC II. Please refer to https://physionet.org/works/ICD9CodingofDischargeSummaries/


Using the code provided in the link above (with some small edits), we found lists of SUBJECT_IDs that belong to the training set and the test set. Note that the HADM_IDs in MIMIC II are completely different from the HADM_IDs in MIMIC III, so we needed to use SUBJECT_ID as the primary key.

In [4]:
path = './Perotte code/'

#Subject IDs of the test set: one per line, kept as strings
with open(path+'testing_SUBJ_IDs.data') as fin:
    test_subj_ids = [line.strip('\n') for line in fin]

#Subject IDs of the MIMIC II training set, read the same way
with open(path+'training_SUBJ_IDs.data') as fin:
    train_subj_ids = [line.strip('\n') for line in fin]
#The training file carries a header line; remove() raises ValueError if it is missing
train_subj_ids.remove('"subject_id"')
#NOTE(review): the IDs stay strings here but are later matched against SUBJECT_ID columns -- confirm the dtypes line up

---------

## Create a dictionary that links each HADM_ID (hospital admission ID) to a list of ICD-9 Codes. 

#### NOTE: we will primarily consider the "rolled-up" ICD-9 codes. (Most codes are five digits but they can be "rolled" to a simpler three-digit code).



In [64]:
Diagnoses_ICD = pd.read_csv('../MIMIC-III/DIAGNOSES_ICD.csv')
#Drop rows without an ICD-9 code ("~" is the boolean-mask negation operator)
Diagnoses_ICD = Diagnoses_ICD[~Diagnoses_ICD['ICD9_CODE'].isnull()]

#Only consider rows that appear in MIMIC II
#.copy() makes the result an independent frame, so the column added below is
#not written to a slice of the original (avoids SettingWithCopyWarning)
Diagnoses_ICD = Diagnoses_ICD[(Diagnoses_ICD['SUBJECT_ID'].isin(train_subj_ids))|
                             (Diagnoses_ICD['SUBJECT_ID'].isin(test_subj_ids))].copy()

#Rolled-up code: the first 4 characters for codes starting with "E", the first 3 otherwise
Diagnoses_ICD['Rolled_ICD'] = np.where(Diagnoses_ICD['ICD9_CODE'].str[0] == 'E',
                                       Diagnoses_ICD['ICD9_CODE'].str[0:4],
                                       Diagnoses_ICD['ICD9_CODE'].str[0:3])

NumberCodes = len(Diagnoses_ICD['ICD9_CODE'].unique())
NumberRolled = len(Diagnoses_ICD['Rolled_ICD'].unique())

print('Unique ICD-9 codes:', NumberCodes, '\nUnique 3-Digit Codes:',NumberRolled, ' eventually reduced to 936 after eliminating codes that appear only in test set')
print('HADM_IDs', len(Diagnoses_ICD['HADM_ID'].unique()), '\nSUBJECT_IDs', 
                     len(Diagnoses_ICD['SUBJECT_ID'].unique()))

Diagnoses_ICD.head()

Unique ICD-9 codes: 5431 
Unique 3-Digit Codes: 970  eventually reduced to 936 after eliminating codes that appear only in test set
HADM_IDs 30050 
SUBJECT_IDs 21685


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,Rolled_ICD
0,1297,109,172335,1.0,40301,403
1,1298,109,172335,2.0,486,486
2,1299,109,172335,3.0,58281,582
3,1300,109,172335,4.0,5855,585
4,1301,109,172335,5.0,4254,425


In [6]:
#Map each HADM_ID to the sorted, de-duplicated list of its rolled-up ICD9 codes
HADMID_Code_Dict = dict()
Unique_Visits = Diagnoses_ICD['HADM_ID'].unique()

for visit in Unique_Visits:
    #All rolled codes recorded for this admission, repeats included
    visit_codes = Diagnoses_ICD.loc[Diagnoses_ICD['HADM_ID']==visit, 'Rolled_ICD'].tolist()
    #np.unique sorts the codes and removes the repeats
    HADMID_Code_Dict[visit] = list(np.unique(visit_codes))

HADMID_Code_Dict[100095]

['276',
 '285',
 '286',
 '403',
 '410',
 '414',
 '424',
 '428',
 '458',
 '486',
 '564',
 '585',
 '785']

---------

## Sidebar: Map each HADM_ID to its SUBJECT_ID (an admission belongs to exactly one subject, but a subject can have several admissions). This will be useful in determining whether a record belongs in the training or test set

In [7]:
#One row per distinct (SUBJECT_ID, HADM_ID) pair
IDs = Diagnoses_ICD[['SUBJECT_ID', 'HADM_ID']].drop_duplicates()
#Test = 1 when the subject is listed in the test set, 0 otherwise
IDs['Test'] = np.where(IDs['SUBJECT_ID'].isin(test_subj_ids), 1, 0)
#Sorting puts all Test == 0 rows first, so the position cut-off below only splits non-test rows
IDs = IDs.sort_values(by='Test').reset_index(drop=True)
NumTrain = round(0.70*len(IDs))
#Non-test rows past position NumTrain are re-labelled 0.5 (presumably the validation split -- confirm)
IDs['Test'] = np.where((IDs.index > NumTrain)&(IDs['Test']==0), 0.5, IDs['Test'])
#Keyword form: the positional axis argument of drop() is not accepted by newer pandas
IDs = IDs.drop(columns='SUBJECT_ID')
IDs.groupby('Test').count()

Unnamed: 0_level_0,HADM_ID
Test,Unnamed: 1_level_1
0.0,21036
0.5,6138
1.0,2876


In [62]:
#Turn the HADM_ID -> code-list dictionary into a two-column table
HADMID_Code_DF = pd.DataFrame(pd.Series(HADMID_Code_Dict)).reset_index()
HADMID_Code_DF = HADMID_Code_DF.rename(columns={0: 'ICD9_Codes','index': 'HADM_ID'})

#Same codes as one space-separated string per admission
HADMID_Code_DF['ICD9_Codes_str'] = HADMID_Code_DF['ICD9_Codes'].map(' '.join)
HADMID_Code_DF.head()

Unnamed: 0,HADM_ID,ICD9_Codes,ICD9_Codes_str
0,100006,"[203, 276, 309, 486, 493, 518, 785, V12, V15]",203 276 309 486 493 518 785 V12 V15
1,100007,"[401, 486, 557, 560, 997]",401 486 557 560 997
2,100009,"[250, 272, 278, 285, 401, 411, 414, 426, 440, ...",250 272 278 285 401 411 414 426 440 996 V15 V4...
3,100014,"[278, 300, 718, 726, 738, V45]",278 300 718 726 738 V45
4,100020,"[041, 276, 293, 337, 340, 344, 345, 369, 401, ...",041 276 293 337 340 344 345 369 401 428 530 56...


# --------

# Create a dictionary that links each HADM_ID to a Discharge Summary 

In [11]:
NoteEvents = pd.read_csv('../MIMIC-III/NOTEEVENTS.csv')

#Keep only the discharge summaries and the columns needed downstream
is_discharge_summary = NoteEvents['CATEGORY'] == 'Discharge summary'
note_columns = ['SUBJECT_ID','HADM_ID','CHARTDATE','DESCRIPTION', 'TEXT']
Notes = NoteEvents[is_discharge_summary].reset_index(drop=True)[note_columns]

#Keep only subjects listed in the MIMIC II training or test set
in_mimic2 = Notes['SUBJECT_ID'].isin(train_subj_ids) | Notes['SUBJECT_ID'].isin(test_subj_ids)
Notes = Notes[in_mimic2]

#Dummy dataset
#Notes = Notes[Notes['HADM_ID']<100400]
Notes.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,SUBJECT_ID,HADM_ID,CHARTDATE,DESCRIPTION,TEXT
0,22532,167853.0,2151-08-04,Report,Admission Date: [**2151-7-16**] Dischar...
1,13702,107527.0,2118-06-14,Report,Admission Date: [**2118-6-2**] Discharg...
2,13702,167118.0,2119-05-25,Report,Admission Date: [**2119-5-4**] D...
3,13702,196489.0,2124-08-18,Report,Admission Date: [**2124-7-21**] ...
4,26880,135453.0,2162-03-25,Report,Admission Date: [**2162-3-3**] D...


### Concatenate discharge summaries for each HADM_ID. 
An HADM_ID might have one "Report" and several "Addendums". Try to make it so that each HADM_ID is associated with EXACTLY one block of text. In order to do so, we'll need to concatenate texts for each HADM_ID.

In [18]:
NotesPerID = Notes.groupby('HADM_ID').count()[['TEXT']]

#HADM_IDs_with_MultipleCounts: the HADM_IDs with more than one discharge summary.
HADM_IDs_with_MultipleCounts = NotesPerID[NotesPerID['TEXT'] > 1].index

#NotesToAppend: rows of text that need to be merged according to HADM_ID
NotesToAppend = Notes[Notes['HADM_ID'].isin(HADM_IDs_with_MultipleCounts)]

#UnchangedNotes: rows whose HADM_ID has a single summary ("~" negates the boolean mask)
UnchangedNotes = Notes[~Notes['HADM_ID'].isin(HADM_IDs_with_MultipleCounts)]

#Collect the merged rows in a list and concatenate once at the end:
#DataFrame.append is gone from pandas 2.0 and re-copied the frame on every call.
merged_rows = []
for ID in HADM_IDs_with_MultipleCounts: #For each HADM_ID with multiple texts
    subdf = NotesToAppend[NotesToAppend['HADM_ID'] == ID] #create a smaller df with only that HADMID
    combined_text = subdf['TEXT'].str.cat() #combine all text in that column into one entry (no separator)
    #turn smaller df into 1-row df; .copy() so the assignment below does not write to a slice
    subdf = subdf.drop_duplicates(subset='HADM_ID').copy()
    subdf['TEXT'] = combined_text 
    merged_rows.append(subdf)

#Fall back to an empty frame with the same columns when nothing had to be merged
ChangedNotes = pd.concat(merged_rows) if merged_rows else Notes.iloc[0:0]

TotalNotes = pd.concat([ChangedNotes, UnchangedNotes]).reset_index(drop=True)
print(len(TotalNotes), len(TotalNotes['HADM_ID'].unique())) #now a 1-1 relationship.

29685 29685


### Merge with IDs column 

In [19]:
#Attach the Test flag to every note (join on HADM_ID), then order by split and admission
TotalNotes = IDs.merge(TotalNotes, on='HADM_ID').sort_values(by=['Test', 'HADM_ID'])

---------

## Preprocess the Summaries. 
## Then tokenize text into both I) words and II) sentences.

Preprocessing Steps
- Convert all alphabetical characters to lower case
- Eliminate any star(*) characters that do not correspond to numbers
- Convert numeric characters to star(*) -- so "2/2/2011" becomes "star/star/4stars"

In [20]:
#Normalize each note once, then tokenize the normalized text into words and into sentences
cleaned_text = TotalNotes['TEXT'].map(preprocess)
TotalNotes['Tokens'] = cleaned_text.map(word_tokenize)
TotalNotes['Sentences'] = cleaned_text.map(sent_tokenize)
TotalNotes.head()

Unnamed: 0,HADM_ID,Test,SUBJECT_ID,CHARTDATE,DESCRIPTION,TEXT,Tokens,Sentences
565,100007,0.0,23018,2145-04-07,Report,Admission Date: [**2145-3-31**] ...,"[admission, date, :, [, ****-*-**, ], discharg...",[admission date: [****-*-**] dis...
18269,100009,0.0,533,2162-05-21,Report,Admission Date: [**2162-5-16**] ...,"[admission, date, :, [, ****-*-**, ], discharg...",[admission date: [****-*-**] dis...
15062,100031,0.0,6892,2140-11-24,Report,Admission Date: [**2140-11-11**] Discha...,"[admission, date, :, [, ****-**-**, ], dischar...",[admission date: [****-**-**] discharge...
1383,100038,0.0,21234,2127-07-13,Report,Admission Date: [**2127-7-11**] ...,"[admission, date, :, [, ****-*-**, ], discharg...",[admission date: [****-*-**] dis...
17412,100045,0.0,1569,2176-02-15,Report,Admission Date: [**2176-2-5**] D...,"[admission, date, :, [, ****-*-*, ], discharge...",[admission date: [****-*-*] disc...


--------------

# Fixed Vocabulary Methods 

### Count the number of times each token appears in the corpus

In [21]:
TotalNotes = TotalNotes.reset_index(drop=True)

#Count unigrams over every discharge summary that is NOT in the test set
Corpus_Counts = Counter()
for test_flag, doc_tokens in zip(TotalNotes['Test'], TotalNotes['Tokens']):
    if test_flag != 1:
        Corpus_Counts.update(doc_tokens) #adds one count per token occurrence

### Restrict vocabulary to tokens that appear at least 5 times. 

In [22]:
#Index every token counted at least 5 times in Corpus_Counts, then report the vocabulary size
vocabulary5 = create_vocabulary(Corpus_Counts, 5)
print(len(vocabulary5))

38119


### Out-Of-Vocabulary Words. One approach is to link each OOV word to its nearest word in the vocabulary, using edit distance.  

In [23]:
#Gather a list of OOV words: every distinct token of the corpus that has no entry in vocabulary5
total_tokens = list(np.unique(np.concatenate(TotalNotes['Tokens'])))
#Test membership against the dict itself (hash lookup) instead of rebuilding a key list per token
OOVs5 = [x for x in total_tokens if x not in vocabulary5]
print(len(OOVs5))

131315


In [25]:
import Levenshtein #install and import Levenshtein package 
#Link each OOV to its closest vocabulary word
OOV_to_Vocab_Dict_5 = {}
vocab_words = list(vocabulary5.keys())
for i, token in enumerate(OOVs5, start=1):
    if i%20000==0:
        print(i) #progress indicator
    #min() keeps the first word with the smallest distance, i.e. the same winner
    #as sorting all distances with a stable sort and taking the head
    closest_vocab = min(vocab_words, key=lambda vocab: Levenshtein.distance(token, vocab))
    OOV_to_Vocab_Dict_5[token] = closest_vocab 

20000
40000
60000
80000
100000
120000


### Create an "expanded vocabulary" so that each OOV token is mapped to the same index as its closest vocabulary word.

In [26]:
#Start from the regular vocabulary and give every OOV the index of its closest vocabulary word
extended_vocabulary5 = vocabulary5.copy()
extended_vocabulary5.update(
    (OOV, vocabulary5[vocab_word]) for OOV, vocab_word in OOV_to_Vocab_Dict_5.items()
)

#Spot check: a misspelling and its correction should share one index
print(extended_vocabulary5['edmematous'], extended_vocabulary5['edematous'])

save_as_pkl(extended_vocabulary5, 'ExtendedVocab5')

11218 11218


In [27]:
#Number of entries: vocabulary words plus the OOV words mapped onto them
len(extended_vocabulary5)

169434

### Then convert each list of tokens (in the "Tokens" column) to a list of index numbers from the vocabulary -- e.g. "I like pie" becomes [34, 120, 17]  

In [30]:
#Regular Vocabulary - 5 tokens
#HADM_ID -> flat list of vocabulary indexes, one entry per discharge summary
#(rows are paired by position; TotalNotes has a fresh 0..n-1 index at this point)
HADMID_Text_Dict = {}
for hadm_id, doc_tokens in zip(TotalNotes['HADM_ID'].values, TotalNotes['Tokens']):
    HADMID_Text_Dict[hadm_id] = token_to_index(doc_tokens, vocabulary5)

#Same as directly above, except for the "Sentences" column:
#HADM_ID -> list of sentences, each a list of vocabulary indexes
HADMID_TextSent_Dict = {}
for hadm_id, doc_sentences in zip(TotalNotes['HADM_ID'].values, TotalNotes['Sentences']):
    HADMID_TextSent_Dict[hadm_id] = sentence_to_indexes(doc_sentences, vocabulary5, list_format=False)

In [31]:
#Vocabulary with only Edit Distance
#HADM_ID -> flat list of indexes taken from the extended (edit-distance) vocabulary
#(rows are paired by position; TotalNotes has a fresh 0..n-1 index at this point)
HADMID_Leven_Dict = {}
for hadm_id, doc_tokens in zip(TotalNotes['HADM_ID'].values, TotalNotes['Tokens']):
    HADMID_Leven_Dict[hadm_id] = token_to_index(doc_tokens, extended_vocabulary5)

#Same as directly above, except for the "Sentences" column
HADMID_LevenSent_Dict = {}
for hadm_id, doc_sentences in zip(TotalNotes['HADM_ID'].values, TotalNotes['Sentences']):
    HADMID_LevenSent_Dict[hadm_id] = sentence_to_indexes(doc_sentences, extended_vocabulary5, list_format=False)

-----------------

# Byte Pair Encoding
Alternative method for dealing with out-of-vocabulary words. To get a better feel for how the algorithm works, please refer to the notebook titled "Byte Pair Encoding"

### First: create vocabulary of "character blocks". Some of these blocks will be full unigrams; other blocks will simply be character n-grams 

In [32]:
#Key parameter is number of merges

#5000 merges
#Starts from the raw token counts (ongoing defaults to False), so every token is first split into characters.
WeirdCts5000 = convert_tokens_into_frequent_character_combinations(Corpus_Counts, 5000)
#Index the distinct character blocks that remain after the merges
CharVocabulary5000 = create_vocabulary_of_character_combos(WeirdCts5000)

0
1000
2000
3000
4000


In [33]:
#10000 merges
#Continues from the 5000-merge result (ongoing=True) and applies 5000 further merge rounds.
WeirdCts10000 = convert_tokens_into_frequent_character_combinations(WeirdCts5000, 5000, ongoing=True )
CharVocabulary10000 = create_vocabulary_of_character_combos(WeirdCts10000)

0
1000
2000
3000
4000


In [34]:
#25000 merges
#Continues from the 10000-merge result (ongoing=True) and applies 15000 further merge rounds.
WeirdCts25000 = convert_tokens_into_frequent_character_combinations(WeirdCts10000, 15000, ongoing=True)
CharVocabulary25000 = create_vocabulary_of_character_combos(WeirdCts25000)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000


### Now, translate items from Character Vocabulary to Unigram Vocabulary (named here as byte vocabulary). For example, if "photo" = 352 and "graphy</w>" = 231, the unigram "photography" should be represented as [352, 231] in the Byte vocabulary.  

In [35]:
#One unigram -> block-index-list vocabulary per merge setting
ByteVocabulary25000 = map_unigram_to_character_indices(WeirdCts25000, CharVocabulary25000)
ByteVocabulary10000 = map_unigram_to_character_indices(WeirdCts10000, CharVocabulary10000)
ByteVocabulary5000 = map_unigram_to_character_indices(WeirdCts5000, CharVocabulary5000)

#Report, for each setting, the number of character blocks and then the number of unigrams
for char_vocab, byte_vocab in ((CharVocabulary25000, ByteVocabulary25000),
                               (CharVocabulary10000, ByteVocabulary10000),
                               (CharVocabulary5000, ByteVocabulary5000)):
    print(len(char_vocab))
    print(len(byte_vocab))

24757
157112
10013
157112
5036
157112


In [126]:
#Display the split-unigram -> count dictionary obtained after 10000 merges
WeirdCts10000

{'re do / redo</w>': 2,
 'fr ame -</w>': 1,
 'hemi block</w>': 38,
 'e stro gen / pro ge ster one</w>': 2,
 'nur s ery</w>': 3,
 'thy m om a obese</w>': 1,
 'cit rul lin ated</w>': 1,
 'k it ch en</w>': 142,
 'tak e- home</w>': 1,
 'min d ful</w>': 4,
 'hyper expan ded</w>': 18,
 'im a es</w>': 1,
 '* bms</w>': 5,
 'lar ge - scale</w>': 1,
 'carb am z -</w>': 7,
 'hypo xi a/ flash</w>': 2,
 'hemorrha g e. with</w>': 1,
 'microcy ts</w>': 1,
 'por c ess</w>': 1,
 'wa ff le</w>': 26,
 'pre- mental</w>': 1,
 'nect ar/ thickened</w>': 2,
 'pe t te chi ae</w>': 1,
 '* sed</w>': 1,
 'cit o lo pr am</w>': 1,
 'sub species</w>': 1,
 'mechan ism</w>': 198,
 "ker ni g 's</w>": 1,
 'pir ul ent</w>': 1,
 'splen ic/ portal</w>': 1,
 'drain ag e/ discharge</w>': 1,
 'hypo pne ic</w>': 2,
 'van i q a</w>': 1,
 'sub se q e un tly</w>': 6,
 'anti ver t</w>': 19,
 'gua i af ension</w>': 1,
 'enter o- vaginal</w>': 1,
 'w/ oropharyngeal</w>': 1,
 'ag gr ic ul ture</w>': 5,
 'drain i g</w>': 1,
 'syn chro

In [36]:
#Code for saving the vocabularies: one pickle per (file name, vocabulary) pair
vocabularies_to_save = [
    ('Byte25000Vocabulary', ByteVocabulary25000),
    ('Char25000Vocabulary', CharVocabulary25000),
    ('Byte10000Vocabulary', ByteVocabulary10000),
    ('Char10000Vocabulary', CharVocabulary10000),
    ('Byte5000Vocabulary', ByteVocabulary5000),
    ('Char5000Vocabulary', CharVocabulary5000),
]
for filename, vocab in vocabularies_to_save:
    save_as_pkl(vocab, filename)

In [37]:
#For ByteVocabulary25000, ByteVocabulary10000 and ByteVocabulary5000 (in that order), print
#the number of unigrams represented by exactly one character block, then the number of all other unigrams
for byte_vocab in (ByteVocabulary25000, ByteVocabulary10000, ByteVocabulary5000):
    singles = sum(1 for blocks in byte_vocab.values() if len(blocks) == 1)
    multiples = len(byte_vocab) - singles
    print(singles, multiples)

15692 141420
5796 151316
2666 154446


### Dealing with words that only appear in the test set 

In [40]:
#Distinct tokens of the test-set documents
tester0 = TotalNotes[TotalNotes['Test']==1]
tester = tester0.reset_index(drop=True)
test_tokens = list(np.unique(np.concatenate(tester['Tokens'])))
#Tokens with no entry in the byte vocabulary; membership is tested against the dict
#itself (hash lookup) instead of rebuilding a key list for every token
unique_test_tokens = [x for x in test_tokens if x not in ByteVocabulary5000]
len(unique_test_tokens)

12322

In [51]:
def _complete_with_blocks(remaining, non_enders, enders, extra_blocks):
    '''Split @remaining into at most @extra_blocks non-ending blocks followed by one
    ending block. Longer leading blocks are tried first. Returns the list of block
    strings of the first split found, or None when there is none.'''
    if remaining in enders:
        return [remaining]
    if extra_blocks == 0:
        return None
    for size in range(len(remaining), -1, -1):
        head = remaining[:size]
        if head in non_enders:
            tail = _complete_with_blocks(remaining[size:], non_enders, enders, extra_blocks - 1)
            if tail is not None:
                return [head] + tail
    return None


def find_ngrams(CharVocab, test_tokens):
    '''
    This function attempts to map unknown test set words to combinations of character blocks.

    @CharVocab maps each character block to its index. Blocks containing "</w>" can only
    end a word; all other blocks can only appear before the last position.
    @test_tokens is an iterable of words (without the "</w>" marker).

    Each word is split into 2, 3 or 4 blocks: one to three non-ending blocks followed by
    exactly one ending block. At every position the longest matching block is tried first,
    and the first complete split found is kept. (Previously a 4-block match did not stop the
    search, so a later, less preferred split could overwrite it.)

    Returns a dictionary word -> list of block indexes. Words that cannot be split this way
    are left out of the result.
    '''
    # Sets give constant-time lookups instead of scanning the whole vocabulary per word
    non_enders = {bloc for bloc in CharVocab if '</w>' not in bloc}
    enders = {bloc for bloc in CharVocab if '</w>' in bloc}

    Ngrams = {}
    for token in test_tokens:
        word = token + '</w>'
        # Candidate first blocks, longest first
        for size in range(len(word), -1, -1):
            first = word[:size]
            if first not in non_enders:
                continue
            # Up to two more non-ending blocks, then the ending block
            rest = _complete_with_blocks(word[size:], non_enders, enders, 2)
            if rest is not None:
                # word[:-4] drops the "</w>" marker again
                Ngrams[word[:-4]] = [CharVocab[bloc] for bloc in [first] + rest]
                break
    return Ngrams

In [52]:
#Try to split every unseen test-set token into known character blocks, once per merge setting
NGrams5K = find_ngrams(CharVocabulary5000, unique_test_tokens)
NGrams10K = find_ngrams(CharVocabulary10000, unique_test_tokens)
NGrams25K = find_ngrams(CharVocabulary25000, unique_test_tokens)

In [67]:
#Byte vocabularies extended with the test-only words that could be split into blocks
#(on a shared key the entry from the NGrams dictionary wins)
ByteVocabulary5000_new = {**ByteVocabulary5000, **NGrams5K}
ByteVocabulary10000_new = {**ByteVocabulary10000, **NGrams10K}
ByteVocabulary25000_new = {**ByteVocabulary25000, **NGrams25K}

In [88]:
#For the extended 25000, 10000 and 5000 byte vocabularies (in that order), print the number
#of unigrams represented by exactly one character block, then the number of all other unigrams
for byte_vocab in (ByteVocabulary25000_new, ByteVocabulary10000_new, ByteVocabulary5000_new):
    singles = sum(1 for blocks in byte_vocab.values() if len(blocks) == 1)
    multiples = len(byte_vocab) - singles
    print(singles, multiples)

15692 153017
5796 161409
2666 162782


In [117]:
#Print every split unigram whose corpus count is exactly 1484
Y = WeirdCts5000.copy()
matching_words = [word for word, value in Y.items() if value == 1484]
for word in matching_words:
    print(word)

occu pati onal</w>
stasis</w>


In [None]:
#Display the token -> count table built from the non-test documents
Corpus_Counts

In [157]:
#Tokens counted at least 34 times in Corpus_Counts, and how many there are
Vocabulary = [key for key, value in Corpus_Counts.items() if value >= 34]
print(len(Vocabulary))

15551


In [132]:
#Number of distinct tokens counted in Corpus_Counts
len(Corpus_Counts)

157112

### Convert summaries to indexed numbers from Byte Vocabulary  

In [69]:
#Byte with 25000 merges
#Tokens are words: HADM_ID -> flat list of character-block indexes
#(rows are paired by position; TotalNotes has a fresh 0..n-1 index at this point)
HADMID_Byte25000_Dict = {}
for hadm_id, doc_tokens in zip(TotalNotes['HADM_ID'].values, TotalNotes['Tokens']):
    HADMID_Byte25000_Dict[hadm_id] = token_to_index(doc_tokens, ByteVocabulary25000_new, list_format=True)

#For Sentences: HADM_ID -> list of sentences, each a list of character-block indexes
HADMID_Byte25000Sent_Dict = {}
for hadm_id, doc_sentences in zip(TotalNotes['HADM_ID'].values, TotalNotes['Sentences']):
    HADMID_Byte25000Sent_Dict[hadm_id] = sentence_to_indexes(doc_sentences, ByteVocabulary25000_new, list_format=True)

In [70]:
#Byte with 10000 merges
#Tokens are words: HADM_ID -> flat list of character-block indexes
#(rows are paired by position; TotalNotes has a fresh 0..n-1 index at this point)
HADMID_Byte10000_Dict = {}
for hadm_id, doc_tokens in zip(TotalNotes['HADM_ID'].values, TotalNotes['Tokens']):
    HADMID_Byte10000_Dict[hadm_id] = token_to_index(doc_tokens, ByteVocabulary10000_new, list_format=True)

#For Sentences: HADM_ID -> list of sentences, each a list of character-block indexes
HADMID_Byte10000Sent_Dict = {}
for hadm_id, doc_sentences in zip(TotalNotes['HADM_ID'].values, TotalNotes['Sentences']):
    HADMID_Byte10000Sent_Dict[hadm_id] = sentence_to_indexes(doc_sentences, ByteVocabulary10000_new, list_format=True)

In [71]:
#Byte with 5000 merges
#Tokens are words: HADM_ID -> flat list of character-block indexes
#(rows are paired by position; TotalNotes has a fresh 0..n-1 index at this point)
HADMID_Byte5000_Dict = {}
for hadm_id, doc_tokens in zip(TotalNotes['HADM_ID'].values, TotalNotes['Tokens']):
    HADMID_Byte5000_Dict[hadm_id] = token_to_index(doc_tokens, ByteVocabulary5000_new, list_format=True)

#For Sentences: HADM_ID -> list of sentences, each a list of character-block indexes
HADMID_Byte5000Sent_Dict = {}
for hadm_id, doc_sentences in zip(TotalNotes['HADM_ID'].values, TotalNotes['Sentences']):
    HADMID_Byte5000Sent_Dict[hadm_id] = sentence_to_indexes(doc_sentences, ByteVocabulary5000_new, list_format=True)

---------

# Final Experiment: Hybrid Approach
Map an OOV word to its closest vocabulary word only when the edit distance is less than 2. The remaining OOVs are handled with Byte-Pair Encoding, which is useful for rare words that are very long.

In [159]:
import Levenshtein #install and import Levenshtein package 
#Link an OOV to its closest vocabulary word only if the edit distance is below 2. 

OOV_to_Vocab_Dict_SmallDiff5 = {}
vocab_words = list(vocabulary5.keys())
for i, token in enumerate(OOVs5, start=1):
    if i%20000==0:
        print(i) #progress indicator
    #min() keeps the first word with the smallest distance, i.e. the same winner
    #as sorting all distances with a stable sort and taking the head
    closest_vocab = min(vocab_words, key=lambda vocab: Levenshtein.distance(token, vocab))
    closest_dist = Levenshtein.distance(token, closest_vocab)
    if closest_dist < 2:
        OOV_to_Vocab_Dict_SmallDiff5[token] = closest_vocab 

20000
40000
60000
80000
100000
120000


Change text using this new OOV_to_Vocab_Dict_SmallDiff5

In [160]:
def convert_words_in_text(Tokens_List, OOV_to_Vocab_Dict_SmallDiff):
    '''Replace OOV tokens that have a close vocabulary match with that match.

    For example, with the mapping {'Yerk': 'York'} the tokens
    [I, love, zfasefvas, New, Yerk] become [I, love, zfasefvas, New, York];
    tokens without an entry in the mapping are left untouched.

    @Tokens_List is a list of unigram tokens from a particular document.
    @OOV_to_Vocab_Dict_SmallDiff maps an OOV token to its replacement vocabulary word.

    Returns a new list. Tokens_List itself is not modified, so a caller that
    keeps a reference to the input (e.g. a DataFrame column) still sees the
    uncorrected tokens.
    '''
    return [OOV_to_Vocab_Dict_SmallDiff.get(token, token) for token in Tokens_List]

def convert_words_in_text_sentence(Sentence_List, OOV_to_Vocab_Dict_SmallDiff):
    '''Apply the OOV-to-vocabulary replacement to every sentence of a document.

    Each sentence is split with word_tokenize, every token that has an entry
    in the mapping is replaced by its vocabulary word, and the tokens are
    joined back with single spaces (so punctuation ends up space-separated,
    e.g. 'This ias hard.' becomes 'This is hard .').

    @Sentence_List is a list of sentences (strings) from a particular document.
    @OOV_to_Vocab_Dict_SmallDiff maps an OOV token to its replacement vocabulary word.

    Returns a new list of sentences. Sentence_List itself is not modified, so
    a caller that keeps a reference to the input still sees the original text.
    '''
    corrected_sentences = []
    for sentence in Sentence_List:
        words = word_tokenize(sentence)
        corrected_sentences.append(
            ' '.join(OOV_to_Vocab_Dict_SmallDiff.get(word, word) for word in words)
        )
    return corrected_sentences

In [164]:
sentences = ['This ias hard.', 'We are in New Yerk.']
oovt = {'ias': 'is', 'Yerk':'York'}
print(convert_words_in_text_sentence(sentences, oovt))
# convert_words_in_text expects a list of tokens, not a raw string, so tokenize first.
print(convert_words_in_text(word_tokenize(sentences[0]), oovt))

['This is hard .', 'We are in New York .']
This is hard .


In [165]:
# Apply the small-edit-distance corrections to every summary, both in its
# word-token form and in its sentence form.
TotalNotes['Tokens-Hybrid'] = TotalNotes['Tokens'].map(
    lambda doc_tokens: convert_words_in_text(doc_tokens, OOV_to_Vocab_Dict_SmallDiff5)
)
TotalNotes['Sentences-Hybrid'] = TotalNotes['Sentences'].map(
    lambda doc_sentences: convert_words_in_text_sentence(doc_sentences, OOV_to_Vocab_Dict_SmallDiff5)
)

# Unigram counts over the corrected tokens of every summary outside the test set.
Corpus_Counts_Hybrid = Counter()
outside_test = TotalNotes['Test'] != 1
for doc_tokens in TotalNotes.loc[outside_test, 'Tokens-Hybrid']:
    Corpus_Counts_Hybrid.update(doc_tokens)

In [166]:
# Number of distinct tokens in Corpus_Counts vs. in the counter built from the hybrid-corrected tokens.
print(len(Corpus_Counts), len(Corpus_Counts_Hybrid))

157112 104968


In [167]:
# Number of OOV tokens (OOVs5) vs. how many of them were mapped to a
# vocabulary word by a single edit operation (edit distance below 2).
print(len(OOVs5), len(OOV_to_Vocab_Dict_SmallDiff5))

131315 57201


### Dealing with words that only appear in the test set 

In [188]:
# Rows belonging to the test set, re-indexed from 0.
tester0 = TotalNotes[TotalNotes['Test']==1]
tester = tester0.reset_index(drop=True)
# Distinct tokens occurring anywhere in the test-set summaries.
test_tokens_hybrid = list(np.unique(np.concatenate(tester['Tokens-Hybrid'])))
# Keep only the tokens never counted in Corpus_Counts_Hybrid. Membership is
# tested on the Counter itself (a hash lookup) rather than on a list of its
# keys rebuilt for every token.
unique_test_tokens_hybrid = [x for x in test_tokens_hybrid if x not in Corpus_Counts_Hybrid]
len(unique_test_tokens_hybrid)

7265

In [189]:
# For comparison: size of unique_test_tokens (built elsewhere in the notebook;
# presumably the test-only tokens before the hybrid correction — TODO confirm).
len(unique_test_tokens)

12322

### Now apply byte-pair encoding

In [190]:
#10000 merges
# Build the byte-pair representation from the hybrid token counts, then derive the
# character-combination vocabulary from it. Both helpers are defined elsewhere in the
# notebook; the 0, 1000, ... lines in the output are presumably their progress messages.
WeirdCts10000_Hybrid = convert_tokens_into_frequent_character_combinations(Corpus_Counts_Hybrid, 10000)
HybCharVocabulary10000 = create_vocabulary_of_character_combos(WeirdCts10000_Hybrid)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [191]:
# Map every unigram to character-combination indices (helper defined elsewhere in the
# notebook), then report the size of that map and of the character-combination vocabulary.
HybridVocabulary10000 = map_unigram_to_character_indices(WeirdCts10000_Hybrid, HybCharVocabulary10000)
print(len(HybridVocabulary10000))
print(len(HybCharVocabulary10000))

# Persist both mappings (save_as_pkl is defined elsewhere in the notebook).
save_as_pkl(HybridVocabulary10000, 'HybridVocabulary10000')
save_as_pkl(HybCharVocabulary10000, 'HybCharVocabulary10000')

104968
9977


In [193]:
# presumably expresses the test-only tokens through known character combinations —
# TODO confirm against find_ngrams, which is defined elsewhere in the notebook
NGrams10K_Hybrid = find_ngrams(HybCharVocabulary10000, unique_test_tokens_hybrid)

In [197]:
# Size of the find_ngrams result, in absolute terms and relative to the number of test-only tokens.
print(len(NGrams10K_Hybrid))
print(len(NGrams10K_Hybrid)/len(unique_test_tokens_hybrid))

5349
0.7362697866483139


In [198]:
# Union of the two mappings; if a key occurs in both, the NGrams10K_Hybrid entry wins.
# Note that dict(a, **b) only works when every key of b is a string.
HybridVocabulary10000_new = dict(HybridVocabulary10000, **NGrams10K_Hybrid)

In [199]:
# How many entries of HybridVocabulary10000_new are represented by exactly one
# character block ("singles") versus any other number of blocks ("multiples").
singles = sum(1 for blocks in HybridVocabulary10000_new.values() if len(blocks) == 1)
multiples = len(HybridVocabulary10000_new) - singles
print(singles, multiples)

5661 104656


### Convert summaries (in "Hybrid" columns) to indexed numbers from Hybrid Vocabulary  

In [200]:
# Hybrid vocabulary built with 10000 merges.
# Word level: each summary's corrected token list is converted with
# token_to_index and stored under its HADM_ID. Rows are paired positionally,
# which matches a label-based lookup as long as TotalNotes has the default
# 0..n-1 index.
HADMID_Hybrid10000_Dict = {
    hadm_id: token_to_index(doc_tokens, HybridVocabulary10000_new, list_format=True)
    for hadm_id, doc_tokens in zip(TotalNotes['HADM_ID'].values, TotalNotes['Tokens-Hybrid'].values)
}

# Sentence level: same idea, converting each summary's corrected sentence list with sentence_to_indexes.
HADMID_Hybrid10000Sent_Dict = {
    hadm_id: sentence_to_indexes(doc_sentences, HybridVocabulary10000_new, list_format=True)
    for hadm_id, doc_sentences in zip(TotalNotes['HADM_ID'].values, TotalNotes['Sentences-Hybrid'].values)
}

----------------

# Create DataFrames so that:
- Each row is a HADM_ID; 
- Columns for each row: the SubjectID, Summary Text in fixed vocabulary (token and sentence form), Summary Text using byte pair encoding (token and sentence form), and ICD-9 Codes 

In [201]:
# ICD-9 codes: turn the {HADM_ID: codes} dictionary into a two-column frame.
HADMID_Code_DF = (
    pd.Series(HADMID_Code_Dict)
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'HADM_ID', 0: 'ICD9_Codes'})
)

def create_separate_df(token_dict, sent_dict, IDs, codes, version, train_fraction=0.7):
    '''Build the per-admission dataframe for one preprocessing technique.

    The notebook calls this once per technique, with the versions 'PlainVocab',
    'Leven', 'Byte5K', 'Byte10K', 'Byte25K' and 'Hybrid10K'.

    The result has one row per HADM_ID that is present in token_dict, sent_dict,
    IDs and codes (inner merges), sorted by 'Test' then 'HADM_ID'. On top of the
    columns of IDs and codes it contains:
    - 'Tokens-<version>' and 'Sentences-<version>': the entries of token_dict / sent_dict
    - 'Length Tokens', 'Unique Tokens', 'Length Sentences': sizes of those entries
    - 'ICD9_Codes_str': the codes of 'ICD9_Codes' joined by single spaces

    @token_dict is the dictionary of summaries tokenized by word, keyed by HADM_ID
    @sent_dict is the dictionary of summaries tokenized by sentence, keyed by HADM_ID
    @IDs is a df with 'HADM_ID' and 'Test' columns ('Test' == 1 marks the test set)
    @codes is a df listing the ICD9 codes ('ICD9_Codes') for each HADM_ID
    @version is the suffix used to name the token and sentence columns
    @train_fraction is the share of ALL rows (test rows included) that forms the
     training block; non-test rows positioned after it become the validation
     set and get 'Test' == 0.5.
    '''

    token_df = pd.DataFrame(pd.Series(token_dict)).\
                 reset_index().rename(columns={'index': 'HADM_ID', 0:'Tokens-'+version})
    sent_df = pd.DataFrame(pd.Series(sent_dict)).\
                 reset_index().rename(columns={'index': 'HADM_ID', 0:'Sentences-'+version})

    final_df = pd.merge(token_df, sent_df, on='HADM_ID')
    final_df = pd.merge(IDs, final_df, on='HADM_ID')
    final_df = pd.merge(final_df, codes, on='HADM_ID')

    final_df['Length Tokens'] = final_df['Tokens-'+version].map(len)
    final_df['Unique Tokens'] = final_df['Tokens-'+version].map(lambda x: len(np.unique(x)))
    final_df['ICD9_Codes_str'] = final_df['ICD9_Codes'].map(' '.join)
    final_df['Length Sentences'] = final_df['Sentences-'+version].map(len)

    # Create Validation Set.
    # Sorting puts the non-test rows (Test == 0) first, so the row position can
    # be compared against the size of the training block.
    final_df = final_df.sort_values(by=['Test','HADM_ID']).reset_index(drop=True)
    NumTrain = round(train_fraction*len(final_df))
    final_df['Test'] = np.where((final_df.index >= NumTrain)&(final_df['Test']==0), 0.5, final_df['Test'])
    return final_df


# Build the full dataframe for each of the six preprocessing versions.
Full_PlainVocab = create_separate_df(HADMID_Text_Dict, HADMID_TextSent_Dict, IDs, HADMID_Code_DF,
                                      'PlainVocab')

Full_Leven = create_separate_df(HADMID_Leven_Dict, HADMID_LevenSent_Dict, IDs, HADMID_Code_DF,
                                      'Leven')

Full_Byte5K = create_separate_df(HADMID_Byte5000_Dict, HADMID_Byte5000Sent_Dict, 
                                    IDs, HADMID_Code_DF, 'Byte5K')

Full_Byte10K = create_separate_df(HADMID_Byte10000_Dict, HADMID_Byte10000Sent_Dict, 
                                    IDs, HADMID_Code_DF, 'Byte10K')

Full_Byte25K = create_separate_df(HADMID_Byte25000_Dict, HADMID_Byte25000Sent_Dict, 
                                    IDs, HADMID_Code_DF, 'Byte25K')

Full_Hybrid10K = create_separate_df(HADMID_Hybrid10000_Dict, HADMID_Hybrid10000Sent_Dict, 
                                   IDs, HADMID_Code_DF, 'Hybrid10K')



# Report the split sizes, using Full_Leven as the reference frame
# ('Test' is 0 for training, 0.5 for validation and 1 for test rows).
print('Rows in Final DF for MIMIC II: '+str(len(Full_Leven)),
     '\nRows in MIMIC II Training Set: '+str(len(Full_Leven[Full_Leven['Test']==0])),
    '\nRows in MIMIC II Validation Set: '+str(len(Full_Leven[Full_Leven['Test']==0.5])),
     '\nRows in MIMIC II Test Set: '+str(len(Full_Leven[Full_Leven['Test']==1])))

Rows in Final DF for MIMIC II: 29683 
Rows in MIMIC II Training Set: 20778 
Rows in MIMIC II Validation Set: 6075 
Rows in MIMIC II Test Set: 2830


### Analysis of Sets 

In [202]:
# Mean number of tokens per summary, one line per representation.
for version_df in (Full_Leven, Full_Byte5K, Full_Byte10K, Full_Byte25K, Full_Hybrid10K):
    print(version_df['Length Tokens'].mean())

# Mean number of sentences per summary (taken from the Byte5K frame).
print(Full_Byte5K['Length Sentences'].mean())

2018.591887612438
2303.1586429943063
2123.986221069299
2045.2408786173905
2121.137924064279
147.50143179597748


In [203]:
def count_unknowns(token_doc):
    '''Return how many entries of a summary equal 1, the index reserved for unknown (OOV) words.'''
    return sum(1 for token_index in token_doc if token_index == 1)
    
# Unknown-token count per summary, then the mean summary length and the mean number of unknowns.
Full_PlainVocab['UNKs'] = Full_PlainVocab['Tokens-PlainVocab'].map(count_unknowns)
for column in ('Length Tokens', 'UNKs'):
    print(Full_PlainVocab[column].mean())

2018.591887612438
6.7230064346595695


-----------

# Build subset of TotalChart for testing  

In [204]:
def create_sub_df(full_chart, max_sentences=120, n_train=6300, n_valid=1800):
    '''Build a reduced dataframe restricted to short summaries.

    Only rows whose 'Length Sentences' is at most max_sentences are kept. From
    those, the result stacks, in this order, the first n_train training rows
    (Test == 0), the first n_valid validation rows (Test == 0.5) and every
    test row (Test == 1), and is re-indexed from 0.
    '''
    fc = full_chart[full_chart['Length Sentences'] <= max_sentences]
    SubTrain = fc[fc['Test'] == 0].head(n_train)
    SubValid = fc[fc['Test'] == 0.5].head(n_valid)
    SubTest = fc[fc['Test'] == 1]
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    return pd.concat([SubTrain, SubValid, SubTest]).reset_index(drop=True)

def _take_split_rows(full_chart, n_train, n_valid, n_test):
    '''Stack the first n_train training rows, the first n_valid validation rows and the last n_test test rows.'''
    split_flag = full_chart['Test']
    parts = [
        full_chart[split_flag == 0].head(n_train),
        full_chart[split_flag == 0.5].head(n_valid),
        full_chart[split_flag == 1].tail(n_test),
    ]
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    return pd.concat(parts).reset_index(drop=True)


def create_tiny_df(full_chart):
    '''This function creates a very small df, for debugging purposes:
    350 training observations, 100 validation, 50 test.

    Rows are picked by their 'Test' flag (0 = training, 0.5 = validation,
    1 = test), so a split with fewer rows than requested contributes only the
    rows it has instead of pulling rows from a neighbouring split.
    '''
    return _take_split_rows(full_chart, n_train=350, n_valid=100, n_test=50)


def create_medium_df(full_chart):
    '''This function creates a medium df as our backup option:
    7000 training observations, 2000 validation, 1000 test.

    Rows are picked by their 'Test' flag (0 = training, 0.5 = validation,
    1 = test), so a split with fewer rows than requested contributes only the
    rows it has instead of pulling rows from a neighbouring split.
    '''
    return _take_split_rows(full_chart, n_train=7000, n_valid=2000, n_test=1000)


def reduce_columns(df, version):
    '''Return an independent copy of df holding only the identifier, split flag,
    sentence column of the given version and the two ICD-9 code columns; every
    other column (e.g. the ones used for the analysis above) is left out.'''
    wanted = ['HADM_ID', 'Test', 'Sentences-' + version, 'ICD9_Codes', 'ICD9_Codes_str']
    return df.loc[:, wanted].copy()
    

In [205]:
# For every subset size (Tiny, Med, full, Sub) build one reduced frame per version.
# Only the PlainVocab frame keeps the 'ICD9_Codes' list column; it is dropped from the
# others so the later merge does not produce duplicate code columns.
# The column is dropped by keyword because the positional axis argument of
# DataFrame.drop was removed in pandas 2.0.
Tiny_PlainVocab = reduce_columns(create_tiny_df(Full_PlainVocab), 'PlainVocab')
Tiny_Leven = reduce_columns(create_tiny_df(Full_Leven), 'Leven').drop(columns='ICD9_Codes')
Tiny_Byte5K = reduce_columns(create_tiny_df(Full_Byte5K), 'Byte5K').drop(columns='ICD9_Codes')
Tiny_Byte10K = reduce_columns(create_tiny_df(Full_Byte10K), 'Byte10K').drop(columns='ICD9_Codes')
Tiny_Byte25K = reduce_columns(create_tiny_df(Full_Byte25K), 'Byte25K').drop(columns='ICD9_Codes')
Tiny_Hybrid10K = reduce_columns(create_tiny_df(Full_Hybrid10K), 'Hybrid10K').drop(columns='ICD9_Codes')

Med_PlainVocab = reduce_columns(create_medium_df(Full_PlainVocab), 'PlainVocab')
Med_Leven = reduce_columns(create_medium_df(Full_Leven), 'Leven').drop(columns='ICD9_Codes')
Med_Byte5K = reduce_columns(create_medium_df(Full_Byte5K), 'Byte5K').drop(columns='ICD9_Codes')
Med_Byte10K = reduce_columns(create_medium_df(Full_Byte10K), 'Byte10K').drop(columns='ICD9_Codes')
Med_Byte25K = reduce_columns(create_medium_df(Full_Byte25K), 'Byte25K').drop(columns='ICD9_Codes')
Med_Hybrid10K = reduce_columns(create_medium_df(Full_Hybrid10K), 'Hybrid10K').drop(columns='ICD9_Codes')

F_PlainVocab = reduce_columns(Full_PlainVocab, 'PlainVocab')
F_Leven = reduce_columns(Full_Leven, 'Leven').drop(columns='ICD9_Codes')
F_Byte5K = reduce_columns(Full_Byte5K, 'Byte5K').drop(columns='ICD9_Codes')
F_Byte10K = reduce_columns(Full_Byte10K, 'Byte10K').drop(columns='ICD9_Codes')
F_Byte25K = reduce_columns(Full_Byte25K, 'Byte25K').drop(columns='ICD9_Codes')
F_Hybrid10K = reduce_columns(Full_Hybrid10K, 'Hybrid10K').drop(columns='ICD9_Codes')

Sub_PlainVocab = reduce_columns(create_sub_df(Full_PlainVocab), 'PlainVocab')
Sub_Leven = reduce_columns(create_sub_df(Full_Leven), 'Leven').drop(columns='ICD9_Codes')
Sub_Byte5K = reduce_columns(create_sub_df(Full_Byte5K), 'Byte5K').drop(columns='ICD9_Codes')
Sub_Byte10K = reduce_columns(create_sub_df(Full_Byte10K), 'Byte10K').drop(columns='ICD9_Codes')
Sub_Byte25K = reduce_columns(create_sub_df(Full_Byte25K), 'Byte25K').drop(columns='ICD9_Codes')
Sub_Hybrid10K = reduce_columns(create_sub_df(Full_Hybrid10K), 'Hybrid10K').drop(columns='ICD9_Codes')




'''To make things easier, lets merge all these versions together.'''

core_cols = ['HADM_ID', 'Test', 'ICD9_Codes_str']

# Every subset starts from its PlainVocab frame and is joined, one version at
# a time, with the Leven, Byte5K, Byte10K, Byte25K and Hybrid10K frames on the
# shared key columns.
Tiny = Tiny_PlainVocab
for version_df in (Tiny_Leven, Tiny_Byte5K, Tiny_Byte10K, Tiny_Byte25K, Tiny_Hybrid10K):
    Tiny = Tiny.merge(version_df, on=core_cols)

Med = Med_PlainVocab
for version_df in (Med_Leven, Med_Byte5K, Med_Byte10K, Med_Byte25K, Med_Hybrid10K):
    Med = Med.merge(version_df, on=core_cols)

Full = F_PlainVocab
for version_df in (F_Leven, F_Byte5K, F_Byte10K, F_Byte25K, F_Hybrid10K):
    Full = Full.merge(version_df, on=core_cols)

Sub = Sub_PlainVocab
for version_df in (Sub_Leven, Sub_Byte5K, Sub_Byte10K, Sub_Byte25K, Sub_Hybrid10K):
    Sub = Sub.merge(version_df, on=core_cols)

In [540]:
# NOTE(review): reduce_columns() keeps only the HADM_ID, Test, Sentences-* and ICD9 columns, so
# Sub presumably has no 'Length Sentences' column and this lookup would raise a KeyError. The
# table recorded below looks like a per-'Test' row count instead — TODO confirm what was run.
Sub['Length Sentences']

Unnamed: 0_level_0,HADM_ID,Sentences-PlainVocab,ICD9_Codes,ICD9_Codes_str,Sentences-Leven,Sentences-Byte5K,Sentences-Byte10K,Sentences-Byte25K,Sentences-Hybrid10K
Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,6300,6300,6300,6300,6300,6300,6300,6300,6300
0.5,1800,1800,1800,1800,1800,1800,1800,1800,1800
1.0,887,887,887,887,887,887,887,887,887


In [206]:
#Remove ICD9 codes that appear ONLY in test set. 

def filter_out_test_set_codes(df):
    '''Remove ICD-9 codes that never occur in the training rows.

    Training rows are those with 'Test' == 0; all other rows (validation and
    test) are treated as held-out. Every code that appears only in held-out
    rows is removed from each row's 'ICD9_Codes' list, and 'ICD9_Codes_str' is
    rebuilt from the filtered lists (codes joined by single spaces).

    Returns (new_df, codes_to_delete): a modified copy of df (df itself is not
    changed) and the sorted list of removed codes. A frame without training
    rows or without held-out rows is handled without raising.
    '''
    new_df = df.copy()

    is_train = new_df['Test'] == 0
    training_codes = {code for codes in new_df.loc[is_train, 'ICD9_Codes'] for code in codes}
    heldout_codes = {code for codes in new_df.loc[~is_train, 'ICD9_Codes'] for code in codes}

    codes_to_delete = sorted(heldout_codes - training_codes)
    # Set membership keeps the per-row filtering cheap even with many codes to delete.
    unwanted = set(codes_to_delete)
    new_df['ICD9_Codes'] = new_df['ICD9_Codes'].map(
        lambda codes: [code for code in codes if code not in unwanted])
    new_df['ICD9_Codes_str'] = new_df['ICD9_Codes'].map(' '.join)
    return new_df, codes_to_delete

#------------------------------------------------

# Drop the held-out-only codes from each subset and keep the list of removed codes.
Full_DF, Full_deleted_codes = filter_out_test_set_codes(Full)
Med_DF, Med_deleted_codes = filter_out_test_set_codes(Med)
Tiny_DF, Tiny_deleted_codes = filter_out_test_set_codes(Tiny)
S_DF, Sub_deleted_codes = filter_out_test_set_codes(Sub)

# Report how many codes were removed from each subset.
for deleted_codes, suffix in (
    (Full_deleted_codes, ': codes to delete, full'),
    (Med_deleted_codes, ': codes to delete, med'),
    (Tiny_deleted_codes, ': codes to delete, tiny'),
    (Sub_deleted_codes, ': codes to delete, sub'),
):
    print(str(len(deleted_codes)) + suffix)

34: codes to delete, full
52: codes to delete, med
39: codes to delete, tiny
51: codes to delete, sub


### To-Do:  find number of labels for each version 

In [21]:
def generate_binary_output(chart):
    '''
    Add a multi-hot 'Labels' column built from the space-separated ICD-9 codes.

    Note: index of input df should be "regular" -- np.arange(len(chart))
    @chart is the big dataframe (including summaries and ICD-9 codes); its
    'ICD9_Codes_str' column holds the codes of each summary joined by spaces.

    Returns (new_chart, colnames, array_out):
    - new_chart: chart plus a 'Labels' column where each cell is a list of
      floats, 1.0 if the summary is associated with the code at that position
      and 0.0 otherwise
    - colnames: list of the codes, in the same order as the label positions
    - array_out: float32 array with one row per summary and one column per code

    For example, if the only codes in the universe were 001, 002, 003, 004, and one summary
    was associated with 002 and 004, the binary list for that summary would be [0, 1, 0, 1]

    NOTE: the codes are tokenized by CountVectorizer with its default settings,
    which lowercase the text and ignore single-character tokens.
    '''
    from sklearn.feature_extraction.text import CountVectorizer

    # binary=True caps every entry at 1, so the labels stay 0/1 even if a code is repeated.
    VectorizerCodes = CountVectorizer(binary=True)
    matrix = VectorizerCodes.fit_transform(chart['ICD9_Codes_str'])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
    if hasattr(VectorizerCodes, 'get_feature_names_out'):
        colnames = list(VectorizerCodes.get_feature_names_out())
    else:
        colnames = VectorizerCodes.get_feature_names()
    array_out = np.array(matrix.toarray(), dtype=np.float32)

    # One list per row, aligned with chart through its index.
    labels = pd.Series(array_out.tolist(), index=chart.index, name='Labels')
    new_chart = pd.merge(chart, labels.to_frame(), right_index=True, left_index=True)
    return new_chart, colnames, array_out

In [254]:
# Label every subset. The Med and Tiny results are needed by the cells that
# check, split and save those subsets, so all three calls are active.
Full_FinalDF, FullCodes, f = generate_binary_output(Full_DF)
Med_FinalDF, MedCodes, _ = generate_binary_output(Med_DF)
Tiny_FinalDF, TinyCodes, _ = generate_binary_output(Tiny_DF)

In [256]:
# Shape of the label matrix of the full set: (number of summaries, number of codes).
f.shape

(29683, 937)

In [None]:
# f is the plain NumPy label matrix returned by generate_binary_output (one row
# per summary, one column per code, in FullCodes order). It has no .columns
# attribute and cannot be indexed by code name, so the column totals are
# computed in one step and paired with the code names.
CountPerCode = dict(zip(FullCodes, f.sum(axis=0)))

In [212]:
#Double-checking work
print(Tiny_FinalDF.loc[0,'ICD9_Codes'])
print(TinyCodes.index('401'))
print(Tiny_FinalDF.loc[0,'Labels'][124])

['401', '486', '557', '560', '997']
124
1.0


In [549]:
def find_vocabulary_size(chart, column_name):
    '''
    Count the distinct values found in one column of a pd DataFrame.

    Every cell of chart[column_name] is expected to be a sequence; the cells
    are concatenated end to end and the number of distinct elements of the
    result is returned.
    NOTE(review): for a sentence column whose cells are lists of sentences,
    the concatenation presumably yields sentences rather than single tokens —
    confirm that this is the intended unit.
    '''
    flattened = np.concatenate(chart[column_name].values)
    distinct_values = np.unique(flattened)
    return len(distinct_values)

In [550]:
# NOTE(review): DFx is not defined in the visible part of the notebook — confirm which frame was meant.
find_vocabulary_size(DFx, 'Sentences-Leven')

571

In [213]:
def split_train_valid_test(chart):
    '''Split a df into train, validation and test sets based on its 'Test' column.

    Rows with Test == 0 form the training set, Test == 0.5 the validation set
    and Test == 1 the test set. The bookkeeping columns 'Test', 'HADM_ID',
    'ICD9_Codes' and 'ICD9_Codes_str' are removed from all three parts; the
    original row index is kept.

    Returns (train, validation, test).
    '''
    bookkeeping_cols = ['Test', 'HADM_ID', 'ICD9_Codes', 'ICD9_Codes_str']

    # Columns are dropped by keyword: the positional axis argument of
    # DataFrame.drop was removed in pandas 2.0.
    train = chart[chart['Test']==0].drop(columns=bookkeeping_cols)
    validation = chart[chart['Test']==0.5].drop(columns=bookkeeping_cols)
    test = chart[chart['Test']==1].drop(columns=bookkeeping_cols)

    return train, validation, test

In [214]:
# Split each labelled frame into its train / validation / test parts ('Test' == 0 / 0.5 / 1).
Tiny_Train, Tiny_Validation, Tiny_Test = split_train_valid_test(Tiny_FinalDF)
Med_Train, Med_Validation, Med_Test = split_train_valid_test(Med_FinalDF)
Full_Train, Full_Validation, Full_Test = split_train_valid_test(Full_FinalDF)

In [215]:
# Number of rows (summaries) in the full labelled frame.
len(Full_FinalDF)

29683

---------------------

# Save chart(s) as pkl files

In [221]:
# Write every split of every subset to ./NewDFs/ (an earlier version of this
# cell saved the same frames under ./TextDFs/Tiny, ./TextDFs/Med and ./TextDFs/Full).
for frame, target_path in (
    (Tiny_Train, './NewDFs/TinyTrain'),
    (Tiny_Validation, './NewDFs/TinyValidation'),
    (Tiny_Test, './NewDFs/TinyTest'),
    (Med_Train, './NewDFs/MedTrain'),
    (Med_Validation, './NewDFs/MedValidation'),
    (Med_Test, './NewDFs/MedTest'),
    (Full_Train, './NewDFs/FullTrain'),
    (Full_Validation, './NewDFs/FullValidation'),
    (Full_Test, './NewDFs/FullTest'),
):
    save_as_pkl(frame, target_path)

------------