In [1]:
from collections import defaultdict
import csv

import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from nltk import RegexpTokenizer

import torch
import torch.nn as nn

from utils import label_to_y, word_embeddings, pad_x
import utils

# 1. Data Preprocessing


load tokens with labels

In [2]:
# Read the token/label CSV, give the token column its short name, and keep
# only the four columns the rest of the notebook works with.
d = (
    pd.read_csv("data/tokens_stopwords_removed_with_label_full.csv")
    .rename(columns={"TOKENS_SW_RMED": "TOKENS"})
    [['SUBJECT_ID', 'HADM_ID', 'TOKENS', 'LABEL']]
)

In [4]:
def _to_label_list(value):
    """Return `value` as a list of labels.

    Strings are split on commas, floats are wrapped in a one-element list
    (presumably the NaN of a missing label -- TODO confirm), and anything
    else is passed through unchanged.
    """
    if isinstance(value, str):
        return value.split(',')
    if isinstance(value, float):
        return [value]
    return value


# Both columns hold comma-joined strings in the CSV; turn them into lists.
d['TOKENS'] = d['TOKENS'].str.split(',')
d['LABEL'] = d['LABEL'].apply(_to_label_list)

In [5]:
# Row count of the full frame, before any label filtering.
num_ad = len(d)

Truncate each token list to a maximum length of 2500

In [6]:
# Add the truncated token list (first 2500 tokens of each row) and put the
# columns in the positional order the later cells rely on.
d = d.assign(
    Tokens_Tr=d['TOKENS'].apply(lambda tokens: tokens[:2500])
)[['SUBJECT_ID', 'HADM_ID', 'Tokens_Tr', 'LABEL', 'TOKENS']]

filter top 50 labels

In [7]:
# Tally how many rows carry each label.
count_labels = defaultdict(int)

for row_labels in d['LABEL']:
    for label in row_labels:
        count_labels[label] += 1

# Every label seen at least once.
set_labels = set(count_labels)

# Most frequent first; the stable sort keeps first-seen order among ties.
sorted_labels = sorted(count_labels.items(), key=lambda pair: pair[1], reverse=True)
top_50_labels = [label for label, _count in sorted_labels[:50]]

In [8]:
def filter_label(x):
    """Keep only the labels in `x` that appear in the global `top_50_labels`.

    Args:
        x: iterable of labels for one row.

    Returns:
        The kept labels as a list, in their original order (duplicates are
        preserved), or the boolean ``False`` when none of them is kept.
    """
    kept = [label for label in x if label in top_50_labels]
    return kept if kept else False

In [9]:
# filter_label returns False for rows with no top-50 label; drop those rows.
d['Filtered_Label'] = d['LABEL'].apply(filter_label)
has_top_label = d['Filtered_Label'].ne(False)
filtered_df = d[has_top_label]

Create a dictionary that maps each remaining label to an integer index

In [10]:
num_ad_filtered = len(filtered_df)

# Distinct labels that remain after the top-50 filter.
set_labels = {
    label
    for kept_labels in filtered_df['Filtered_Label']
    for label in kept_labels
}
num_labels = len(set_labels)

In [11]:
# Map each label to a column index. Indices follow sorted label order, not
# set iteration order: string hashes are randomised per interpreter run, so
# iterating the set directly would give a different mapping in every session.
# key=str keeps the sort well-defined even if a non-string label slipped in.
dict_labels = {
    label: index
    for index, label in enumerate(sorted(set_labels, key=str))
}

# 2. Split into train/test/valid

load lists of ids

In [12]:
# Each file is a header-less, single-column list of HADM_IDs for one split.
id_train, id_valid, id_test = (
    pd.read_csv(csv_path, header=None)[0].to_list()
    for csv_path in (
        "data/train_50_hadm_ids.csv",
        "data/dev_50_hadm_ids.csv",
        "data/test_50_hadm_ids.csv",
    )
)

tuple(len(ids) for ids in (id_train, id_valid, id_test))

(8066, 1573, 1729)

In [13]:
# Select the rows of each split by HADM_ID membership.
df_train, df_valid, df_test = (
    filtered_df.loc[filtered_df['HADM_ID'].isin(split_ids)]
    for split_ids in (id_train, id_valid, id_test)
)

# Stack the splits (train, then valid, then test) for the shared vocabulary step.
df_all = pd.concat([df_train, df_valid, df_test])

dict of vocab

In [14]:
# Collect every distinct token from the truncated token lists of all splits.
num_ad = df_all.shape[0]
set_vocab = {token for tokens in df_all['Tokens_Tr'] for token in tokens}

num_vocab = len(set_vocab)
# Number the tokens from 1 in sorted order. Iterating the set directly gives
# an order that changes between interpreter runs (string hashes are
# randomised), which would make token ids -- and every tensor saved from
# them -- differ from one session to the next.
# Id 0 is deliberately left unassigned; a later cell uses it for description
# words that are not in the vocabulary.
dict_vocab = {
    token: index
    for index, token in enumerate(sorted(set_vocab), start=1)
}

map tokens to ids

In [17]:
# Replace each token with its vocabulary id (every token is in dict_vocab,
# since the vocabulary was built from this same column).
df_all['Ids'] = df_all['Tokens_Tr'].apply(
    lambda tokens: list(map(dict_vocab.__getitem__, tokens))
)

save to local file for further use

In [26]:
# df_all.to_csv("data/for_training/data_for_nn.csv")

split into X and y

In [18]:
# Re-derive the three splits from df_all so that they carry the new 'Ids' column.
df_train, df_valid, df_test = (
    df_all.loc[df_all['HADM_ID'].isin(split_ids)]
    for split_ids in (id_train, id_valid, id_test)
)

In [19]:
split_frames = (df_train, df_valid, df_test)

# Inputs: one variable-length list of token ids per row (object array).
X_train, X_valid, X_test = (frame['Ids'].to_numpy() for frame in split_frames)

# Targets: built by the project helper from each frame and the label index map.
y_train, y_valid, y_test = (label_to_y(frame, dict_labels) for frame in split_frames)

In [20]:
# Number of rows in each split.
tuple(split.shape for split in (X_train, X_valid, X_test))

((7061,), (1369,), (1504,))

pad zeros

In [21]:
# Longest truncated token list; bounded above by the 2500-token truncation.
max_tokens = max(map(len, df_all['Tokens_Tr']))

In [22]:
# Pad every split with the project helper, using the same target length.
X_train_pad, X_valid_pad, X_test_pad = (
    pad_x(id_lists, max_tokens) for id_lists in (X_train, X_valid, X_test)
)

transform to input tensors

In [23]:
# Wrap the padded inputs and the targets as torch tensors.
# NOTE: X_* and y_* are rebound here from numpy arrays to tensors.
X_train, X_valid, X_test = map(
    torch.from_numpy, (X_train_pad, X_valid_pad, X_test_pad)
)

y_train, y_valid, y_test = map(
    torch.from_numpy, (y_train, y_valid, y_test)
)

In [40]:
# Persist all six tensors; inputs first, then targets.
for tensor, out_path in (
    (X_train, 'data/for_training/X_train50.pt'),
    (X_valid, 'data/for_training/X_valid50.pt'),
    (X_test, 'data/for_training/X_test50.pt'),
    (y_train, 'data/for_training/y_train50.pt'),
    (y_valid, 'data/for_training/y_valid50.pt'),
    (y_test, 'data/for_training/y_test50.pt'),
):
    torch.save(tensor, out_path)

# 3. Word Embeddings

In [72]:
# Build the embedding matrix from the truncated token lists with the project
# helper (its training details live in utils, not in this notebook).
truncated_tokens = df_all['Tokens_Tr']
wordEmbeddingsMatrix = utils.word_embeddings(truncated_tokens)

# 4. ICD Code Descriptions

In [24]:
def code_map_desc(codes, path='data/ICD_descriptions.txt'):
    """Look up the text description of each requested code.

    Each non-empty line of the file is split on whitespace: the first field
    is the code and the remaining fields, re-joined with single spaces, are
    its description.

    Args:
        codes: iterable of code strings to look up.
        path: location of the descriptions file.

    Returns:
        dict mapping every requested code found in the file to its
        description. Codes that are absent from the file are left out; if a
        code occurs on several lines, the last one wins.
    """
    wanted = set(codes)  # constant-time membership test for every line
    desc_dict = {}
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: there is no code to read.
                continue
            code = fields[0]
            if code in wanted:
                desc_dict[code] = ' '.join(fields[1:])

    return desc_dict

In [25]:
# Fetch the description of every top-50 code, then key the result by the
# code's label index instead of the code itself.
code_descriptions = code_map_desc(top_50_labels)
desc_data = {
    dict_labels[code]: description
    for code, description in code_descriptions.items()
}

In [26]:
# Tokenise each description and map its words to vocabulary ids; words that
# are not in the vocabulary become 0.
# NOTE(review): descriptions are not case-normalised before the lookup; if
# the note tokens are lower-cased, capitalised words will all map to 0 -- confirm.
tokenizer = RegexpTokenizer(r'\d*[a-zA-Z]+\d*')
desc_data = {
    label_index: [dict_vocab.get(word, 0) for word in tokenizer.tokenize(description)]
    for label_index, description in desc_data.items()
}

In [27]:
# Rebuild the dict so that it iterates in ascending label-index order.
sorted_key = sorted(desc_data.keys())
desc_data = dict((label_index, desc_data[label_index]) for label_index in sorted_key)

In [31]:
# Convert the ids to strings so that each row can be space-joined when written out.
desc_data = {
    label_index: list(map(str, word_ids))
    for label_index, word_ids in desc_data.items()
}

save to local

In [33]:
# with open('data/desc_data.csv', 'w') as csv_file:  
#     writer = csv.writer(csv_file)
#     for key, value in desc_data.items():
#         writer.writerow([key, ' '.join(value)])