In [None]:
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec, FastText
#import glove
#from glove import Corpus

import collections
import gc

import warnings
warnings.filterwarnings('ignore')

### Process NER clinical notes

In [None]:
# Load the NER-annotated clinical notes. Later cells read this frame's
# `SUBJECT_ID` and `ner` columns.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by a trusted pipeline.
new_notes = pd.read_pickle("data/ner_df.p")

In [None]:
# Drop, in place, every note whose 'ner' entry has length zero.
ner_is_empty = (new_notes["ner"].map(len) == 0).to_numpy()
null_index_list = new_notes.index[ner_is_empty].tolist()
new_notes.drop(index=null_index_list, inplace=True)

In [None]:
# Process NER data
# Group the extracted items by patient: med7_ner_data maps each SUBJECT_ID to
# one flat list of 2-tuples holding the first two fields of every item found in
# that patient's notes (presumably (entity text, entity label) -- confirm
# against the code that produced data/ner_df.p).
med7_ner_data = {}
for ii in new_notes.itertuples():
    # itertuples() already carries this row's `ner` value, so read it from the
    # tuple. Looking the row up again with .loc is slower and, when index labels
    # repeat, yields several rows at once instead of a single `ner` value.
    new_temp = [(k[0], k[1]) for j in ii.ner for k in j]

    # A patient can have many notes; append to whatever has been collected.
    med7_ner_data.setdefault(ii.SUBJECT_ID, []).extend(new_temp)


In [None]:
# Save processed NER data: the dictionary keyed by SUBJECT_ID whose values are
# the flat lists of 2-tuples taken from the `ner` column.
pd.to_pickle(med7_ner_data, "data/new_ner_word_dict.pkl")

In [None]:
# Parallel lists consumed by the embedding loops below: each dictionary in
# `data_types` is paired (via zip) with the entry of `data_names` that is used
# as the prefix of its output file names.
data_types = [med7_ner_data]
data_names = ["new_ner"]

### Represent medical entities with Word2Vec embedding

In [None]:
# Load the previously saved gensim Word2Vec model. Its keyed vectors
# (`w2vec.wv`) are queried with entity strings in the cells below.
w2vec = Word2Vec.load("embeddings/word2vec.model")

In [None]:
# Calculate mean for vectors
def mean(a):
    """Return the arithmetic mean of the items in the sized sequence ``a``.

    The items are added up starting from 0 and the total is divided by
    ``len(a)``; an empty sequence therefore raises ``ZeroDivisionError``.
    """
    total = 0
    for item in a:
        total = total + item
    count = len(a)
    return total / count

In [None]:
# Process and save Word2Vec embeddings
#
# Every well-formed entity (a 2-tuple whose first field is a string) is turned
# into at most one vector:
#   * the entity's own vector when the whole string is in the vocabulary;
#   * otherwise, for a multi-word entity, the element-wise mean of the vectors
#     of those words that are in the vocabulary;
#   * nothing at all when no vector can be found.
# Patients that end up without any vector are left out of the saved dictionary.

for data, names in zip(data_types, data_names):
    new_word2vec = {}
    print("w2vec starting..")
    for k, v in data.items():
        patient_temp = []
        if isinstance(v, list):
            for i in v:
                # Ignore anything that is not a (str, <second field>) pair.
                if not (isinstance(i, tuple) and len(i) == 2 and isinstance(i[0], str)):
                    continue
                entity = i[0]

                if entity in w2vec.wv:
                    patient_temp.append(w2vec.wv[entity])
                    continue

                words = entity.split(" ")
                if len(words) > 1:
                    word_vecs = [w2vec.wv[w] for w in words if w in w2vec.wv]
                    if word_vecs:
                        # np.mean over axis 0 is the element-wise average; for
                        # floating-point input it keeps the input dtype, so the
                        # averaged vectors match the directly looked-up ones.
                        patient_temp.append(np.mean(word_vecs, axis=0))
        if patient_temp:
            new_word2vec[k] = patient_temp

    print(f"Number of Word2Vec embeddings: {len(new_word2vec)}")
    pd.to_pickle(new_word2vec, "data/"+names+"_word2vec_dict.pkl")

### Represent medical entities with FastText embedding

In [None]:
# Load the previously saved gensim FastText model. Its keyed vectors
# (`fasttext.wv`) are queried with entity strings in the cells below.
fasttext = FastText.load("embeddings/fasttext.model")

In [None]:
# Process and save FastText embeddings
#
# Each entity's first field is looked up in the FastText keyed vectors.
# Entities whose lookup fails are skipped and counted; patients for which no
# lookup succeeds are left out of the saved dictionary.

for data, names in zip(data_types, data_names):
    new_fasttextvec = {}
    skipped = 0
    print("fasttext starting..")

    for k, v in data.items():
        patient_temp = []
        for i in v:
            try:
                patient_temp.append(fasttext.wv[i[0]])
            except (KeyError, TypeError, IndexError):
                # Best effort: no vector for this entity (KeyError) or a
                # malformed entry (TypeError / IndexError). Unlike a bare
                # `except`, this does not swallow KeyboardInterrupt or hide
                # unrelated failures.
                skipped += 1
        if patient_temp:
            new_fasttextvec[k] = patient_temp

    print(f"Number of Fasttext embeddings: {len(new_fasttextvec)}")
    if skipped:
        print(f"Entities skipped without a FastText vector: {skipped}")
    pd.to_pickle(new_fasttextvec, "data/"+names+"_fasttext_dict.pkl")

### Represent medical entities with combined Word2Vec + FastText embedding

In [None]:
# Process and save concatenated embeddings
#
# For every entity the FastText vector and the Word2Vec vector are concatenated
# (FastText first). Unlike the single-model dictionaries, an entity is never
# dropped here: a model that has no vector for it contributes an all-zero block.
# The Word2Vec half falls back to the mean of the in-vocabulary words of a
# multi-word entity before resorting to zeros.

for data, names in zip(data_types, data_names):
    print("combined starting..")
    new_concatvec = {}

    # Size the zero fallbacks from the loaded models instead of assuming 100
    # dimensions, so every concatenated vector has the same length whatever the
    # models were trained with. float32 keeps the fallback from upcasting the
    # result (gensim stores its vectors as float32 -- confirm for these models).
    w2v_zero = np.zeros(w2vec.wv.vector_size, dtype=np.float32)
    fasttext_zero = np.zeros(fasttext.wv.vector_size, dtype=np.float32)

    for k, v in data.items():
        patient_temp = []
        for i in v:
            entity = i[0]

            # Word2Vec half: exact match, else mean of the known words, else zeros.
            if entity in w2vec.wv:
                w2vec_temp = w2vec.wv[entity]
            else:
                # For a single-word entity the only candidate word is the entity
                # itself, which was just found to be missing, so this list is
                # empty and the zero block is used.
                word_vecs = [w2vec.wv[w] for w in entity.split(" ") if w in w2vec.wv]
                w2vec_temp = np.mean(word_vecs, axis=0) if word_vecs else w2v_zero

            # FastText half: the model's vector, or zeros when the lookup fails.
            try:
                fasttemp = fasttext.wv[entity]
            except KeyError:
                fasttemp = fasttext_zero

            patient_temp.append(np.concatenate((fasttemp, w2vec_temp)))
        if patient_temp:
            new_concatvec[k] = patient_temp

    print(f"Number of concatenated embeddings: {len(new_concatvec)}")
    pd.to_pickle(new_concatvec, "data/"+names+"_combined_dict.pkl")

### Standardize all 3 embeddings
Remove key-value pairs from fasttext embeddings and combined embeddings for keys that are not present in word2vec embeddings

In [None]:
# Work on shallow copies so that deleting keys in the next cell leaves the
# dictionaries built by the embedding loops untouched. Only the key -> list
# mapping is copied; the per-patient vector lists themselves are shared.
new_fasttext_dict = new_fasttextvec.copy()
new_word2vec_dict =  new_word2vec.copy()
new_combined_dict = new_concatvec.copy()

In [None]:
# Restrict the FastText and combined dictionaries to patients that also have
# Word2Vec embeddings, then save all three.
#
# The keys to remove are collected from BOTH dictionaries: the combined
# dictionary keeps patients whose entities only produced zero vectors, so it
# can hold keys that the FastText dictionary never had. Deriving the removal
# set from the FastText keys alone would leave those patients in the "limited"
# combined file.
w2v_keys = set(new_word2vec_dict)
diff = (set(new_fasttext_dict) | set(new_combined_dict)) - w2v_keys
for i in diff:
    # pop(..., None): a key may be present in only one of the two dictionaries.
    new_fasttext_dict.pop(i, None)
    new_combined_dict.pop(i, None)

# The three sizes can still differ if a Word2Vec patient is missing from the
# FastText dictionary; printing them makes any such mismatch visible.
print(len(new_word2vec_dict), len(new_fasttext_dict), len(new_combined_dict))

pd.to_pickle(new_word2vec_dict, "data/new_ner_word2vec_limited_dict.pkl")
pd.to_pickle(new_fasttext_dict, "data/new_ner_fasttext_limited_dict.pkl")
pd.to_pickle(new_combined_dict, "data/new_ner_combined_limited_dict.pkl")