In [1]:
import os
import json
import torch
import utils
import numpy as np
from tqdm import tqdm
from collections import Counter

Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.
                 It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe


In [6]:
#Load the cui data
# cuidata, _ = utils.split_cui_present_absent(True)
DATA_PATH = "P:\ORD_Singh_201911038D\Justin\Stroke_Notes_13OCT21"
with open("filtered_sids_to_cuis.json",'r') as infile:
    cuidata = json.load(infile)

In [7]:
#print(list(cuidata)[0]) # print the first document ID
print(cuidata[list(cuidata)[0]][:10]) # show the first 10 cuis of this specific document

['C0008031', 'C1507320', 'C0015031', 'C4718442', 'C2707412', 'C0281822', 'C0281822', 'C0398266', 'C3244243', 'C0917798']


In [8]:
# We will get training pairs. One is the context term the next is the target term.
trainingpairs = []
vocab = dict()
frequency = Counter()
idx = 0

# This could absolutely be accomplished in an alternative / faster way but it's sufficiently fast for now
for doc in tqdm(cuidata):
    cuis = cuidata[doc]
    cuiids = []
    for x in cuis:
        if x not in vocab.keys():
            vocab.update({x:idx})
            idx+=1
        cuiids += [vocab[x]]
    for i in range(len(cuiids)-1):
        pair = cuiids[i:i+2]
        if len(pair) < 2:
            continue
        if pair[0] != pair[1]:
            trainingpairs += [pair] #no self references
            frequency.update(pair)
print(trainingpairs[:3])
print(len(trainingpairs))
print(len(vocab))
assert np.all(np.asarray([x for x in vocab.values()]) == np.arange(len(vocab)))
# We'll use an embedding dimension of 50 to start
EMBEDDING_DIM = 100

100%|██████████████████████████████████████████████████████████████████████████| 29865/29865 [00:25<00:00, 1168.33it/s]

[[0, 1], [1, 2], [2, 3]]
3038627
111303





In [9]:
frequency = np.asarray([values for key,values in frequency.items()])**0.75
frequency /= np.linalg.norm(frequency, ord=1)
frequency = torch.from_numpy(frequency)
samplingids = torch.arange(0, len(frequency))

In [10]:
class CUIEmbeddingModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CUIEmbeddingModel, self).__init__()
        self.dim = embedding_dim
        self.in_embeddings = torch.nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.out_embeddings = torch.nn.Embedding(vocab_size, embedding_dim, sparse=True)
        
    def init(self):
        self.in_embeddings.weight.data.uniform_(-0.5/self.dim, 0.5/self.dim) # scaled by dimensionality to control initial norm.
        self.out_embeddings.weight.data.uniform(-0, 0) # all 0s
        
    def forward(self, inputs, targets, negatives):
        inembed = self.in_embeddings(inputs)
        outembed = self.out_embeddings(targets)
        pos_score = torch.sum(torch.mul(inembed, outembed), dim=1)
        pos_score = torch.nn.functional.logsigmoid(pos_score)
        negembed = self.out_embeddings(negatives)
        neg_score = torch.bmm(negembed, inembed.unsqueeze(2)).squeeze()
        neg_score = torch.nn.functional.logsigmoid(-1*neg_score)
        # skip gram negative sampling
        return -1 * (torch.sum(pos_score)+torch.sum(neg_score))
    

In [11]:
losses = []
model = CUIEmbeddingModel(len(vocab), EMBEDDING_DIM)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
NEGSAMPLES = 2

In [12]:
#Test batching (also gives you an idea of theoretical max throughput in a way)
BATCH_SIZE=2000
for batchidx in tqdm(np.arange(0, len(trainingpairs), BATCH_SIZE), total = len(trainingpairs)//BATCH_SIZE):
    data = trainingpairs[batchidx:batchidx+BATCH_SIZE]
    context_idxs = torch.tensor(data)

1520it [00:01, 901.67it/s]                                                                                             


In [13]:
def nearest_embedding_search(cui):
    #C0948008 = ischemic stroke
    for i in zip(*torch.topk(torch.nn.functional.cosine_similarity(model.in_embeddings.weight.data[vocab[cui]].view(1,-1), 
                                                                   model.in_embeddings.weight.data), 10, largest=True)):
        try:
            print(f'{i[0]:0.3f}\t{cuitranslate[id2vocab[int(i[1])]]}')
        except:
            print("key error", id2vocab[int(i[1])])

In [14]:
id2vocab = {value:key for key,value in vocab.items()}
with open(os.path.join(DATA_PATH, "cuitranslate.json"),'r') as infile:
    cuitranslate = json.load(infile)

In [15]:
print(list(id2vocab.items())[0:10])
print(list(cuitranslate.items())[0:10])
print(cuitranslate["C0000005"])

[(0, 'C0008031'), (1, 'C1507320'), (2, 'C0015031'), (3, 'C4718442'), (4, 'C2707412'), (5, 'C0281822'), (6, 'C0398266'), (7, 'C3244243'), (8, 'C0917798'), (9, 'C0013227')]
[('C0000005', '(131)I-Macroaggregated Albumin'), ('C0000039', '1,2-dipalmitoylphosphatidylcholine'), ('C0000052', '1,4-alpha-Glucan Branching Enzyme'), ('C0000074', '1-Alkyl-2-Acylphosphatidates'), ('C0000084', '1-Carboxyglutamic Acid'), ('C0000096', '1-Methyl-3-isobutylxanthine'), ('C0000097', '1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine'), ('C0000098', '1-Methyl-4-phenylpyridinium'), ('C0000102', '1-Naphthylamine'), ('C0000103', '1-Naphthylisothiocyanate')]
(131)I-Macroaggregated Albumin


In [16]:
trainingpairs = torch.LongTensor(trainingpairs)
for epoch in range(50):
    total_loss = 0
    shuffler = torch.randperm(trainingpairs.shape[0])
    trainingpairs = trainingpairs[shuffler].view(trainingpairs.size())
    with tqdm(np.arange(0, len(trainingpairs), BATCH_SIZE), desc=f'Epoch {epoch+1}', total = len(trainingpairs)//BATCH_SIZE) as progress: #goes one example at a time
        for batchidx in progress:
            data = trainingpairs[batchidx:batchidx+BATCH_SIZE]
            inputs = torch.cat((data[:, 0], data[:,1])) #we'll go bidirectional; usually not done I suppose
            targets = torch.cat((data[:, 1], data[:,0]))#doubles the batch size
            negatives = samplingids[frequency.multinomial(num_samples=inputs.shape[0]*NEGSAMPLES, replacement=True)].reshape(inputs.shape[0], NEGSAMPLES)
            optimizer.zero_grad()
            loss = model.forward(inputs, targets, negatives)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress.set_postfix(loss=f"{loss.item():3.3f}")
        losses.append(total_loss)
        progress.set_postfix(loss=f"{total_loss:3.3f}")
    nearest_embedding_search('C0948008')
    
print(losses)


Epoch 1: 1520it [06:25,  3.94it/s, loss=8109.819]                                                                      


1.000	Ischemic stroke
0.405	lung; absence, acquired (any part)
0.396	IRRIGATION KIT W/PISTON SYRINGE K#68800
0.395	Calcification of pleura
0.391	Malignant neoplasm of anorectum
0.387	Difficulty verbalizing
0.386	Healing pressure ulcer of right heel NOS
0.380	Longitudinal absence of radius AND ulna
0.373	injury of multiple spinal nerve roots
0.372	functional limitations in self-care


Epoch 2: 1520it [06:25,  3.94it/s, loss=7390.028]                                                                      


1.000	Ischemic stroke
0.407	Difficulty verbalizing
0.404	IRRIGATION KIT W/PISTON SYRINGE K#68800
0.404	lung; absence, acquired (any part)
0.382	Ensure Plus Therapeutic Nutrition
0.380	Calcification of pleura
0.380	coronary angiography: coronary artery bypass graft origin left internal mammary artery insertion site visualized proximal left anterior descending
0.380	Utilize standardized abbreviations, acronyms, and symbols
0.375	Nemaline Myopathy 2
0.368	Patchy erythematous esophageal mucosa


Epoch 3: 1520it [06:13,  4.06it/s, loss=6016.769]                                                                      


1.000	Ischemic stroke
0.433	IRRIGATION KIT W/PISTON SYRINGE K#68800
0.407	elongation factor DmS-II
0.405	Nemaline Myopathy 2
0.402	Hypothyroidism
0.392	Acute Cerebrovascular Accidents
0.377	Human Placental Lactogen
0.367	On Campus-Outpatient Hospital
0.367	Low-Calorie Diet
0.366	Anemia in chronic kidney disease


Epoch 4: 1520it [06:25,  3.94it/s, loss=5917.967]                                                                      


1.000	Ischemic stroke
0.547	Acute Cerebrovascular Accidents
0.542	Transient Ischemic Attack
0.525	Rehabilitation therapy
0.522	Cerebrovascular accident
0.515	Hyperlipidemia
0.498	Left hemiparesis
0.491	alteplase
0.489	Imagent
0.485	Facial Paresis


Epoch 5: 1520it [06:24,  3.95it/s, loss=5308.860]                                                                      


1.000	Ischemic stroke
0.604	Acute Cerebrovascular Accidents
0.590	Diabetes Mellitus, Non-Insulin-Dependent
0.556	Imagent
0.554	Cerebrovascular accident
0.546	Coronary Artery Disease
0.538	CAD gene
0.527	decimeter
0.524	X-Ray Computed Tomography
0.524	alteplase


Epoch 6: 1520it [06:27,  3.92it/s, loss=5219.870]                                                                      


1.000	Ischemic stroke
0.733	Acute Cerebrovascular Accidents
0.688	Cerebrovascular accident
0.683	Transient Ischemic Attack
0.680	Left hemiparesis
0.675	Work-up
0.667	Coronary Artery Disease
0.656	Right hemiparesis
0.640	decimeter
0.635	CAD gene


Epoch 7: 1520it [06:19,  4.00it/s, loss=5223.314]                                                                      


1.000	Ischemic stroke
0.770	Acute Cerebrovascular Accidents
0.732	Work-up
0.717	Transient Ischemic Attack
0.705	Left hemiparesis
0.699	CAD gene
0.695	Right hemiparesis
0.686	Rehabilitation therapy
0.680	Coronary Artery Disease
0.677	decimeter


Epoch 8: 1520it [06:16,  4.04it/s, loss=4707.375]                                                                      


1.000	Ischemic stroke
0.763	Left hemiparesis
0.751	Acute Cerebrovascular Accidents
0.745	Work-up
0.738	Acute ischemic stroke subtype
0.718	X-Ray Computed Tomography
0.712	Transient Ischemic Attack
0.711	Atrial Fibrillation
0.706	CANCER WORKUP
0.695	CAD gene


Epoch 9: 1520it [06:26,  3.93it/s, loss=4746.134]                                                                      


1.000	Ischemic stroke
0.807	Acute Cerebrovascular Accidents
0.761	Acute ischemic stroke subtype
0.759	Right hemiparesis
0.757	Cerebrovascular accident
0.749	Transient Ischemic Attack
0.745	Work-up
0.736	Further
0.733	Aphasia
0.729	Carcinoembryonic Antigen


Epoch 10: 1520it [06:02,  4.19it/s, loss=4539.793]                                                                     


1.000	Ischemic stroke
0.856	Acute Cerebrovascular Accidents
0.844	Acute ischemic stroke subtype
0.834	Left hemiparesis
0.821	Right hemiparesis
0.804	Cerebrovascular accident
0.794	neurological weakness of the right or left side
0.789	Carcinoembryonic Antigen
0.785	Evaluation
0.783	Aphasia


Epoch 11: 1520it [06:07,  4.14it/s, loss=4327.889]                                                                     


1.000	Ischemic stroke
0.824	Acute ischemic stroke subtype
0.823	Work-up
0.821	Left hemiparesis
0.820	Aphasia
0.811	Acute Cerebrovascular Accidents
0.801	neurological weakness of the right or left side
0.801	Cerebrovascular accident
0.794	Transient Ischemic Attack
0.789	CANCER WORKUP


Epoch 12: 1520it [06:33,  3.86it/s, loss=4002.604]                                                                     


1.000	Ischemic stroke
0.839	Acute Cerebrovascular Accidents
0.827	Transient Ischemic Attack
0.824	Acute ischemic stroke subtype
0.824	Aphasia
0.823	Cerebrovascular accident
0.819	Right hemiparesis
0.819	Left hemiparesis
0.818	WEAKNESS RESIDUAL
0.814	neurological weakness of the right or left side


Epoch 13: 1520it [06:30,  3.89it/s, loss=4179.090]                                                                     


1.000	Ischemic stroke
0.866	Acute Cerebrovascular Accidents
0.845	neurological weakness of the right or left side
0.840	Transient Ischemic Attack
0.832	alteplase
0.828	Evaluation
0.827	Left hemiparesis
0.824	Acute ischemic stroke subtype
0.822	Aphasia
0.822	Right hemiparesis


Epoch 14: 1520it [06:09,  4.11it/s, loss=4215.413]                                                                     


1.000	Ischemic stroke
0.868	Acute Cerebrovascular Accidents
0.833	Acute ischemic stroke subtype
0.832	neurological weakness of the right or left side
0.827	alteplase
0.827	CANCER WORKUP
0.823	Transient Ischemic Attack
0.820	Left hemiparesis
0.817	WEAKNESS RESIDUAL
0.805	Right hemiparesis


Epoch 15: 1520it [06:20,  4.00it/s, loss=3947.207]                                                                     


1.000	Ischemic stroke
0.879	Acute Cerebrovascular Accidents
0.842	Acute ischemic stroke subtype
0.835	CANCER WORKUP
0.834	alteplase
0.828	neurological weakness of the right or left side
0.825	Bilateral stenosis of carotid arteries
0.825	Right carotid artery stenosis
0.824	Aphasia
0.821	WEAKNESS RESIDUAL


Epoch 16: 1520it [06:23,  3.97it/s, loss=3686.010]                                                                     


1.000	Ischemic stroke
0.875	Acute Cerebrovascular Accidents
0.848	Transient Ischemic Attack
0.844	Acute ischemic stroke subtype
0.842	WEAKNESS RESIDUAL
0.839	neurological weakness of the right or left side
0.838	Left hemiparesis
0.831	Work-up
0.831	Cardioembolic Stroke
0.829	Aphasia


Epoch 17: 1520it [06:32,  3.87it/s, loss=3647.866]                                                                     


1.000	Ischemic stroke
0.902	Acute Cerebrovascular Accidents
0.884	CANCER WORKUP
0.874	Acute ischemic stroke subtype
0.867	Transient Ischemic Attack
0.865	neurological weakness of the right or left side
0.862	Right hemiparesis
0.857	Left hemiparesis
0.856	WEAKNESS RESIDUAL
0.850	Work-up


Epoch 18: 1520it [06:16,  4.04it/s, loss=3782.953]                                                                     


1.000	Ischemic stroke
0.909	Acute Cerebrovascular Accidents
0.883	Acute ischemic stroke subtype
0.874	CANCER WORKUP
0.864	Right hemiparesis
0.862	Left hemiparesis
0.862	Transient Ischemic Attack
0.857	Cardioembolic Stroke
0.851	Evaluation
0.843	Structure of middle cerebral artery


Epoch 19: 1520it [06:11,  4.09it/s, loss=3495.187]                                                                     


1.000	Ischemic stroke
0.892	Acute Cerebrovascular Accidents
0.882	Acute ischemic stroke subtype
0.871	Left hemiparesis
0.860	neurological weakness of the right or left side
0.858	Right hemiparesis
0.854	Cardioembolic Stroke
0.848	CANCER WORKUP
0.845	WEAKNESS RESIDUAL
0.844	Evaluation


Epoch 20: 1520it [09:02,  2.80it/s, loss=3241.369]                                                                     


1.000	Ischemic stroke
0.907	Acute Cerebrovascular Accidents
0.878	Acute ischemic stroke subtype
0.864	neurological weakness of the right or left side
0.858	CANCER WORKUP
0.857	Transient Ischemic Attack
0.851	Left hemiparesis
0.850	WEAKNESS RESIDUAL
0.846	X-Ray Computed Tomography
0.844	alteplase


Epoch 21: 1520it [10:35,  2.39it/s, loss=3292.152]                                                                     


1.000	Ischemic stroke
0.894	Acute Cerebrovascular Accidents
0.885	Acute ischemic stroke subtype
0.875	neurological weakness of the right or left side
0.874	Transient Ischemic Attack
0.868	Left hemiparesis
0.866	Structure of middle cerebral artery
0.861	Right hemiparesis
0.859	WEAKNESS RESIDUAL
0.859	Cardioembolic Stroke


Epoch 22: 1520it [09:16,  2.73it/s, loss=3582.167]                                                                     


1.000	Ischemic stroke
0.898	Acute Cerebrovascular Accidents
0.885	Acute ischemic stroke subtype
0.865	Left hemiparesis
0.865	WEAKNESS RESIDUAL
0.865	Transient Ischemic Attack
0.864	neurological weakness of the right or left side
0.859	Thalamic infarction
0.858	alteplase
0.856	CANCER WORKUP


Epoch 23: 1520it [10:24,  2.44it/s, loss=3289.185]                                                                     


1.000	Ischemic stroke
0.912	Acute Cerebrovascular Accidents
0.885	Acute ischemic stroke subtype
0.871	Left hemiparesis
0.866	Transient Ischemic Attack
0.862	Structure of middle cerebral artery
0.862	WEAKNESS RESIDUAL
0.861	alteplase
0.860	Right hemiparesis
0.858	Cardioembolic Stroke


Epoch 24: 1520it [10:14,  2.47it/s, loss=3369.398]                                                                     


1.000	Ischemic stroke
0.903	Acute Cerebrovascular Accidents
0.884	Acute ischemic stroke subtype
0.884	Left hemiparesis
0.876	CANCER WORKUP
0.871	WEAKNESS RESIDUAL
0.868	Transient Ischemic Attack
0.864	Right hemiparesis
0.859	Structure of middle cerebral artery
0.858	neurological weakness of the right or left side


Epoch 25: 1520it [10:48,  2.35it/s, loss=3195.872]                                                                     


1.000	Ischemic stroke
0.915	Acute Cerebrovascular Accidents
0.909	Acute ischemic stroke subtype
0.892	WEAKNESS RESIDUAL
0.890	Transient Ischemic Attack
0.878	alteplase
0.872	CVA DISTRIBUTION MCA
0.869	Cardioembolic Stroke
0.867	CANCER WORKUP
0.864	Right hemiparesis


Epoch 26: 1520it [11:19,  2.24it/s, loss=3020.890]                                                                     


1.000	Ischemic stroke
0.909	Acute Cerebrovascular Accidents
0.909	Acute ischemic stroke subtype
0.885	WEAKNESS RESIDUAL
0.876	Transient Ischemic Attack
0.869	neurological weakness of the right or left side
0.866	alteplase
0.866	CVA DISTRIBUTION MCA
0.866	CANCER WORKUP
0.866	Work-up


Epoch 27: 1520it [11:17,  2.24it/s, loss=3269.126]                                                                     


1.000	Ischemic stroke
0.912	Acute Cerebrovascular Accidents
0.909	Acute ischemic stroke subtype
0.893	CANCER WORKUP
0.869	Structure of middle cerebral artery
0.867	WEAKNESS RESIDUAL
0.863	alteplase
0.857	Thalamic infarction
0.856	Work-up
0.856	Left hemiparesis


Epoch 28: 1520it [10:47,  2.35it/s, loss=3263.322]                                                                     


1.000	Ischemic stroke
0.909	Acute ischemic stroke subtype
0.909	Acute Cerebrovascular Accidents
0.872	WEAKNESS RESIDUAL
0.871	Structure of middle cerebral artery
0.865	Thalamic infarction
0.864	Right hemiparesis
0.860	Work-up
0.859	Cardioembolic Stroke
0.858	neurological weakness of the right or left side


Epoch 29: 1520it [10:43,  2.36it/s, loss=2941.257]                                                                     


1.000	Ischemic stroke
0.914	Acute Cerebrovascular Accidents
0.912	Acute ischemic stroke subtype
0.871	WEAKNESS RESIDUAL
0.871	Thalamic infarction
0.866	Work-up
0.865	Left hemiparesis
0.864	neurological weakness of the right or left side
0.864	CANCER WORKUP
0.864	Transient Ischemic Attack


Epoch 30: 1520it [10:04,  2.51it/s, loss=2999.191]                                                                     


1.000	Ischemic stroke
0.916	Acute ischemic stroke subtype
0.914	Acute Cerebrovascular Accidents
0.884	WEAKNESS RESIDUAL
0.881	CANCER WORKUP
0.875	Right hemiparesis
0.873	neurological weakness of the right or left side
0.872	CVA DISTRIBUTION MCA
0.871	Left hemiparesis
0.870	Transient Ischemic Attack


Epoch 31: 1520it [10:36,  2.39it/s, loss=3069.653]                                                                     


1.000	Ischemic stroke
0.917	Acute Cerebrovascular Accidents
0.916	Acute ischemic stroke subtype
0.882	CVA DISTRIBUTION MCA
0.881	Right sided cerebral hemisphere cerebrovascular accident
0.880	Transient Ischemic Attack
0.879	neurological weakness of the right or left side
0.876	WEAKNESS RESIDUAL
0.870	Right hemiparesis
0.869	Cerebrovascular accident


Epoch 32: 1520it [10:40,  2.37it/s, loss=2956.941]                                                                     


1.000	Ischemic stroke
0.925	Acute Cerebrovascular Accidents
0.914	Acute ischemic stroke subtype
0.893	CVA DISTRIBUTION MCA
0.888	neurological weakness of the right or left side
0.888	Transient Ischemic Attack
0.881	WEAKNESS RESIDUAL
0.880	Left hemiparesis
0.878	Right sided cerebral hemisphere cerebrovascular accident
0.876	Thalamic infarction


Epoch 33: 1520it [10:21,  2.44it/s, loss=2839.516]                                                                     


1.000	Ischemic stroke
0.932	Acute Cerebrovascular Accidents
0.917	Acute ischemic stroke subtype
0.894	CVA DISTRIBUTION MCA
0.892	Right sided cerebral hemisphere cerebrovascular accident
0.889	Left hemiparesis
0.888	WEAKNESS RESIDUAL
0.884	CANCER WORKUP
0.881	Thalamic infarction
0.881	neurological weakness of the right or left side


Epoch 34: 1520it [10:36,  2.39it/s, loss=2952.187]                                                                     


1.000	Ischemic stroke
0.927	Acute Cerebrovascular Accidents
0.905	Acute ischemic stroke subtype
0.885	Right sided cerebral hemisphere cerebrovascular accident
0.883	Evaluation
0.882	Transient Ischemic Attack
0.882	Thalamic infarction
0.881	CVA DISTRIBUTION MCA
0.879	CANCER WORKUP
0.878	Cardioembolic Stroke


Epoch 35: 1520it [10:54,  2.32it/s, loss=2896.384]                                                                     


1.000	Ischemic stroke
0.926	Acute Cerebrovascular Accidents
0.910	Acute ischemic stroke subtype
0.894	CVA DISTRIBUTION MCA
0.890	neurological weakness of the right or left side
0.890	CANCER WORKUP
0.889	WEAKNESS RESIDUAL
0.887	Right hemiparesis
0.886	Right sided cerebral hemisphere cerebrovascular accident
0.885	Left hemiparesis


Epoch 36: 1520it [10:13,  2.48it/s, loss=2968.560]                                                                     


1.000	Ischemic stroke
0.923	Acute Cerebrovascular Accidents
0.909	Acute ischemic stroke subtype
0.889	Left hemiparesis
0.888	neurological weakness of the right or left side
0.885	CANCER WORKUP
0.884	Right sided cerebral hemisphere cerebrovascular accident
0.883	CVA DISTRIBUTION MCA
0.882	WEAKNESS RESIDUAL
0.880	Transient Ischemic Attack


Epoch 37: 1520it [10:35,  2.39it/s, loss=2694.494]                                                                     


1.000	Ischemic stroke
0.928	Acute Cerebrovascular Accidents
0.918	Acute ischemic stroke subtype
0.893	Right sided cerebral hemisphere cerebrovascular accident
0.891	Thalamic infarction
0.890	CVA DISTRIBUTION MCA
0.887	Transient Ischemic Attack
0.883	WEAKNESS RESIDUAL
0.871	Cardioembolic Stroke
0.871	Cerebrovascular accident


Epoch 38: 1520it [11:08,  2.27it/s, loss=2839.959]                                                                     


1.000	Ischemic stroke
0.926	Acute Cerebrovascular Accidents
0.923	Acute ischemic stroke subtype
0.891	Right sided cerebral hemisphere cerebrovascular accident
0.886	WEAKNESS RESIDUAL
0.886	CVA DISTRIBUTION MCA
0.886	Thalamic infarction
0.880	Transient Ischemic Attack
0.880	CANCER WORKUP
0.875	Left hemiparesis


Epoch 39: 1520it [10:05,  2.51it/s, loss=2591.413]                                                                     


1.000	Ischemic stroke
0.924	Acute Cerebrovascular Accidents
0.921	Acute ischemic stroke subtype
0.885	WEAKNESS RESIDUAL
0.881	Cerebrovascular accident
0.881	CVA DISTRIBUTION MCA
0.881	CANCER WORKUP
0.880	Right sided cerebral hemisphere cerebrovascular accident
0.880	Transient Ischemic Attack
0.876	Thalamic infarction


Epoch 40: 1520it [10:20,  2.45it/s, loss=2811.326]                                                                     


1.000	Ischemic stroke
0.931	Acute Cerebrovascular Accidents
0.917	Acute ischemic stroke subtype
0.891	Transient Ischemic Attack
0.888	WEAKNESS RESIDUAL
0.885	Right sided cerebral hemisphere cerebrovascular accident
0.879	CVA DISTRIBUTION MCA
0.876	Thalamic infarction
0.871	neurological weakness of the right or left side
0.871	Cerebellar stroke


Epoch 41: 1520it [10:53,  2.32it/s, loss=2591.206]                                                                     


1.000	Ischemic stroke
0.930	Acute Cerebrovascular Accidents
0.921	Acute ischemic stroke subtype
0.893	Right sided cerebral hemisphere cerebrovascular accident
0.891	Transient Ischemic Attack
0.891	CVA DISTRIBUTION MCA
0.890	CANCER WORKUP
0.886	WEAKNESS RESIDUAL
0.881	neurological weakness of the right or left side
0.880	Thalamic infarction


Epoch 42: 1520it [10:42,  2.36it/s, loss=2776.705]                                                                     


1.000	Ischemic stroke
0.932	Acute Cerebrovascular Accidents
0.927	Acute ischemic stroke subtype
0.892	Right sided cerebral hemisphere cerebrovascular accident
0.888	CVA DISTRIBUTION MCA
0.886	Transient Ischemic Attack
0.883	neurological weakness of the right or left side
0.883	WEAKNESS RESIDUAL
0.880	Left hemiparesis
0.878	Cerebellar stroke


Epoch 43: 1520it [09:55,  2.55it/s, loss=2614.458]                                                                     


1.000	Ischemic stroke
0.933	Acute Cerebrovascular Accidents
0.923	Acute ischemic stroke subtype
0.895	Transient Ischemic Attack
0.894	Right sided cerebral hemisphere cerebrovascular accident
0.890	WEAKNESS RESIDUAL
0.888	Left hemiparesis
0.887	CVA DISTRIBUTION MCA
0.886	neurological weakness of the right or left side
0.881	Thalamic infarction


Epoch 44: 1520it [10:20,  2.45it/s, loss=2675.793]                                                                     


1.000	Ischemic stroke
0.935	Acute Cerebrovascular Accidents
0.924	Acute ischemic stroke subtype
0.895	Right sided cerebral hemisphere cerebrovascular accident
0.891	Transient Ischemic Attack
0.890	neurological weakness of the right or left side
0.885	WEAKNESS RESIDUAL
0.883	Cerebrovascular accident
0.882	Cerebellar stroke
0.882	CVA DISTRIBUTION MCA


Epoch 45: 1520it [09:35,  2.64it/s, loss=2692.905]                                                                     


1.000	Ischemic stroke
0.939	Acute Cerebrovascular Accidents
0.934	Acute ischemic stroke subtype
0.909	Right sided cerebral hemisphere cerebrovascular accident
0.898	CVA DISTRIBUTION MCA
0.891	Transient Ischemic Attack
0.888	neurological weakness of the right or left side
0.888	Cerebellar stroke
0.888	CANCER WORKUP
0.886	Left hemiparesis


Epoch 46: 1520it [10:00,  2.53it/s, loss=2646.308]                                                                     


1.000	Ischemic stroke
0.936	Acute Cerebrovascular Accidents
0.926	Acute ischemic stroke subtype
0.897	neurological weakness of the right or left side
0.895	CVA DISTRIBUTION MCA
0.894	Left hemiparesis
0.892	Right sided cerebral hemisphere cerebrovascular accident
0.886	WEAKNESS RESIDUAL
0.885	Cerebellar stroke
0.880	Right hemiparesis


Epoch 47: 1520it [11:03,  2.29it/s, loss=2600.392]                                                                     


1.000	Ischemic stroke
0.936	Acute Cerebrovascular Accidents
0.914	Acute ischemic stroke subtype
0.894	Transient Ischemic Attack
0.892	Right sided cerebral hemisphere cerebrovascular accident
0.891	neurological weakness of the right or left side
0.887	CVA DISTRIBUTION MCA
0.886	WEAKNESS RESIDUAL
0.878	Left hemiparesis
0.876	Right hemiparesis


Epoch 48: 1520it [10:43,  2.36it/s, loss=2501.075]                                                                     


1.000	Ischemic stroke
0.928	Acute Cerebrovascular Accidents
0.914	Acute ischemic stroke subtype
0.886	Right sided cerebral hemisphere cerebrovascular accident
0.886	CANCER WORKUP
0.884	neurological weakness of the right or left side
0.884	WEAKNESS RESIDUAL
0.883	CVA DISTRIBUTION MCA
0.881	Transient Ischemic Attack
0.877	Evaluation


Epoch 49: 1520it [10:23,  2.44it/s, loss=2456.504]                                                                     


1.000	Ischemic stroke
0.929	Acute Cerebrovascular Accidents
0.920	Acute ischemic stroke subtype
0.894	Right sided cerebral hemisphere cerebrovascular accident
0.890	CVA DISTRIBUTION MCA
0.889	WEAKNESS RESIDUAL
0.885	Transient Ischemic Attack
0.883	Cerebellar stroke
0.881	CANCER WORKUP
0.879	neurological weakness of the right or left side


Epoch 50: 1520it [10:55,  2.32it/s, loss=2475.188]                                                                     


1.000	Ischemic stroke
0.931	Acute Cerebrovascular Accidents
0.916	Acute ischemic stroke subtype
0.896	Right sided cerebral hemisphere cerebrovascular accident
0.888	WEAKNESS RESIDUAL
0.887	Right hemiparesis
0.887	Cerebellar stroke
0.887	Transient Ischemic Attack
0.884	Thalamic infarction
0.881	neurological weakness of the right or left side
[50915870.98339844, 36796070.26855469, 31732864.60107422, 28845594.072265625, 26832928.893554688, 25347657.813476562, 24173422.64794922, 23201841.671875, 22371291.514648438, 21653365.00048828, 21013578.751953125, 20448018.418945312, 19910806.518554688, 19436181.10986328, 18981112.2734375, 18577459.768798828, 18193587.456054688, 17817602.975341797, 17505028.23779297, 17173479.548583984, 16884315.399169922, 16590695.16772461, 16326634.245361328, 16075272.467041016, 15836793.860839844, 15610719.074951172, 15379732.203613281, 15179499.887207031, 14978053.685546875, 14792911.081054688, 14611129.380126953, 14435224.587402344, 14270305.335449219, 14098318.

In [17]:
torch.save(model.state_dict(), 'cui_model_11_MAR_2022.pt')

In [18]:
for _, i in enumerate(losses):
    if losses[_-1] < i: #if loss is always monitonically decreasing, should print 0 only
        print(_)

0


In [19]:
with open("cui_vectors_11_MAR_2022.json",'w') as outfile:
    json.dump({key:model.in_embeddings.weight.data[value].tolist() for key,value in vocab.items()}, outfile)

In [15]:
# ToDo: Build document vectors from CUI vector components
docvectors = np.zeros((len(cuidata), EMBEDDING_DIM))
for i,doc in tqdm(enumerate(cuidata), total=len(cuidata)):
    for cui in cuidata[doc]['cuis']:
        docvectors[i] += model.in_embeddings.weight.data[vocab[cui]].numpy()


100%|███████████████████████████████████| 29865/29865 [01:16<00:00, 391.28it/s]


In [21]:
# ToDo: Build document vectors from CUI vector components
docvectors = np.zeros((len(cuidata), EMBEDDING_DIM))
for i,doc in tqdm(enumerate(cuidata), total=len(cuidata)):
    tmpvecs = np.zeros(EMBEDDING_DIM)
    for cui in cuidata[doc]['cuis']:
        tmpvecs += model.in_embeddings.weight.data[vocab[cui]].numpy()
    docvectors[i] = tmpvecs / len(cuidata[doc]['cuis'])

100%|███████████████████████████████████| 29865/29865 [01:07<00:00, 439.69it/s]


In [22]:
with open('average_document_vectors_28_FEB_2022.json','w') as outfile:
    json.dump({x:list(y) for x,y in zip(cuidata.keys(), docvectors)}, outfile)

In [27]:
'''
To load the saved pytorch model:

model = CUIEmbeddingModel(len(vocab), EMBEDDING_DIM) #vocab and embedding dim must match what the model was trained on
model.load_state_dict(torch.load(cui_model_15_NOV_2021.pt))
model.eval() #set to evaluation mode
''';

In [32]:
nearest_embedding_search('C0742946')

1.000	CVA ETIOLOGY HEMORRHAGIC ISCHEMIC
0.847	Acute Cerebrovascular Accidents
0.836	Ischemic stroke
0.829	Left hemiparesis
0.829	Structure of middle cerebral artery
0.822	Thalamic infarction
0.821	Transient Ischemic Attack
0.820	Evaluation
0.815	Acute ischemic stroke subtype
0.815	transient ischemic attack without residual deficits


In [33]:
import xgboost