In [1]:
import os
import json
import torch
import utils
import numpy as np
from tqdm import tqdm
from collections import Counter

In [2]:
#Load the cui data
# cuidata, _ = utils.split_cui_present_absent(True)
DATA_PATH = "P:\ORD_Singh_201911038D\Justin\Stroke_Notes_13OCT21"
with open("sids_to_present_cuis.json",'r') as infile:
    cuidata = json.load(infile)

In [3]:
#print(list(cuidata)[0]) # print the first document ID
print(cuidata[list(cuidata)[0]].keys()) # Just to show the data fields in each document
print(cuidata[list(cuidata)[0]]['cuis'][:10]) # show the first 10 cuis of this specific document

dict_keys(['cuis', 'semtyp'])
['C0008031', 'C0015031', 'C4718442', 'C2707412', 'C0398266', 'C3244243', 'C0917798', 'C0013227', 'C4715678', 'C4715678']


In [4]:
# We will get training pairs. One is the context term the next is the target term.
trainingpairs = []
vocab = dict()
frequency = Counter()
idx = 0

# This could absolutely be accomplished in an alternative / faster way but it's sufficiently fast for now
for doc in tqdm(cuidata):
    cuis = cuidata[doc]['cuis']
    cuiids = []
    for x in cuis:
        if x not in vocab.keys():
            vocab.update({x:idx})
            idx+=1
        cuiids += [vocab[x]]
    for i in range(len(cuiids)-1):
        pair = cuiids[i:i+2]
        if len(pair) < 2:
            continue
        if pair[0] != pair[1]:
            trainingpairs += [pair] #no self references
            frequency.update(pair)
print(trainingpairs[:3])
print(len(trainingpairs))
print(len(vocab))
assert np.all(np.asarray([x for x in vocab.values()]) == np.arange(len(vocab)))
# We'll use an embedding dimension of 50 to start
EMBEDDING_DIM = 100

100%|███████████████████████████████████████████████████████████████████████████| 29865/29865 [00:30<00:00, 986.56it/s]

[[0, 1], [1, 2], [2, 3]]
2879508
101991





In [5]:
frequency = np.asarray([values for key,values in frequency.items()])**0.75
frequency /= np.linalg.norm(frequency, ord=1)
frequency = torch.from_numpy(frequency)
samplingids = torch.arange(0, len(frequency))

In [6]:
class CUIEmbeddingModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CUIEmbeddingModel, self).__init__()
        self.dim = embedding_dim
        self.in_embeddings = torch.nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.out_embeddings = torch.nn.Embedding(vocab_size, embedding_dim, sparse=True)
        
    def init(self):
        self.in_embeddings.weight.data.uniform_(-0.5/self.dim, 0.5/self.dim) # scaled by dimensionality to control initial norm.
        self.out_embeddings.weight.data.uniform(-0, 0) # all 0s
        
    def forward(self, inputs, targets, negatives):
        inembed = self.in_embeddings(inputs)
        outembed = self.out_embeddings(targets)
        pos_score = torch.sum(torch.mul(inembed, outembed), dim=1)
        pos_score = torch.nn.functional.logsigmoid(pos_score)
        negembed = self.out_embeddings(negatives)
        neg_score = torch.bmm(negembed, inembed.unsqueeze(2)).squeeze()
        neg_score = torch.nn.functional.logsigmoid(-1*neg_score)
        # skip gram negative sampling
        return -1 * (torch.sum(pos_score)+torch.sum(neg_score))
    

In [7]:
losses = []
model = CUIEmbeddingModel(len(vocab), EMBEDDING_DIM)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
NEGSAMPLES = 2

In [8]:
#Test batching (also gives you an idea of theoretical max throughput in a way)
BATCH_SIZE=2000
for batchidx in tqdm(np.arange(0, len(trainingpairs), BATCH_SIZE), total = len(trainingpairs)//BATCH_SIZE):
    data = trainingpairs[batchidx:batchidx+BATCH_SIZE]
    context_idxs = torch.tensor(data)

1440it [00:01, 769.57it/s]                                                                                             


In [17]:
def nearest_embedding_search(cui):
    #C0948008 = ischemic stroke
    for i in zip(*torch.topk(torch.nn.functional.cosine_similarity(model.in_embeddings.weight.data[vocab[cui]].view(1,-1), 
                                                                   model.in_embeddings.weight.data), 10, largest=True)):
        try:
            print(f'{i[0]:0.3f}\t{cuitranslate[id2vocab[int(i[1])]]}')
        except:
            print("key error", id2vocab[int(i[1])])

In [10]:
id2vocab = {value:key for key,value in vocab.items()}
with open(os.path.join(DATA_PATH, "cuitranslate.json"),'r') as infile:
    cuitranslate = json.load(infile)

In [11]:
print(list(id2vocab.items())[0:10])
print(list(cuitranslate.items())[0:10])
print(cuitranslate["C0000005"])

[(0, 'C0008031'), (1, 'C0015031'), (2, 'C4718442'), (3, 'C2707412'), (4, 'C0398266'), (5, 'C3244243'), (6, 'C0917798'), (7, 'C0013227'), (8, 'C4715678'), (9, 'C0014806')]
[('C0000005', '(131)I-Macroaggregated Albumin'), ('C0000039', '1,2-dipalmitoylphosphatidylcholine'), ('C0000052', '1,4-alpha-Glucan Branching Enzyme'), ('C0000074', '1-Alkyl-2-Acylphosphatidates'), ('C0000084', '1-Carboxyglutamic Acid'), ('C0000096', '1-Methyl-3-isobutylxanthine'), ('C0000097', '1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine'), ('C0000098', '1-Methyl-4-phenylpyridinium'), ('C0000102', '1-Naphthylamine'), ('C0000103', '1-Naphthylisothiocyanate')]
(131)I-Macroaggregated Albumin


In [18]:
trainingpairs = torch.LongTensor(trainingpairs)
for epoch in range(50):
    total_loss = 0
    shuffler = torch.randperm(trainingpairs.shape[0])
    trainingpairs = trainingpairs[shuffler].view(trainingpairs.size())
    with tqdm(np.arange(0, len(trainingpairs), BATCH_SIZE), desc=f'Epoch {epoch+1}', total = len(trainingpairs)//BATCH_SIZE) as progress: #goes one example at a time
        for batchidx in progress:
            data = trainingpairs[batchidx:batchidx+BATCH_SIZE]
            inputs = torch.cat((data[:, 0], data[:,1])) #we'll go bidirectional; usually not done I suppose
            targets = torch.cat((data[:, 1], data[:,0]))#doubles the batch size
            negatives = samplingids[frequency.multinomial(num_samples=inputs.shape[0]*NEGSAMPLES, replacement=True)].reshape(inputs.shape[0], NEGSAMPLES)
            optimizer.zero_grad()
            loss = model.forward(inputs, targets, negatives)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress.set_postfix(loss=f"{loss.item():3.3f}")
        losses.append(total_loss)
        progress.set_postfix(loss=f"{total_loss:3.3f}")
    nearest_embedding_search('C0948008')
    
print(losses)


Epoch 1: 1440it [07:18,  3.28it/s, loss=16465.070]                                                                     


1.000	Ischemic stroke
0.390	CANCER WORKUP
0.389	Pancrease
0.388	Non-Palpable
key error microtuition
0.382	Fracture of bone of face
0.381	Lymphadenopathy
0.377	Intravascular ultrasound of artery
0.377	Spinal canal stenosis
0.376	PreNatal Vitamins Plus Tablet


Epoch 2: 1440it [10:13,  2.35it/s, loss=14865.372]                                                                     


1.000	Ischemic stroke
0.497	Acute Cerebrovascular Accidents
0.431	work restrictions decision pending further evaluation
0.422	Poor judgement
0.411	alteplase
0.399	histidine
0.395	Dysarthria
0.393	Plateau
0.390	PreNatal Vitamins Plus Tablet
0.382	Orifice area:Area:Pt:Aortic valve:Qn:US.continuity.VTI+Area


Epoch 3: 1440it [06:51,  3.50it/s, loss=13869.231]                                                                     


1.000	Ischemic stroke
0.633	Acute Cerebrovascular Accidents
0.588	Work-up
0.584	MRAS gene
0.579	Transient Ischemic Attack
0.566	carotid ultrasound doppler
0.563	Cancer/Testis Antigen
0.561	Cerebrovascular accident
0.530	CANCER WORKUP
0.527	Evaluation


Epoch 4: 1440it [06:54,  3.48it/s, loss=12988.998]                                                                     


1.000	Ischemic stroke
0.660	Acute Cerebrovascular Accidents
0.645	decimeter
0.630	Dysarthria
0.623	Atrial Fibrillation
0.621	Peripheral Vascular Diseases
0.617	Cerebrovascular accident
0.616	MRAS gene
0.614	Hyperlipidemia
0.614	Non-ST Elevation Myocardial Infarction by ECG Finding


Epoch 5: 1440it [06:47,  3.53it/s, loss=12217.305]                                                                     


1.000	Ischemic stroke
0.784	Acute Cerebrovascular Accidents
0.723	Evaluation
0.717	X-Ray Computed Tomography
0.702	CANCER WORKUP
0.698	MRAS gene
0.694	Transient Ischemic Attack
0.688	alteplase
0.685	Multisection:Find:Pt:Head>Head vessels & Neck>Neck vessels:Doc:MR.angio
0.675	Peripheral Vascular Diseases


Epoch 6: 1440it [06:35,  3.64it/s, loss=12158.214]                                                                     


1.000	Ischemic stroke
0.782	Acute Cerebrovascular Accidents
0.744	Magnetic Resonance Imaging
0.735	Work-up
0.724	MRAS gene
0.721	Scientific Study
0.715	Transient Ischemic Attack
0.713	Telemetry
0.707	Multisection:Find:Pt:Head>Head vessels & Neck>Neck vessels:Doc:MR.angio
0.707	CANCER WORKUP


Epoch 7: 1440it [06:50,  3.51it/s, loss=11360.330]                                                                     


1.000	Ischemic stroke
0.798	Acute Cerebrovascular Accidents
0.773	Work-up
0.770	Left hemiparesis
0.769	Aphasia
0.767	X-Ray Computed Tomography
0.764	Peripheral Vascular Diseases
0.760	CAD gene
0.759	Evaluation
0.758	Transient Ischemic Attack


Epoch 8: 1440it [06:40,  3.59it/s, loss=11411.921]                                                                     


1.000	Ischemic stroke
0.843	Acute Cerebrovascular Accidents
0.829	CANCER WORKUP
0.818	Work-up
0.811	Aphasia
0.807	Cerebrovascular accident
0.803	Evaluation
0.798	Peripheral Vascular Diseases
0.797	Right hemiparesis
0.790	X-Ray Computed Tomography


Epoch 9: 1440it [06:30,  3.69it/s, loss=10485.597]                                                                     


1.000	Ischemic stroke
0.868	Acute Cerebrovascular Accidents
0.843	Work-up
0.824	Acute ischemic stroke subtype
0.821	CANCER WORKUP
0.807	Aphasia
0.802	Cardioembolic Stroke
0.800	Right hemiparesis
0.800	Transient Ischemic Attack
0.796	WEAKNESS RESIDUAL


Epoch 10: 1440it [06:26,  3.73it/s, loss=10201.874]                                                                    


1.000	Ischemic stroke
0.872	Acute Cerebrovascular Accidents
0.858	CANCER WORKUP
0.852	Acute ischemic stroke subtype
0.829	WEAKNESS RESIDUAL
0.826	Transient Ischemic Attack
0.824	Right hemiparesis
0.823	Peripheral Vascular Diseases
0.822	Cardioembolic Stroke
0.816	Aphasia


Epoch 11: 1440it [06:36,  3.63it/s, loss=10062.685]                                                                    


1.000	Ischemic stroke
0.879	Acute ischemic stroke subtype
0.878	CANCER WORKUP
0.868	Acute Cerebrovascular Accidents
0.861	Work-up
0.853	Transient Ischemic Attack
0.845	Cardioembolic Stroke
0.844	WEAKNESS RESIDUAL
0.841	Thalamic infarction
0.838	Magnetic Resonance Imaging


Epoch 12: 1440it [06:31,  3.68it/s, loss=9667.156]                                                                     


1.000	Ischemic stroke
0.873	CANCER WORKUP
0.865	Acute ischemic stroke subtype
0.862	Acute Cerebrovascular Accidents
0.861	Transient Ischemic Attack
0.855	WEAKNESS RESIDUAL
0.850	Left hemiparesis
0.850	Right hemiparesis
0.844	Thalamic infarction
0.843	Work-up


Epoch 13: 1440it [06:33,  3.66it/s, loss=9269.709]                                                                     


1.000	Ischemic stroke
0.890	Acute Cerebrovascular Accidents
0.879	CANCER WORKUP
0.878	Acute ischemic stroke subtype
0.869	Work-up
0.866	Thalamic infarction
0.862	Cardioembolic Stroke
0.856	Transient Ischemic Attack
0.855	Evaluation
0.847	Aphasia


Epoch 14: 1440it [06:27,  3.71it/s, loss=9141.898]                                                                     


1.000	Ischemic stroke
0.902	Acute Cerebrovascular Accidents
0.893	Acute ischemic stroke subtype
0.885	CANCER WORKUP
0.876	Thalamic infarction
0.869	Work-up
0.861	Cardioembolic Stroke
0.850	WEAKNESS RESIDUAL
0.850	Right hemiparesis
0.850	neurological weakness of the right or left side


Epoch 15: 1440it [06:22,  3.76it/s, loss=9404.676]                                                                     


1.000	Ischemic stroke
0.908	Acute Cerebrovascular Accidents
0.886	Work-up
0.882	Thalamic infarction
0.878	Acute ischemic stroke subtype
0.874	Cardioembolic Stroke
0.870	CANCER WORKUP
0.867	Right hemiparesis
0.862	WEAKNESS RESIDUAL
0.858	Left hemiparesis


Epoch 16: 1440it [06:27,  3.72it/s, loss=9051.633]                                                                     


1.000	Ischemic stroke
0.934	Acute Cerebrovascular Accidents
0.888	WEAKNESS RESIDUAL
0.884	Thalamic infarction
0.879	CANCER WORKUP
0.878	Acute ischemic stroke subtype
0.878	Transient Ischemic Attack
0.868	Right hemiparesis
0.867	Work-up
0.865	neurological weakness of the right or left side


Epoch 17: 1440it [06:28,  3.71it/s, loss=8715.689]                                                                     


1.000	Ischemic stroke
0.920	Acute Cerebrovascular Accidents
0.902	Work-up
0.899	WEAKNESS RESIDUAL
0.897	Acute ischemic stroke subtype
0.888	Transient Ischemic Attack
0.885	Right hemiparesis
0.881	CANCER WORKUP
0.880	Left hemiparesis
0.875	Thalamic infarction


Epoch 18: 1440it [06:47,  3.54it/s, loss=8687.594]                                                                     


1.000	Ischemic stroke
0.922	Acute Cerebrovascular Accidents
0.906	Transient Ischemic Attack
0.906	Acute ischemic stroke subtype
0.895	Right hemiparesis
0.890	Evaluation
0.885	WEAKNESS RESIDUAL
0.881	Thalamic infarction
0.879	Work-up
0.877	CANCER WORKUP


Epoch 19: 1440it [06:22,  3.77it/s, loss=8479.099]                                                                     


1.000	Ischemic stroke
0.929	Acute Cerebrovascular Accidents
0.909	Acute ischemic stroke subtype
0.886	Work-up
0.885	Cardioembolic Stroke
0.885	CANCER WORKUP
0.883	Transient Ischemic Attack
0.876	WEAKNESS RESIDUAL
0.875	Thalamic infarction
0.868	Right hemiparesis


Epoch 20: 1440it [06:43,  3.57it/s, loss=8428.599]                                                                     


1.000	Ischemic stroke
0.915	Acute Cerebrovascular Accidents
0.907	Work-up
0.898	Acute ischemic stroke subtype
0.897	Transient Ischemic Attack
0.884	CANCER WORKUP
0.883	Cardioembolic Stroke
0.881	WEAKNESS RESIDUAL
0.877	Thalamic infarction
0.874	Right hemiparesis


Epoch 21: 1440it [06:37,  3.62it/s, loss=8275.552]                                                                     


1.000	Ischemic stroke
0.923	Acute Cerebrovascular Accidents
0.893	Acute ischemic stroke subtype
0.891	Transient Ischemic Attack
0.883	WEAKNESS RESIDUAL
0.883	Thalamic infarction
0.882	Work-up
0.882	Cardioembolic Stroke
0.882	Right hemiparesis
0.874	Left hemiparesis


Epoch 22: 1440it [07:11,  3.33it/s, loss=8076.372]                                                                     


1.000	Ischemic stroke
0.916	Acute Cerebrovascular Accidents
0.898	Acute ischemic stroke subtype
0.895	WEAKNESS RESIDUAL
0.893	CANCER WORKUP
0.883	Work-up
0.877	Right hemiparesis
0.877	Cardioembolic Stroke
0.874	Transient Ischemic Attack
0.871	Thalamic infarction


Epoch 23: 1440it [06:49,  3.52it/s, loss=8040.249]                                                                     


1.000	Ischemic stroke
0.904	Acute Cerebrovascular Accidents
0.898	Acute ischemic stroke subtype
0.883	WEAKNESS RESIDUAL
0.876	Cardioembolic Stroke
0.866	Work-up
0.866	Right hemiparesis
0.862	CVA DISTRIBUTION MCA
0.861	CANCER WORKUP
0.861	Transient Ischemic Attack


Epoch 24: 1440it [06:41,  3.59it/s, loss=8271.805]                                                                     


1.000	Ischemic stroke
0.931	Acute Cerebrovascular Accidents
0.908	Acute ischemic stroke subtype
0.899	WEAKNESS RESIDUAL
0.888	CANCER WORKUP
0.886	Thalamic infarction
0.885	CVA DISTRIBUTION MCA
0.883	Right hemiparesis
0.881	Cardioembolic Stroke
0.872	Evaluation


Epoch 25: 1440it [07:03,  3.40it/s, loss=7782.830]                                                                     


1.000	Ischemic stroke
0.921	Acute Cerebrovascular Accidents
0.917	Acute ischemic stroke subtype
0.909	CANCER WORKUP
0.907	WEAKNESS RESIDUAL
0.890	Thalamic infarction
0.888	Right hemiparesis
0.880	Cardioembolic Stroke
0.876	Transient Ischemic Attack
0.875	CVA DISTRIBUTION MCA


Epoch 26: 1440it [06:33,  3.66it/s, loss=7527.915]                                                                     


1.000	Ischemic stroke
0.913	Acute Cerebrovascular Accidents
0.903	Acute ischemic stroke subtype
0.895	WEAKNESS RESIDUAL
0.886	Transient Ischemic Attack
0.884	CANCER WORKUP
0.874	Thalamic infarction
0.871	CVA DISTRIBUTION MCA
0.870	Right hemiparesis
0.864	Left hemiparesis


Epoch 27: 1440it [06:42,  3.58it/s, loss=7620.201]                                                                     


1.000	Ischemic stroke
0.916	Acute Cerebrovascular Accidents
0.894	Acute ischemic stroke subtype
0.893	CANCER WORKUP
0.892	WEAKNESS RESIDUAL
0.880	Thalamic infarction
0.878	Transient Ischemic Attack
0.873	Left hemiparesis
0.873	CVA DISTRIBUTION MCA
0.866	Intracranial Hemorrhage


Epoch 28: 1440it [07:03,  3.40it/s, loss=7356.315]                                                                     


1.000	Ischemic stroke
0.918	Acute Cerebrovascular Accidents
0.910	Acute ischemic stroke subtype
0.906	CANCER WORKUP
0.906	WEAKNESS RESIDUAL
0.889	Thalamic infarction
0.880	CVA DISTRIBUTION MCA
0.877	Work-up
0.873	Intracranial Hemorrhage
0.870	Transient Ischemic Attack


Epoch 29: 1440it [07:17,  3.29it/s, loss=7295.637]                                                                     


1.000	Ischemic stroke
0.929	Acute Cerebrovascular Accidents
0.913	WEAKNESS RESIDUAL
0.912	Acute ischemic stroke subtype
0.901	CANCER WORKUP
0.895	Thalamic infarction
0.894	Right hemiparesis
0.889	CVA DISTRIBUTION MCA
0.888	Work-up
0.879	neurological weakness of the right or left side


Epoch 30: 1440it [06:57,  3.45it/s, loss=7111.120]                                                                     


1.000	Ischemic stroke
0.922	Acute Cerebrovascular Accidents
0.905	Acute ischemic stroke subtype
0.901	WEAKNESS RESIDUAL
0.890	CANCER WORKUP
0.890	Left hemiparesis
0.887	Thalamic infarction
0.883	Transient Ischemic Attack
0.878	Work-up
0.878	CVA DISTRIBUTION MCA


Epoch 31: 1440it [06:59,  3.44it/s, loss=6995.442]                                                                     


1.000	Ischemic stroke
0.933	Acute Cerebrovascular Accidents
0.910	Acute ischemic stroke subtype
0.909	WEAKNESS RESIDUAL
0.891	Thalamic infarction
0.891	Left hemiparesis
0.881	CANCER WORKUP
0.881	Work-up
0.880	Intracranial Hemorrhage
0.876	CVA DISTRIBUTION MCA


Epoch 32: 1440it [06:44,  3.56it/s, loss=7182.766]                                                                     


1.000	Ischemic stroke
0.929	Acute Cerebrovascular Accidents
0.914	Acute ischemic stroke subtype
0.907	WEAKNESS RESIDUAL
0.902	Work-up
0.892	Thalamic infarction
0.889	CANCER WORKUP
0.885	Intracranial Hemorrhage
0.879	Left hemiparesis
0.878	Transient Ischemic Attack


Epoch 33: 1440it [07:10,  3.35it/s, loss=6676.755]                                                                     


1.000	Ischemic stroke
0.933	Acute Cerebrovascular Accidents
0.925	Acute ischemic stroke subtype
0.901	WEAKNESS RESIDUAL
0.892	Transient Ischemic Attack
0.890	CANCER WORKUP
0.887	Work-up
0.886	Thalamic infarction
0.880	CVA DISTRIBUTION MCA
0.875	Intracranial Hemorrhage


Epoch 34: 1440it [07:10,  3.35it/s, loss=7155.156]                                                                     


1.000	Ischemic stroke
0.929	Acute Cerebrovascular Accidents
0.928	Acute ischemic stroke subtype
0.899	WEAKNESS RESIDUAL
0.893	CANCER WORKUP
0.889	CVA DISTRIBUTION MCA
0.887	Thalamic infarction
0.884	Evaluation
0.882	Right hemiparesis
0.873	Left hemiparesis


Epoch 35: 1440it [06:46,  3.54it/s, loss=6626.519]                                                                     


1.000	Ischemic stroke
0.931	Acute Cerebrovascular Accidents
0.926	Acute ischemic stroke subtype
0.899	WEAKNESS RESIDUAL
0.893	CANCER WORKUP
0.890	Thalamic infarction
0.887	Transient Ischemic Attack
0.885	CVA DISTRIBUTION MCA
0.882	Intracranial Hemorrhage
0.881	Subacute


Epoch 36: 1440it [06:43,  3.57it/s, loss=6971.212]                                                                     


1.000	Ischemic stroke
0.935	Acute ischemic stroke subtype
0.935	Acute Cerebrovascular Accidents
0.905	WEAKNESS RESIDUAL
0.903	Thalamic infarction
0.897	Transient Ischemic Attack
0.894	CANCER WORKUP
0.893	Work-up
0.889	CVA DISTRIBUTION MCA
0.884	Right hemiparesis


Epoch 37: 1440it [07:19,  3.28it/s, loss=6660.385]                                                                     


1.000	Ischemic stroke
0.934	Acute ischemic stroke subtype
0.932	Acute Cerebrovascular Accidents
0.906	WEAKNESS RESIDUAL
0.903	CANCER WORKUP
0.902	Thalamic infarction
0.892	Work-up
0.891	Left hemiparesis
0.889	Transient Ischemic Attack
0.887	CVA DISTRIBUTION MCA


Epoch 38: 1440it [07:01,  3.41it/s, loss=6587.805]                                                                     


1.000	Ischemic stroke
0.942	Acute Cerebrovascular Accidents
0.928	Acute ischemic stroke subtype
0.917	CANCER WORKUP
0.907	Thalamic infarction
0.904	Work-up
0.895	WEAKNESS RESIDUAL
0.894	CVA DISTRIBUTION MCA
0.891	Intracranial Hemorrhage
0.888	Right hemiparesis


Epoch 39: 1440it [06:54,  3.48it/s, loss=6579.913]                                                                     


1.000	Ischemic stroke
0.930	Acute Cerebrovascular Accidents
0.921	Acute ischemic stroke subtype
0.909	Thalamic infarction
0.904	CANCER WORKUP
0.897	Work-up
0.896	WEAKNESS RESIDUAL
0.895	Right hemiparesis
0.893	CVA DISTRIBUTION MCA
0.888	Transient Ischemic Attack


Epoch 40: 1440it [06:52,  3.49it/s, loss=6685.533]                                                                     


1.000	Ischemic stroke
0.945	Acute Cerebrovascular Accidents
0.933	Acute ischemic stroke subtype
0.913	CANCER WORKUP
0.910	WEAKNESS RESIDUAL
0.905	Thalamic infarction
0.896	Right hemiparesis
0.896	CVA DISTRIBUTION MCA
0.891	Right sided cerebral hemisphere cerebrovascular accident
0.887	Left hemiparesis


Epoch 41: 1440it [07:04,  3.39it/s, loss=6411.663]                                                                     


1.000	Ischemic stroke
0.929	Acute Cerebrovascular Accidents
0.926	Acute ischemic stroke subtype
0.914	WEAKNESS RESIDUAL
0.903	Right hemiparesis
0.901	CANCER WORKUP
0.899	Thalamic infarction
0.892	Transient Ischemic Attack
0.892	Left hemiparesis
0.889	Stroke, Lacunar


Epoch 42: 1440it [07:10,  3.34it/s, loss=6147.222]                                                                     


1.000	Ischemic stroke
0.929	Acute Cerebrovascular Accidents
0.921	Acute ischemic stroke subtype
0.903	CANCER WORKUP
0.901	WEAKNESS RESIDUAL
0.892	Thalamic infarction
0.890	Work-up
0.889	Left hemiparesis
0.887	Right hemiparesis
0.885	Transient Ischemic Attack


Epoch 43: 1440it [06:52,  3.49it/s, loss=6233.389]                                                                     


1.000	Ischemic stroke
0.942	Acute Cerebrovascular Accidents
0.929	Acute ischemic stroke subtype
0.901	CVA DISTRIBUTION MCA
0.899	CANCER WORKUP
0.899	WEAKNESS RESIDUAL
0.899	Stroke, Lacunar
0.898	Thalamic infarction
0.896	Work-up
0.888	Left hemiparesis


Epoch 44: 1440it [07:04,  3.39it/s, loss=6104.941]                                                                     


1.000	Ischemic stroke
0.943	Acute Cerebrovascular Accidents
0.935	Acute ischemic stroke subtype
0.909	WEAKNESS RESIDUAL
0.903	CVA DISTRIBUTION MCA
0.902	Thalamic infarction
0.899	CANCER WORKUP
0.894	Left hemiparesis
0.892	Stroke, Lacunar
0.892	Right hemiparesis


Epoch 45: 1440it [06:57,  3.45it/s, loss=6244.401]                                                                     


1.000	Ischemic stroke
0.934	Acute Cerebrovascular Accidents
0.933	Acute ischemic stroke subtype
0.900	CVA DISTRIBUTION MCA
0.900	Thalamic infarction
0.898	WEAKNESS RESIDUAL
0.893	Stroke, Lacunar
0.889	CANCER WORKUP
0.887	Work-up
0.885	Right hemiparesis


Epoch 46: 1440it [07:11,  3.34it/s, loss=6165.976]                                                                     


1.000	Ischemic stroke
0.931	Acute Cerebrovascular Accidents
0.929	Acute ischemic stroke subtype
0.901	WEAKNESS RESIDUAL
0.898	CVA DISTRIBUTION MCA
0.897	Stroke, Lacunar
0.896	Thalamic infarction
0.893	CANCER WORKUP
0.887	Cerebellar stroke
0.883	Right hemiparesis


Epoch 47: 1440it [06:59,  3.43it/s, loss=6060.653]                                                                     


1.000	Ischemic stroke
0.932	Acute Cerebrovascular Accidents
0.923	Acute ischemic stroke subtype
0.902	WEAKNESS RESIDUAL
0.893	Thalamic infarction
0.891	Stroke, Lacunar
0.890	CVA DISTRIBUTION MCA
0.885	CANCER WORKUP
0.880	Cerebellar stroke
0.879	Intracranial Hemorrhage


Epoch 48: 1440it [06:47,  3.54it/s, loss=6231.445]                                                                     


1.000	Ischemic stroke
0.930	Acute Cerebrovascular Accidents
0.927	Acute ischemic stroke subtype
0.905	WEAKNESS RESIDUAL
0.895	Stroke, Lacunar
0.891	Transient Ischemic Attack
0.891	CVA DISTRIBUTION MCA
0.887	Thalamic infarction
0.886	CANCER WORKUP
0.882	Intracranial Hemorrhage


Epoch 49: 1440it [07:06,  3.37it/s, loss=6090.204]                                                                     


1.000	Ischemic stroke
0.935	Acute Cerebrovascular Accidents
0.923	Acute ischemic stroke subtype
0.908	WEAKNESS RESIDUAL
0.901	CANCER WORKUP
0.897	Stroke, Lacunar
0.895	CVA DISTRIBUTION MCA
0.894	Thalamic infarction
0.889	Left hemiparesis
0.888	Right hemiparesis


Epoch 50: 1440it [07:09,  3.35it/s, loss=5829.967]                                                                     


1.000	Ischemic stroke
0.934	Acute Cerebrovascular Accidents
0.927	Acute ischemic stroke subtype
0.907	WEAKNESS RESIDUAL
0.894	CVA DISTRIBUTION MCA
0.893	Stroke, Lacunar
0.889	Thalamic infarction
0.888	Transient Ischemic Attack
0.886	CANCER WORKUP
0.881	Intracranial Hemorrhage
[48117001.0, 34838506.712890625, 30136581.215820312, 27425331.391601562, 25559103.729492188, 24140458.036132812, 23013291.181640625, 22087911.190429688, 21303551.436523438, 20624172.674804688, 20006899.734375, 19459065.2421875, 18958038.936523438, 18490251.908203125, 18061518.086914062, 17674138.061523438, 17319280.380859375, 16983358.043945312, 16664058.073242188, 16357197.025390625, 16083402.198242188, 15807204.088867188, 15563606.02734375, 15319388.051757812, 15087667.051757812, 14865657.7578125, 14693355.289550781, 14465770.283691406, 14276719.106445312, 14097417.866210938, 13922525.47265625, 13762162.331054688, 13594302.790039062, 13442185.420898438, 13293640.6796875, 13153114.493164062, 13020411.931640625, 1

In [19]:
torch.save(model.state_dict(), 'cui_model_6_MAR_2022.pt')

In [20]:
for _, i in enumerate(losses):
    if losses[_-1] < i: #if loss is always monitonically decreasing, should print 0 only
        print(_)

0


In [21]:
with open("cui_vectors_6_MAR_2022.json",'w') as outfile:
    json.dump({key:model.in_embeddings.weight.data[value].tolist() for key,value in vocab.items()}, outfile)

In [15]:
# ToDo: Build document vectors from CUI vector components
docvectors = np.zeros((len(cuidata), EMBEDDING_DIM))
for i,doc in tqdm(enumerate(cuidata), total=len(cuidata)):
    for cui in cuidata[doc]['cuis']:
        docvectors[i] += model.in_embeddings.weight.data[vocab[cui]].numpy()


100%|███████████████████████████████████| 29865/29865 [01:16<00:00, 391.28it/s]


In [21]:
# ToDo: Build document vectors from CUI vector components
docvectors = np.zeros((len(cuidata), EMBEDDING_DIM))
for i,doc in tqdm(enumerate(cuidata), total=len(cuidata)):
    tmpvecs = np.zeros(EMBEDDING_DIM)
    for cui in cuidata[doc]['cuis']:
        tmpvecs += model.in_embeddings.weight.data[vocab[cui]].numpy()
    docvectors[i] = tmpvecs / len(cuidata[doc]['cuis'])

100%|███████████████████████████████████| 29865/29865 [01:07<00:00, 439.69it/s]


In [22]:
with open('average_document_vectors_28_FEB_2022.json','w') as outfile:
    json.dump({x:list(y) for x,y in zip(cuidata.keys(), docvectors)}, outfile)

In [27]:
'''
To load the saved pytorch model:

model = CUIEmbeddingModel(len(vocab), EMBEDDING_DIM) #vocab and embedding dim must match what the model was trained on
model.load_state_dict(torch.load(cui_model_15_NOV_2021.pt))
model.eval() #set to evaluation mode
''';

In [32]:
nearest_embedding_search('C0742946')

1.000	CVA ETIOLOGY HEMORRHAGIC ISCHEMIC
0.847	Acute Cerebrovascular Accidents
0.836	Ischemic stroke
0.829	Left hemiparesis
0.829	Structure of middle cerebral artery
0.822	Thalamic infarction
0.821	Transient Ischemic Attack
0.820	Evaluation
0.815	Acute ischemic stroke subtype
0.815	transient ischemic attack without residual deficits


In [33]:
import xgboost