In [1]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from collections import Counter

In [2]:
#Load the cui data
# os.chdir('P:\ORD_Singh_201911038D\Justin\Stroke_Notes_13OCT21') #make sure we're in the right place
DATA_PATH = "P:\ORD_Singh_201911038D\Justin\Stroke_Notes_13OCT21"
with open(os.path.join(DATA_PATH, "sids_to_clamp_cuis.json"),'r') as infile:
    cuidata = json.load(infile)

In [3]:
#print(list(cuidata)[0]) # print the first document ID
print(cuidata[list(cuidata)[0]].keys()) # Just to show the data fields in each document
print(cuidata[list(cuidata)[0]]['cuis'][:10]) # show the first 10 cuis of this specific document

dict_keys(['cuis', 'semtype', 'presence'])
['C0008031', 'C1507320', 'C0015031', 'C4718442', 'C2707412', 'C0281822', 'C0043250', 'C0281822', 'C0398266', 'C3244243']


In [4]:
# We will get training pairs. One is the context term the next is the target term.
trainingpairs = []
vocab = dict()
frequency = Counter()
idx = 0

# This could absolutely be accomplished in an alternative / faster way but it's sufficiently fast for now
for doc in tqdm(cuidata):
    cuis = cuidata[doc]['cuis']
    cuiids = []
    for x in cuis:
        if x not in vocab.keys():
            vocab.update({x:idx})
            idx+=1
        cuiids += [vocab[x]]
    for i in range(len(cuiids)-1):
        pair = cuiids[i:i+2]
        if len(pair) < 2:
            continue
        if pair[0] != pair[1]:
            trainingpairs += [pair] #no self references
            frequency.update(pair)
print(trainingpairs[:3])
print(len(trainingpairs))
print(len(vocab))
assert np.all(np.asarray([x for x in vocab.values()]) == np.arange(len(vocab)))
# We'll use an embedding dimension of 50 to start
EMBEDDING_DIM = 100

100%|███████████████████████████████████████████████████████████████████████████| 29865/29865 [00:34<00:00, 866.05it/s]

[[0, 1], [1, 2], [2, 3]]
3526722
111303





In [5]:
frequency = np.asarray([values for key,values in frequency.items()])**0.75
frequency /= np.linalg.norm(frequency, ord=1)
frequency = torch.from_numpy(frequency)
samplingids = torch.arange(0, len(frequency))

In [6]:
class CUIEmbeddingModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CUIEmbeddingModel, self).__init__()
        self.dim = embedding_dim
        self.in_embeddings = torch.nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.out_embeddings = torch.nn.Embedding(vocab_size, embedding_dim, sparse=True)
        
    def init(self):
        self.in_embeddings.weight.data.uniform_(-0.5/self.dim, 0.5/self.dim) # scaled by dimensionality to control initial norm.
        self.out_embeddings.weight.data.uniform(-0, 0) # all 0s
        
    def forward(self, inputs, targets, negatives):
        inembed = self.in_embeddings(inputs)
        outembed = self.out_embeddings(targets)
        pos_score = torch.sum(torch.mul(inembed, outembed), dim=1)
        pos_score = torch.nn.functional.logsigmoid(pos_score)
        negembed = self.out_embeddings(negatives)
        neg_score = torch.bmm(negembed, inembed.unsqueeze(2)).squeeze()
        neg_score = torch.nn.functional.logsigmoid(-1*neg_score)
        return -1 * (torch.sum(pos_score)+torch.sum(neg_score))
    

In [7]:
losses = []
model = CUIEmbeddingModel(len(vocab), EMBEDDING_DIM)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
NEGSAMPLES = 2

In [8]:
#Test batching (also gives you an idea of theoretical max throughput in a way)
BATCH_SIZE=2000
for batchidx in tqdm(np.arange(0, len(trainingpairs), BATCH_SIZE), total = len(trainingpairs)//BATCH_SIZE):
    data = trainingpairs[batchidx:batchidx+BATCH_SIZE]
    context_idxs = torch.tensor(data)

1764it [00:01, 930.75it/s]                                                                                             


In [9]:
def nearest_embedding_search(cui):
    #C0948008 = ischemic stroke
    for i in zip(*torch.topk(torch.nn.functional.cosine_similarity(model.in_embeddings.weight.data[vocab[cui]].view(1,-1), 
                                                                   model.in_embeddings.weight.data), 10, largest=True)):
        print(f'{i[0]:0.3f}\t{cuitranslate[id2vocab[int(i[1])]]}')

In [10]:
id2vocab = {value:key for key,value in vocab.items()}
with open(os.path.join(DATA_PATH, "cuitranslate.json"),'r') as infile:
    cuitranslate = json.load(infile)

In [11]:
print(list(id2vocab.items())[0:10])
print(list(cuitranslate.items())[0:10])
print(cuitranslate["C0000005"])

[(0, 'C0008031'), (1, 'C1507320'), (2, 'C0015031'), (3, 'C4718442'), (4, 'C2707412'), (5, 'C0281822'), (6, 'C0043250'), (7, 'C0398266'), (8, 'C3244243'), (9, 'C0018802')]
[('C0000005', '(131)I-Macroaggregated Albumin'), ('C0000039', '1,2-dipalmitoylphosphatidylcholine'), ('C0000052', '1,4-alpha-Glucan Branching Enzyme'), ('C0000074', '1-Alkyl-2-Acylphosphatidates'), ('C0000084', '1-Carboxyglutamic Acid'), ('C0000096', '1-Methyl-3-isobutylxanthine'), ('C0000097', '1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine'), ('C0000098', '1-Methyl-4-phenylpyridinium'), ('C0000102', '1-Naphthylamine'), ('C0000103', '1-Naphthylisothiocyanate')]
(131)I-Macroaggregated Albumin


In [12]:
trainingpairs = torch.LongTensor(trainingpairs)
for epoch in range(50):
    total_loss = 0
    shuffler = torch.randperm(trainingpairs.shape[0])
    trainingpairs = trainingpairs[shuffler].view(trainingpairs.size())
    with tqdm(np.arange(0, len(trainingpairs), BATCH_SIZE), desc=f'Epoch {epoch+1}', total = len(trainingpairs)//BATCH_SIZE) as progress: #goes one example at a time
        for batchidx in progress:
            data = trainingpairs[batchidx:batchidx+BATCH_SIZE]
            inputs = torch.cat((data[:, 0], data[:,1])) #we'll go bidirectional; usually not done I suppose
            targets = torch.cat((data[:, 1], data[:,0]))#doubles the batch size
            negatives = samplingids[frequency.multinomial(num_samples=inputs.shape[0]*NEGSAMPLES, replacement=True)].reshape(inputs.shape[0], NEGSAMPLES)
            optimizer.zero_grad()
            loss = model.forward(inputs, targets, negatives)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress.set_postfix(loss=f"{loss.item():3.3f}")
        losses.append(total_loss)
        progress.set_postfix(loss=f"{total_loss:3.3f}")
    nearest_embedding_search('C0948008')
    
print(losses)


Epoch 1: 1764it [07:58,  3.68it/s, loss=9492.232]                                                                      


1.000	Ischemic stroke
0.415	Pathological fracture of left rib
0.386	spindle cell rhabdomyosarcoma of soft tissue
0.375	Trimalleolar Fractures
0.373	Self-expanding stent
0.372	Bundle of His
0.369	magnetic resonance imaging of knee: non-displaced fracture
0.369	pomalidomide
0.364	Cisternal tap
0.357	Color Blindness, Blue


Epoch 2: 1764it [08:10,  3.60it/s, loss=7536.564]                                                                      


1.000	Ischemic stroke
0.391	Pathological fracture of left rib
0.385	Trimalleolar Fractures
0.384	pomalidomide
0.376	spindle cell rhabdomyosarcoma of soft tissue
0.374	Femoracetabular Impingement
0.368	Guidance for cryoablation:Find:Pt:Chest>Lung:Doc:{Imaging modality}
0.365	Pertinent information
0.360	CAT scan of head
0.359	stress test-induced anxiety


Epoch 3: 1764it [08:04,  3.64it/s, loss=6796.781]                                                                      


1.000	Ischemic stroke
0.503	Diabetes Mellitus, Non-Insulin-Dependent
0.492	Transient Ischemic Attack
0.488	Acute Cerebrovascular Accidents
0.487	Imagent
0.483	square decimeter
0.482	Evaluation
0.479	Hypertensive disease
0.478	CAD gene
0.476	X-Ray Computed Tomography


Epoch 4: 1764it [08:06,  3.63it/s, loss=6326.548]                                                                      


1.000	Ischemic stroke
0.693	Acute Cerebrovascular Accidents
0.667	Transient Ischemic Attack
0.666	CANCER WORKUP
0.665	work restrictions decision pending further evaluation
0.663	Left hemiparesis
0.634	Evaluation
0.632	Rehabilitation therapy
0.632	Atrial Fibrillation
0.628	Magnetic Resonance Imaging


Epoch 5: 1764it [07:40,  3.83it/s, loss=6165.735]                                                                      


1.000	Ischemic stroke
0.772	Acute Cerebrovascular Accidents
0.733	CANCER WORKUP
0.732	Evaluation
0.698	Left hemiparesis
0.688	Magnetic Resonance Imaging
0.686	Rehabilitation therapy
0.685	Transient Ischemic Attack
0.678	Telemetry
0.673	Deep Vein Thrombosis


Epoch 6: 1764it [07:49,  3.75it/s, loss=5586.606]                                                                      


1.000	Ischemic stroke
0.809	Transient Ischemic Attack
0.772	Acute Cerebrovascular Accidents
0.760	CANCER WORKUP
0.754	Evaluation
0.719	Cerebrovascular accident
0.717	Atrial Fibrillation
0.699	Carcinoembryonic Antigen
0.698	Right hemiparesis
0.697	Magnetic resonance imaging of brain and brain stem


Epoch 7: 1764it [08:11,  3.59it/s, loss=5586.883]                                                                      


1.000	Ischemic stroke
0.842	Transient Ischemic Attack
0.800	Acute Cerebrovascular Accidents
0.798	Evaluation
0.797	Acute ischemic stroke subtype
0.780	CANCER WORKUP
0.755	Work-up
0.753	Magnetic resonance imaging of brain and brain stem
0.752	work restrictions decision pending further evaluation
0.751	CAT scan of head


Epoch 8: 1764it [08:25,  3.49it/s, loss=5135.917]                                                                      


1.000	Ischemic stroke
0.838	Acute Cerebrovascular Accidents
0.820	CANCER WORKUP
0.818	Work-up
0.817	Evaluation
0.802	Transient Ischemic Attack
0.793	Acute ischemic stroke subtype
0.792	Right hemiparesis
0.788	Structure of middle cerebral artery
0.786	Left hemiparesis


Epoch 9: 1764it [08:26,  3.48it/s, loss=4770.896]                                                                      


1.000	Ischemic stroke
0.853	Acute Cerebrovascular Accidents
0.847	CANCER WORKUP
0.839	Transient Ischemic Attack
0.838	Acute ischemic stroke subtype
0.820	Evaluation
0.820	Left hemiparesis
0.819	WEAKNESS RESIDUAL
0.808	Cerebrovascular accident
0.805	Further


Epoch 10: 1764it [07:59,  3.68it/s, loss=4861.857]                                                                     


1.000	Ischemic stroke
0.888	Acute Cerebrovascular Accidents
0.867	Acute ischemic stroke subtype
0.866	Transient Ischemic Attack
0.856	WEAKNESS RESIDUAL
0.848	CANCER WORKUP
0.835	Evaluation
0.830	Intervention regimes
0.823	Structure of middle cerebral artery
0.823	Left hemiparesis


Epoch 11: 1764it [08:13,  3.57it/s, loss=4787.319]                                                                     


1.000	Ischemic stroke
0.891	Acute ischemic stroke subtype
0.883	Acute Cerebrovascular Accidents
0.872	Transient Ischemic Attack
0.858	CANCER WORKUP
0.855	Intervention regimes
0.850	Cardioembolic Stroke
0.849	WEAKNESS RESIDUAL
0.839	Left hemiparesis
0.835	Cerebrovascular accident


Epoch 12: 1764it [08:13,  3.57it/s, loss=4528.994]                                                                     


1.000	Ischemic stroke
0.901	Acute Cerebrovascular Accidents
0.887	Acute ischemic stroke subtype
0.866	Transient Ischemic Attack
0.856	Work-up
0.855	WEAKNESS RESIDUAL
0.850	Evaluation
0.846	Cardioembolic Stroke
0.846	CANCER WORKUP
0.836	Thalamic infarction


Epoch 13: 1764it [07:42,  3.82it/s, loss=4676.262]                                                                     


1.000	Ischemic stroke
0.906	Acute Cerebrovascular Accidents
0.893	Acute ischemic stroke subtype
0.877	CANCER WORKUP
0.858	Transient Ischemic Attack
0.857	Evaluation
0.852	Thalamic infarction
0.851	Cardioembolic Stroke
0.841	Magnetic Resonance Imaging
0.839	WEAKNESS RESIDUAL


Epoch 14: 1764it [07:58,  3.69it/s, loss=4104.867]                                                                     


1.000	Ischemic stroke
0.907	Acute Cerebrovascular Accidents
0.902	Acute ischemic stroke subtype
0.882	CANCER WORKUP
0.877	Transient Ischemic Attack
0.871	Thalamic infarction
0.858	Evaluation
0.850	Work-up
0.848	Cardioembolic Stroke
0.847	Magnetic Resonance Imaging


Epoch 15: 1764it [08:11,  3.59it/s, loss=4182.456]                                                                     


1.000	Ischemic stroke
0.913	Acute Cerebrovascular Accidents
0.894	Acute ischemic stroke subtype
0.887	Transient Ischemic Attack
0.886	CANCER WORKUP
0.878	Thalamic infarction
0.875	WEAKNESS RESIDUAL
0.866	Evaluation
0.855	Cardioembolic Stroke
0.846	Left hemiparesis


Epoch 16: 1764it [08:17,  3.55it/s, loss=3952.848]                                                                     


1.000	Ischemic stroke
0.916	Acute Cerebrovascular Accidents
0.913	Acute ischemic stroke subtype
0.890	CANCER WORKUP
0.883	Evaluation
0.881	Thalamic infarction
0.880	WEAKNESS RESIDUAL
0.874	Transient Ischemic Attack
0.865	Cardioembolic Stroke
0.855	Cerebrovascular accident


Epoch 17: 1764it [07:59,  3.68it/s, loss=4002.738]                                                                     


1.000	Ischemic stroke
0.927	Acute ischemic stroke subtype
0.919	Acute Cerebrovascular Accidents
0.888	CANCER WORKUP
0.884	Transient Ischemic Attack
0.876	Cardioembolic Stroke
0.875	Neurology speciality
0.871	WEAKNESS RESIDUAL
0.870	Work-up
0.869	Thalamic infarction


Epoch 18: 1764it [08:14,  3.56it/s, loss=3977.569]                                                                     


1.000	Ischemic stroke
0.915	Acute Cerebrovascular Accidents
0.905	Acute ischemic stroke subtype
0.883	Transient Ischemic Attack
0.882	WEAKNESS RESIDUAL
0.879	CANCER WORKUP
0.879	Thalamic infarction
0.866	Work-up
0.864	Cardioembolic Stroke
0.863	Subacute


Epoch 19: 1764it [08:09,  3.61it/s, loss=3723.207]                                                                     


1.000	Ischemic stroke
0.914	Acute Cerebrovascular Accidents
0.912	Acute ischemic stroke subtype
0.888	WEAKNESS RESIDUAL
0.883	Thalamic infarction
0.881	Transient Ischemic Attack
0.879	CANCER WORKUP
0.874	Subacute
0.872	Cardioembolic Stroke
0.861	Work-up


Epoch 20: 1764it [07:56,  3.70it/s, loss=3656.540]                                                                     


1.000	Ischemic stroke
0.936	Acute Cerebrovascular Accidents
0.914	Acute ischemic stroke subtype
0.904	CANCER WORKUP
0.896	Thalamic infarction
0.886	Subacute
0.886	WEAKNESS RESIDUAL
0.875	Evaluation
0.874	Cardioembolic Stroke
0.871	Stroke in the puerperium


Epoch 21: 1764it [08:04,  3.64it/s, loss=3733.581]                                                                     


1.000	Ischemic stroke
0.932	Acute Cerebrovascular Accidents
0.918	Acute ischemic stroke subtype
0.910	CANCER WORKUP
0.899	Thalamic infarction
0.896	Subacute
0.890	WEAKNESS RESIDUAL
0.884	Evaluation
0.881	Cardioembolic Stroke
0.881	Cerebellar stroke


Epoch 22: 1764it [08:08,  3.61it/s, loss=3654.685]                                                                     


1.000	Ischemic stroke
0.926	Acute Cerebrovascular Accidents
0.919	Acute ischemic stroke subtype
0.900	CANCER WORKUP
0.898	Subacute
0.897	WEAKNESS RESIDUAL
0.896	Thalamic infarction
0.893	Transient Ischemic Attack
0.883	Cerebellar stroke
0.880	Brain hemorrhage


Epoch 23: 1764it [07:57,  3.69it/s, loss=3662.129]                                                                     


1.000	Ischemic stroke
0.928	Acute Cerebrovascular Accidents
0.917	Acute ischemic stroke subtype
0.909	WEAKNESS RESIDUAL
0.901	Thalamic infarction
0.900	Cardioembolic Stroke
0.898	CANCER WORKUP
0.896	Cerebellar stroke
0.889	Subacute
0.885	Work-up


Epoch 24: 1764it [08:11,  3.59it/s, loss=3833.920]                                                                     


1.000	Ischemic stroke
0.927	Acute Cerebrovascular Accidents
0.923	Acute ischemic stroke subtype
0.912	CANCER WORKUP
0.901	Thalamic infarction
0.896	Cerebellar stroke
0.894	Cardioembolic Stroke
0.891	WEAKNESS RESIDUAL
0.888	Transient Ischemic Attack
0.882	Work-up


Epoch 25: 1764it [08:28,  3.47it/s, loss=3385.498]                                                                     


1.000	Ischemic stroke
0.931	Acute Cerebrovascular Accidents
0.918	Acute ischemic stroke subtype
0.905	Thalamic infarction
0.904	CANCER WORKUP
0.904	WEAKNESS RESIDUAL
0.904	Cerebellar stroke
0.895	Embolic Stroke
0.893	Evaluation
0.892	Cardioembolic Stroke


Epoch 26: 1764it [08:23,  3.51it/s, loss=3520.954]                                                                     


1.000	Ischemic stroke
0.937	Acute Cerebrovascular Accidents
0.923	Acute ischemic stroke subtype
0.909	Cerebellar stroke
0.905	CANCER WORKUP
0.901	Thalamic infarction
0.897	WEAKNESS RESIDUAL
0.886	Subacute
0.884	transient ischemic attack without residual deficits
0.882	Embolic Stroke


Epoch 27: 1764it [07:55,  3.71it/s, loss=3460.953]                                                                     


1.000	Ischemic stroke
0.924	Acute Cerebrovascular Accidents
0.917	Acute ischemic stroke subtype
0.905	Cerebellar stroke
0.905	CANCER WORKUP
0.901	WEAKNESS RESIDUAL
0.892	transient ischemic attack without residual deficits
0.891	Transient Ischemic Attack
0.890	Thalamic infarction
0.879	Subacute


Epoch 28: 1764it [08:06,  3.63it/s, loss=3458.620]                                                                     


1.000	Ischemic stroke
0.933	Acute Cerebrovascular Accidents
0.920	Acute ischemic stroke subtype
0.915	Cerebellar stroke
0.907	Thalamic infarction
0.905	CANCER WORKUP
0.892	WEAKNESS RESIDUAL
0.892	Transient Ischemic Attack
0.890	transient ischemic attack without residual deficits
0.887	CVA DISTRIBUTION MCA


Epoch 29: 1764it [08:16,  3.55it/s, loss=3544.070]                                                                     


1.000	Ischemic stroke
0.935	Acute Cerebrovascular Accidents
0.921	Acute ischemic stroke subtype
0.907	Cerebellar stroke
0.906	WEAKNESS RESIDUAL
0.904	Thalamic infarction
0.900	transient ischemic attack without residual deficits
0.899	CVA DISTRIBUTION MCA
0.893	Transient Ischemic Attack
0.888	Evaluation


Epoch 30: 1764it [08:31,  3.45it/s, loss=3290.292]                                                                     


1.000	Ischemic stroke
0.941	Acute Cerebrovascular Accidents
0.918	Acute ischemic stroke subtype
0.904	transient ischemic attack without residual deficits
0.904	Cerebellar stroke
0.903	Thalamic infarction
0.901	WEAKNESS RESIDUAL
0.897	Embolic Stroke
0.896	CVA DISTRIBUTION MCA
0.891	Transient Ischemic Attack


Epoch 31: 1764it [08:14,  3.56it/s, loss=3328.696]                                                                     


1.000	Ischemic stroke
0.937	Acute Cerebrovascular Accidents
0.923	Acute ischemic stroke subtype
0.902	Cerebellar stroke
0.902	CANCER WORKUP
0.900	Thalamic infarction
0.900	transient ischemic attack without residual deficits
0.896	CVA DISTRIBUTION MCA
0.887	WEAKNESS RESIDUAL
0.885	Embolic Stroke


Epoch 32: 1764it [07:59,  3.68it/s, loss=3353.450]                                                                     


1.000	Ischemic stroke
0.936	Acute Cerebrovascular Accidents
0.925	Acute ischemic stroke subtype
0.904	Thalamic infarction
0.901	Cerebellar stroke
0.897	transient ischemic attack without residual deficits
0.890	CANCER WORKUP
0.886	CVA DISTRIBUTION MCA
0.883	WEAKNESS RESIDUAL
0.882	Right sided cerebral hemisphere cerebrovascular accident


Epoch 33: 1764it [08:38,  3.40it/s, loss=3179.852]                                                                     


1.000	Ischemic stroke
0.930	Acute Cerebrovascular Accidents
0.923	Acute ischemic stroke subtype
0.902	Cerebellar stroke
0.899	Thalamic infarction
0.889	transient ischemic attack without residual deficits
0.887	CVA DISTRIBUTION MCA
0.886	Embolic Stroke
0.881	WEAKNESS RESIDUAL
0.880	Transient Ischemic Attack


Epoch 34: 1764it [08:58,  3.28it/s, loss=2886.509]                                                                     


1.000	Ischemic stroke
0.936	Acute Cerebrovascular Accidents
0.926	Acute ischemic stroke subtype
0.904	Cerebellar stroke
0.895	Thalamic infarction
0.894	CANCER WORKUP
0.891	transient ischemic attack without residual deficits
0.891	WEAKNESS RESIDUAL
0.889	Subacute
0.888	Embolic Stroke


Epoch 35: 1764it [09:42,  3.03it/s, loss=3193.124]                                                                     


1.000	Ischemic stroke
0.936	Acute Cerebrovascular Accidents
0.931	Acute ischemic stroke subtype
0.910	Cerebellar stroke
0.903	Embolic Stroke
0.903	CANCER WORKUP
0.898	Thalamic infarction
0.896	Subacute
0.894	CVA DISTRIBUTION MCA
0.890	WEAKNESS RESIDUAL


Epoch 36: 1764it [09:17,  3.17it/s, loss=3138.101]                                                                     


1.000	Ischemic stroke
0.939	Acute Cerebrovascular Accidents
0.927	Acute ischemic stroke subtype
0.911	Embolic Stroke
0.910	Cerebellar stroke
0.900	transient ischemic attack without residual deficits
0.896	WEAKNESS RESIDUAL
0.895	Right sided cerebral hemisphere cerebrovascular accident
0.894	Thalamic infarction
0.886	Transient Ischemic Attack


Epoch 37: 1764it [09:27,  3.11it/s, loss=2929.475]                                                                     


1.000	Ischemic stroke
0.942	Acute Cerebrovascular Accidents
0.920	Acute ischemic stroke subtype
0.909	Cerebellar stroke
0.904	Embolic Stroke
0.902	transient ischemic attack without residual deficits
0.902	Transient Ischemic Attack
0.895	WEAKNESS RESIDUAL
0.894	Thalamic infarction
0.892	Right sided cerebral hemisphere cerebrovascular accident


Epoch 38: 1764it [09:07,  3.22it/s, loss=3037.448]                                                                     


1.000	Ischemic stroke
0.938	Acute Cerebrovascular Accidents
0.922	Acute ischemic stroke subtype
0.913	Cerebellar stroke
0.898	Thalamic infarction
0.898	Embolic Stroke
0.896	transient ischemic attack without residual deficits
0.896	CANCER WORKUP
0.890	Subacute
0.889	WEAKNESS RESIDUAL


Epoch 39: 1764it [09:05,  3.23it/s, loss=3060.323]                                                                     


1.000	Ischemic stroke
0.936	Acute Cerebrovascular Accidents
0.918	Acute ischemic stroke subtype
0.910	Cerebellar stroke
0.902	Embolic Stroke
0.900	CANCER WORKUP
0.899	transient ischemic attack without residual deficits
0.898	Subacute
0.896	Thalamic infarction
0.893	Transient Ischemic Attack


Epoch 40: 1764it [13:23,  2.20it/s, loss=2940.464]                                                                     


1.000	Ischemic stroke
0.943	Acute Cerebrovascular Accidents
0.916	Acute ischemic stroke subtype
0.915	Cerebellar stroke
0.898	Thalamic infarction
0.893	transient ischemic attack without residual deficits
0.893	Right sided cerebral hemisphere cerebrovascular accident
0.891	Cardioembolic Stroke
0.890	Embolic Stroke
0.890	Transient Ischemic Attack


Epoch 41: 1764it [13:56,  2.11it/s, loss=2814.748]                                                                     


1.000	Ischemic stroke
0.941	Acute Cerebrovascular Accidents
0.925	Acute ischemic stroke subtype
0.917	Cerebellar stroke
0.903	Thalamic infarction
0.893	Subacute
0.892	transient ischemic attack without residual deficits
0.891	Right sided cerebral hemisphere cerebrovascular accident
0.890	Cardioembolic Stroke
0.890	CANCER WORKUP


Epoch 42: 1764it [12:26,  2.36it/s, loss=2806.391]                                                                     


1.000	Ischemic stroke
0.937	Acute Cerebrovascular Accidents
0.923	Acute ischemic stroke subtype
0.921	Cerebellar stroke
0.907	Thalamic infarction
0.895	Transient Ischemic Attack
0.895	Subacute
0.891	transient ischemic attack without residual deficits
0.891	WEAKNESS RESIDUAL
0.890	Embolic Stroke


Epoch 43: 1764it [12:27,  2.36it/s, loss=2860.461]                                                                     


1.000	Ischemic stroke
0.934	Acute Cerebrovascular Accidents
0.924	Cerebellar stroke
0.922	Acute ischemic stroke subtype
0.915	Thalamic infarction
0.894	Subacute
0.894	transient ischemic attack without residual deficits
0.892	WEAKNESS RESIDUAL
0.887	Stroke, Lacunar
0.885	Embolic Stroke


Epoch 44: 1764it [12:13,  2.41it/s, loss=2877.446]                                                                     


1.000	Ischemic stroke
0.936	Acute Cerebrovascular Accidents
0.925	Acute ischemic stroke subtype
0.922	Cerebellar stroke
0.907	Thalamic infarction
0.905	WEAKNESS RESIDUAL
0.902	transient ischemic attack without residual deficits
0.900	Transient Ischemic Attack
0.893	Embolic Stroke
0.891	Stroke, Lacunar


Epoch 45: 1764it [12:19,  2.39it/s, loss=2835.276]                                                                     


1.000	Ischemic stroke
0.938	Acute Cerebrovascular Accidents
0.928	Acute ischemic stroke subtype
0.917	Cerebellar stroke
0.904	Thalamic infarction
0.901	WEAKNESS RESIDUAL
0.898	transient ischemic attack without residual deficits
0.894	Stroke, Lacunar
0.894	CANCER WORKUP
0.892	Embolic Stroke


Epoch 46: 1764it [11:39,  2.52it/s, loss=2731.224]                                                                     


1.000	Ischemic stroke
0.942	Acute Cerebrovascular Accidents
0.923	Acute ischemic stroke subtype
0.915	Cerebellar stroke
0.902	Thalamic infarction
0.900	WEAKNESS RESIDUAL
0.893	Stroke, Lacunar
0.893	transient ischemic attack without residual deficits
0.892	Embolic Stroke
0.886	Transient Ischemic Attack


Epoch 47: 1764it [11:47,  2.49it/s, loss=2727.372]                                                                     


1.000	Ischemic stroke
0.948	Acute Cerebrovascular Accidents
0.924	Acute ischemic stroke subtype
0.921	Cerebellar stroke
0.899	Thalamic infarction
0.898	WEAKNESS RESIDUAL
0.894	Transient Ischemic Attack
0.894	CANCER WORKUP
0.894	Embolic Stroke
0.893	transient ischemic attack without residual deficits


Epoch 48: 1764it [12:09,  2.42it/s, loss=2717.641]                                                                     


1.000	Ischemic stroke
0.940	Acute Cerebrovascular Accidents
0.922	Acute ischemic stroke subtype
0.913	Cerebellar stroke
0.901	Thalamic infarction
0.892	transient ischemic attack without residual deficits
0.889	Embolic Stroke
0.887	Stroke, Lacunar
0.887	Transient Ischemic Attack
0.886	WEAKNESS RESIDUAL


Epoch 49: 1764it [12:21,  2.38it/s, loss=2524.348]                                                                     


1.000	Ischemic stroke
0.941	Acute Cerebrovascular Accidents
0.914	Acute ischemic stroke subtype
0.911	Cerebellar stroke
0.900	Thalamic infarction
0.894	transient ischemic attack without residual deficits
0.892	CANCER WORKUP
0.890	Embolic Stroke
0.886	Right sided cerebral hemisphere cerebrovascular accident
0.886	Stroke, Lacunar


Epoch 50: 1764it [11:14,  2.62it/s, loss=2744.672]                                                                     


1.000	Ischemic stroke
0.942	Acute Cerebrovascular Accidents
0.920	Acute ischemic stroke subtype
0.905	Cerebellar stroke
0.900	transient ischemic attack without residual deficits
0.899	Thalamic infarction
0.889	Stroke, Lacunar
0.888	WEAKNESS RESIDUAL
0.888	Embolic Stroke
0.883	Transient Ischemic Attack
[57257052.21484375, 40678626.90625, 35044751.833496094, 31836171.319335938, 29633419.258789062, 27983979.967773438, 26664647.645507812, 25576521.196289062, 24623284.225097656, 23813744.600097656, 23080688.614257812, 22425428.349609375, 21829675.135742188, 21282920.526367188, 20788439.360351562, 20319894.665039062, 19887428.76513672, 19492024.178955078, 19117606.393310547, 18766285.12109375, 18439591.645751953, 18119113.669433594, 17833568.506347656, 17558088.908691406, 17286365.635253906, 17047605.104736328, 16808459.82080078, 16573014.188964844, 16348047.401855469, 16142642.270996094, 15942748.134033203, 15753041.085449219, 15573979.71484375, 15383281.936279297, 15210348.486083984, 15052

In [14]:
for _, i in enumerate(losses):
    if losses[_-1] < i: #if loss is always monitonically decreasing, should print 0 only
        print(_)

In [15]:
# ToDo: Build document vectors from CUI vector components
docvectors = np.zeros((len(cuidata), EMBEDDING_DIM))
for i,doc in tqdm(enumerate(cuidata), total=len(cuidata)):
    for cui in cuidata[doc]['cuis']:
        docvectors[i] += model.in_embeddings.weight.data[vocab[cui]].numpy()


100%|███████████████████████████████████| 29865/29865 [01:16<00:00, 391.28it/s]


In [21]:
# ToDo: Build document vectors from CUI vector components
docvectors = np.zeros((len(cuidata), EMBEDDING_DIM))
for i,doc in tqdm(enumerate(cuidata), total=len(cuidata)):
    tmpvecs = np.zeros(EMBEDDING_DIM)
    for cui in cuidata[doc]['cuis']:
        tmpvecs += model.in_embeddings.weight.data[vocab[cui]].numpy()
    docvectors[i] = tmpvecs / len(cuidata[doc]['cuis'])

100%|███████████████████████████████████| 29865/29865 [01:07<00:00, 439.69it/s]


In [22]:
with open('average_document_vectors_28_FEB_2022.json','w') as outfile:
    json.dump({x:list(y) for x,y in zip(cuidata.keys(), docvectors)}, outfile)

In [16]:
with open("cui_vectors_15_NOV_2021.json",'w') as outfile:
    json.dump({key:model.in_embeddings.weight.data[value].tolist() for key,value in vocab.items()}, outfile)

In [25]:
torch.save(model.state_dict(), 'cui_model_15_NOV_2021.pt')

In [27]:
'''
To load the saved pytorch model:

model = CUIEmbeddingModel(len(vocab), EMBEDDING_DIM) #vocab and embedding dim must match what the model was trained on
model.load_state_dict(torch.load(cui_model_15_NOV_2021.pt))
model.eval() #set to evaluation mode
''';

In [32]:
nearest_embedding_search('C0742946')

1.000	CVA ETIOLOGY HEMORRHAGIC ISCHEMIC
0.847	Acute Cerebrovascular Accidents
0.836	Ischemic stroke
0.829	Left hemiparesis
0.829	Structure of middle cerebral artery
0.822	Thalamic infarction
0.821	Transient Ischemic Attack
0.820	Evaluation
0.815	Acute ischemic stroke subtype
0.815	transient ischemic attack without residual deficits


In [33]:
import xgboost