In [137]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [138]:
# Seq-to-Seq problem to infer 'guidance'... 
# pre-trained disorder guidance encoder...
# produces disorders / disorders and problems / disorders, problems and interventions during abstractive summarisation

In [139]:
import json
from collections import Counter
from functools import reduce

import numpy as np
import pandas as pd
import torch
from medcat.cat import CAT
from torch import nn

In [2]:
# Number of records read from the head of each JSONL file loaded below.
total_rec = 200

In [3]:
# Load the first `total_rec` records of the dis-course JSONL file (one JSON
# object per line). The context manager closes the handle, and zipping against
# `range` stops cleanly at EOF instead of feeding '' to json.loads when the
# file holds fewer than `total_rec` lines.
with open('../../hadms_to_dis_course_processed.jsonl') as f:
    dis_course_notes = pd.DataFrame(
        [json.loads(line) for _, line in zip(range(total_rec), f)]
    )

In [4]:
# Load the first `total_rec` records of the hosp-course JSONL file (one JSON
# object per line). The context manager closes the handle, and zipping against
# `range` stops cleanly at EOF instead of feeding '' to json.loads when the
# file holds fewer than `total_rec` lines.
with open('../../hadms_to_hosp_course_processed.jsonl') as f:
    src_notes = pd.DataFrame(
        [json.loads(line) for _, line in zip(range(total_rec), f)]
    )

In [5]:
# Concept type IDs used to select entities of interest.

# Disorders only.
disorder = ['T-11']

# Problem types, i.e. disorders and findings (disorders come first).
problem_types = disorder + ['T-18', 'T-29', 'T-35', 'T-38']

# Currently unused: descriptions of a problem, i.e. locations, temperature,
# respiratory rate.
# descriptors = ['T-6']

# Interventions, i.e. investigative scans, procedures, drugs.
interventions = [
    'T-9',
    'T-26',
    'T-27',
    'T-39',
    'T-40',
    'T-55',
]

# Every type of interest: problems followed by interventions.
all_types = [*problem_types, *interventions]

In [6]:
def filter_ents(row):
    """Collect the entities of interest from one note row.

    An entity is kept when its meta-annotations have Subject == 'Patient',
    Presence == 'True' and Time == 'Recent', and at least one of its type IDs
    appears in the module-level ``all_types`` list.

    Args:
        row: mapping with an ``'ents'`` entry whose ``'entities'`` value is a
            dict of entity dicts.

    Returns:
        A list of dicts with keys ``pretty_name``, ``cui``, ``source_value``
        and ``types``, in the iteration order of the entities.
    """
    kept = []
    for entity in row['ents']['entities'].values():
        meta = entity['meta_anns']
        # Checks run in the order subject -> presence -> time, so later keys
        # are only read when the earlier checks pass.
        if meta['Subject']['value'] != 'Patient':
            continue
        if meta['Presence']['value'] != 'True':
            continue
        if meta['Time']['value'] != 'Recent':
            continue
        if not any(type_id in all_types for type_id in entity['type_ids']):
            continue
        kept.append({
            'pretty_name': entity['pretty_name'],
            'cui': entity['cui'],
            'source_value': entity['source_value'],
            'types': entity['type_ids'],
        })
    return kept

In [7]:
# Row-wise: keep only the entities of interest for every source note.
src_notes['filtered_ents'] = src_notes.apply(filter_ents, axis=1)

In [8]:
# Row-wise: keep only the entities of interest for every dis-course note.
dis_course_notes['filtered_ents'] = dis_course_notes.apply(filter_ents, axis=1)

In [None]:
# Load the MedCAT model pack; `cat.cdb` is read below for per-CUI context
# vectors and preferred names.
cat = CAT.load_model_pack('../../mc_modelpack_phase2_snomed_190k_october_2021.zip')

In [10]:
def ents_embed(ents, disorders_only=False, problems_only=False):
    """Stack the 'long' context vectors of the given entities.

    Args:
        ents: list of entity dicts as produced by ``filter_ents``; the
            ``'cui'`` key is always read, ``'types'`` only when a type
            restriction is requested.
        disorders_only: keep only entities with a type ID in ``disorder``.
        problems_only: keep only entities with a type ID in ``problem_types``.
            Has no extra effect when ``disorders_only`` is also set, because
            the disorder types are the narrower selection.

    Returns:
        ``np.ndarray`` with one entry per retained entity that has a 'long'
        context vector in ``cat.cdb.cui2context_vectors``; an empty array if
        nothing qualifies.
    """
    # Previously both flags were accepted but ignored, so every call returned
    # the embeddings of all entities.
    if disorders_only:
        allowed_types = disorder
    elif problems_only:
        allowed_types = problem_types
    else:
        allowed_types = None  # no type restriction

    context_vectors = cat.cdb.cui2context_vectors
    arr = []
    for e in ents:
        if allowed_types is not None and not any(t in allowed_types for t in e['types']):
            continue
        # A CUI may be absent from the CDB or have vectors but no 'long' one;
        # skip those instead of raising KeyError.
        vec = context_vectors.get(e['cui'], {}).get('long')
        if vec is not None:
            arr.append(vec)
    return np.array(arr)

In [11]:
# Embed each note's filtered entities three ways: disorders only, all problem
# types, and every retained entity. Looping over both frames keeps the column
# names in sync; the hand-copied version wrote the dis-course disorders-only
# result to 'ents_embed_problems', where the next line overwrote it, so
# 'ents_embed_disorders' never existed on dis_course_notes.
for notes in (src_notes, dis_course_notes):
    notes['ents_embed_disorders'] = notes['filtered_ents'].apply(ents_embed, disorders_only=True)
    notes['ents_embed_problems'] = notes['filtered_ents'].apply(ents_embed, problems_only=True)
    notes['ents_embed_all'] = notes['filtered_ents'].apply(ents_embed)

In [None]:
# Inspect the (CUI, name) pairs retained for the first source note.
[(e['cui'], e['pretty_name']) for e in src_notes.iloc[0]['filtered_ents']]

In [None]:
# Inspect the (CUI, name) pairs retained for the first dis-course note.
[(e['cui'], e['pretty_name']) for e in dis_course_notes.iloc[0]['filtered_ents']]

In [None]:
# prompt engineering with n-sent extractive models at the top.... 

In [None]:
# sequence order of the condition implies the importance... two different encoders...

In [None]:
# Names of all (unfiltered) entities of the source note at index label 0,
# keeping only the text before the first '('.
[v['pretty_name'].split('(')[0] for v in src_notes.ents[0]['entities'].values()]

In [None]:
# Names of all (unfiltered) entities of the dis-course note at index label 0,
# keeping only the text before the first '('.
[v['pretty_name'].split('(')[0] for v in dis_course_notes.ents[0]['entities'].values()]

In [132]:
# Sanity check: (rows, columns) of the source-note frame.
src_notes.shape

(200, 7)

In [None]:
# cat.cdb.cui2context_vectors:
# Vocab - all cuis input...
# don't need an index mapping from word to an index..

In [140]:
# seq2seq model for problem list guidance... 
from concept_encoder_decoder import DecoderRNN, EncoderRNN, \
        train_epochs, train_loop, tensorFromSentence, evaluate

In [15]:
# Unique concept names across all source notes (a set, despite the name).
# A set comprehension replaces the pairwise reduce, which returned a plain list
# for a single-row column and raised TypeError on an empty one.
src_names_list = {e['pretty_name'] for ents in src_notes['filtered_ents'] for e in ents}

In [16]:
# Unique concept names across all dis-course notes (a set, despite the name).
# A set comprehension replaces the pairwise reduce, which returned a plain list
# for a single-row column and raised TypeError on an empty one.
bhc_names_list = {e['pretty_name'] for ents in dis_course_notes['filtered_ents'] for e in ents}

In [17]:
# Build the concept vocabulary from every name seen in source or target notes.
# NOTE(review): `ConceptLang` is not imported in any visible cell — presumably
# it lives in `concept_encoder_decoder`; confirm and add it to that import.
lang = ConceptLang()
# Iterate in sorted order: set iteration order for strings changes between
# kernel restarts (hash randomisation), which would make the word order fed to
# `addWord` — and presumably the resulting vocabulary indices — irreproducible.
for c in sorted(src_names_list | bhc_names_list):
    lang.addWord(c)

In [45]:
# Convert each source note's concept names into a tensor via the vocabulary.
src_notes_tensors = src_notes['filtered_ents'].apply(lambda ents: tensorFromSentence(lang, [e['pretty_name'] for e in ents]))

In [46]:
# Convert each dis-course note's concept names into a tensor via the vocabulary.
bhc_notes_tensors = dis_course_notes['filtered_ents'].apply(lambda ents: tensorFromSentence(lang, [e['pretty_name'] for e in ents]))

In [54]:
# Cap every concept sequence at MAX_CONCEPTS entries. Slicing is a no-op on
# shorter sequences, so no explicit length check is needed, and the limit now
# lives in one named constant instead of four literals.
# NOTE(review): if `tensorFromSentence` appends an end-of-sequence token, this
# truncation drops it for long sequences — confirm that is acceptable.
MAX_CONCEPTS = 100
src_notes_tensors = [t[:MAX_CONCEPTS] for t in src_notes_tensors]
bhc_notes_tensors = [t[:MAX_CONCEPTS] for t in bhc_notes_tensors]

In [101]:
# Shape of the first (truncated) source tensor.
src_notes_tensors[0].shape

torch.Size([100, 1])

# Train Example Model

In [26]:
# Model size and construction of the encoder/decoder pair over the concept
# vocabulary.
hidden_size = 256
# `device` was not defined in any visible cell, so a fresh kernel raised
# NameError on the lines below.
# NOTE(review): confirm this matches the device used inside
# `concept_encoder_decoder`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder = EncoderRNN(lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, lang.n_words, dropout=0.1).to(device)

In [55]:
# Length of the first source sequence after truncation.
len(src_notes_tensors[0])

100

In [89]:
# Display the encoder's module structure.
encoder

EncoderRNN(
  (embedding): Embedding(2234, 256)
  (gru): GRU(256, 256)
)

In [None]:
# Train the encoder/decoder on (source, target) concept sequences for 20 epochs.
# NOTE(review): the recorded output shows 15 values, presumably one loss per
# epoch from an interrupted run — confirm.
train_epochs(encoder, decoder, src_notes_tensors, bhc_notes_tensors, epochs=20)

2.5088
2.4286
2.3255
2.2371
1.9959
1.7422
1.7516
1.5216
1.3634
1.2719
1.1416
1.0051
0.8694
0.7247
0.5942


In [142]:
# NOTE(review): the recorded run of this cell ended in
# "RuntimeError: ... Expected 256, got 25600". 25600 = 100 * hidden_size, which
# suggests a whole 100-token sequence reached the GRU at once — presumably
# `evaluate` expects a single example rather than the full lists; verify
# against its signature in `concept_encoder_decoder`.
evaluate(encoder, decoder, src_notes_tensors, bhc_notes_tensors)

10


RuntimeError: input.size(-1) must be equal to input_size. Expected 256, got 25600

In [122]:
# Unique CUIs across all source notes (a set, despite the name).
# A set comprehension replaces the pairwise reduce, which returned a plain list
# for a single-row column and raised TypeError on an empty one.
cuis_list = {e['cui'] for ents in src_notes['filtered_ents'] for e in ents}

In [123]:
# CUIs from the source notes that have no 'long' context vector, paired with
# their preferred name and whatever context vectors they do have.
cui_vecs = [(c, cat.cdb.cui2preferred_name[c], cat.cdb.cui2context_vectors[c]) for c in cuis_list if 'long' not in cat.cdb.cui2context_vectors[c]]

In [124]:
# Distinct CUIs of the 'T-11' entities in one source note.
# NOTE(review): this reads index label 1, not 0, despite the "first_row" name —
# confirm which row is intended.
first_row_cuis = {
    ent['cui']
    for ent in src_notes.filtered_ents[1]
    if 'T-11' in ent['types']
}

In [26]:
# Map each selected CUI to its dict of context vectors from the CDB.
cui2vec = {c: cat.cdb.cui2context_vectors[c] for c in first_row_cuis}

In [None]:
# Inspect the context vectors stored for one of the selected CUIs (which one is
# arbitrary, since it comes from a set).
cat.cdb.cui2context_vectors[list(first_row_cuis)[0]]