## Medical Concept Word Embedding Retrieval (OpenAI)

In [1]:
import csv
from pyhealth.medcode.pretrained_embeddings.lm_emb.openai_retriever import embedding_retrieve as embedding_retriever
import numpy as np
from tqdm import tqdm
import pickle
import json
import retrying

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
@retrying.retry(
    stop_max_attempt_number=5000,
    # Back off between attempts (1s multiplier, capped at 10s) so a
    # rate-limited or temporarily unavailable API is not hit in a tight loop.
    wait_exponential_multiplier=1000,
    wait_exponential_max=10000,
)
def retrieve_embedding(term):
    """Return the embedding of ``term``, retrying when the lookup fails.

    Thin wrapper around ``embedding_retriever`` that re-issues the call when
    it raises, up to 5000 attempts, waiting an exponentially growing
    (capped) interval between attempts.

    Args:
        term: Text to embed; forwarded unchanged to ``embedding_retriever``.

    Returns:
        Whatever ``embedding_retriever(term)`` returns.
    """
    return embedding_retriever(term)

### Special Tokens

In [2]:
# Embed every special token and persist the token -> embedding map as JSON.
special_tokens = ["<pad>", "<unk>"]

st_id2emb = {
    tok: embedding_retriever(term=tok)
    for tok in tqdm(special_tokens)
}

with open("../resource/LM/special_tokens/special_tokens.json", "w") as out_file:
    json.dump(st_id2emb, out_file)

100%|██████████| 2/2 [00:01<00:00,  1.78it/s]


### CCSCM

In [4]:
# Build the CCSCM code -> lowercase name table from the CSV (column 0 is the
# code, column 1 the name). csv.reader is used instead of a plain
# split(',') so that a quoted name containing a comma stays in one field.
ccscm_id2name = {}
with open('../resource/CCSCM.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for row in reader:
        if not row:
            # Blank line (e.g. trailing newline at end of file).
            continue
        ccscm_id2name[row[0].strip()] = row[1].strip().lower()

# Retrieve one embedding per code, keyed by the CCSCM code.
ccscm_id2emb = {}
for key, name in tqdm(ccscm_id2name.items()):
    ccscm_id2emb[key] = embedding_retriever(term=name)

with open("../resource/embeddings/LM/conditions/ccscm.json", "w") as f:
    json.dump(ccscm_id2emb, f)

### CCSPROC

In [5]:
# Build the CCSPROC code -> lowercase name table from the CSV (column 0 is
# the code, column 1 the name). csv.reader is used instead of a plain
# split(',') so that a quoted name containing a comma stays in one field.
ccsproc_id2name = {}
with open('../resource/CCSPROC.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for row in reader:
        if not row:
            # Blank line (e.g. trailing newline at end of file).
            continue
        ccsproc_id2name[row[0].strip()] = row[1].strip().lower()

# Retrieve one embedding per code, keyed by the CCSPROC code.
ccsproc_id2emb = {}
for key, name in tqdm(ccsproc_id2name.items()):
    ccsproc_id2emb[key] = embedding_retriever(term=name)

with open("../resource/embeddings/LM/procedures/ccsproc.json", "w") as f:
    json.dump(ccsproc_id2emb, f)

100%|██████████| 231/231 [00:42<00:00,  5.38it/s]


### ATC

In [11]:
# Build the ATC code -> lowercase name table from the 'code' and 'name'
# columns. Every row is kept; to restrict the table to a single level,
# filter on the 'level' column here (e.g. row['level'] == '3.0').
atc_id2name = {}
with open("../resource/ATC.csv", newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        atc_id2name[row['code']] = row['name'].lower()

# Retrieve one embedding per code through the retrying wrapper, keyed by the
# ATC code.
atc_id2emb = {}
for key, name in tqdm(atc_id2name.items()):
    atc_id2emb[key] = retrieve_embedding(term=name)

with open("../resource/embeddings/LM/drugs/atc.json", "w") as f:
    json.dump(atc_id2emb, f)

100%|██████████| 6440/6440 [25:17<00:00,  4.24it/s]   


### ICD9CM

In [12]:
from pyhealth.medcode import ICD9CM

# Build the ICD9CM code -> lowercase name table from the CSV (column 0 is the
# code, column 2 the name). csv.reader is used instead of a plain split(',')
# because a quoted description that contains a comma would otherwise be cut
# at the first comma and embedded as a truncated phrase.
icd9cm_id2name = {}
with open('../resource/ICD9CM.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for row in reader:
        if not row:
            # Blank line (e.g. trailing newline at end of file).
            continue
        icd9cm_id2name[row[0].strip()] = row[2].strip().lower()

# Retrieve one embedding per code through the retrying wrapper. Output keys
# are the standardized code with the '.' removed.
icd9cm_id2emb = {}
for key, name in tqdm(icd9cm_id2name.items()):
    icd9cm_id2emb[ICD9CM.standardize(key).replace('.', '')] = retrieve_embedding(term=name)

with open("../resource/embeddings/LM/conditions/icd9cm.json", "w") as f:
    json.dump(icd9cm_id2emb, f)

100%|██████████| 17736/17736 [52:29<00:00,  5.63it/s]  


### ICD9PROC

In [13]:
from pyhealth.medcode import ICD9PROC

# Build the ICD9PROC code -> lowercase name table from the CSV (column 0 is
# the code, column 2 the name). csv.reader is used instead of a plain
# split(',') because a quoted description that contains a comma would
# otherwise be cut at the first comma and embedded as a truncated phrase.
icd9proc_id2name = {}
with open('../resource/ICD9PROC.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for row in reader:
        if not row:
            # Blank line (e.g. trailing newline at end of file).
            continue
        icd9proc_id2name[row[0].strip()] = row[2].strip().lower()

# Retrieve one embedding per code through the retrying wrapper. Output keys
# are the standardized code with the '.' removed.
icd9proc_id2emb = {}
for key, name in tqdm(icd9proc_id2name.items()):
    icd9proc_id2emb[ICD9PROC.standardize(key).replace('.', '')] = retrieve_embedding(term=name)

with open("../resource/embeddings/LM/procedures/icd9proc.json", "w") as f:
    json.dump(icd9proc_id2emb, f)

100%|██████████| 4670/4670 [44:28<00:00,  1.75it/s]  


In [29]:
# Rewrite the saved ICD9CM embedding file in place with every '.' removed
# from its keys; the embedding values are carried over untouched.
with open("../resource/embeddings/LM/conditions/icd9cm.json", "r") as src:
    icd9cm_id2emb = json.load(src)

icd9cm_id2emb_new = {
    code.replace('.', ''): vector
    for code, vector in icd9cm_id2emb.items()
}

with open("../resource/embeddings/LM/conditions/icd9cm.json", "w") as dst:
    json.dump(icd9cm_id2emb_new, dst)

In [2]:
import json

# Rewrite the saved ICD9PROC embedding file in place: strip '.' from every
# key, then add two alias entries that reuse existing embeddings.
# NOTE(review): this reads and writes the copy under the 'gpt3/' directory,
# not '../resource/embeddings/LM/procedures/icd9proc.json' - confirm this is
# the intended file.
with open("../resource/embeddings/LM/gpt3/procedures/icd9proc.json", "r") as f:
    icd9proc_id2emb = json.load(f)

icd9proc_id2emb_new = {
    key.replace('.', ''): value
    for key, value in icd9proc_id2emb.items()
}

# The aliases do not depend on the loop variable, so they are assigned once
# here rather than on every iteration. They are set after the key
# normalisation so they always take precedence over a same-named key.
# An empty input gets no aliases and is written back as an empty mapping.
if icd9proc_id2emb:
    # '3605' reuses the embedding stored under '0066' and '3602' the one
    # stored under '36' (presumably codes absent from the source table -
    # TODO confirm).
    icd9proc_id2emb_new['3605'] = icd9proc_id2emb['0066']
    icd9proc_id2emb_new['3602'] = icd9proc_id2emb['36']

with open("../resource/embeddings/LM/gpt3/procedures/icd9proc.json", "w") as f:
    json.dump(icd9proc_id2emb_new, f)