In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize
# Tokenizer data required by `word_tokenize`. Newer NLTK releases (3.9+) look
# up the 'punkt_tab' resource instead of the pickled 'punkt' models, so both
# are requested to keep the notebook runnable across NLTK versions.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Expose only GPUs 1 and 2 to CUDA.
# NOTE(review): this runs after `import tensorflow`; it only takes effect if no
# GPU has been initialised yet in this kernel — confirm, or move it above the
# TensorFlow import.
os.environ["CUDA_VISIBLE_DEVICES"] = '1,2'

## Load the descriptions of the ICD and ATC codes

In [None]:
# Description tables for ICD-10 codes, ICD-9 codes and ATC codes.
# The code columns are forced to `str`: later cells slice the ICD codes with
# `[:3].upper()`, which fails on integers, and dtype inference would also drop
# leading zeros from purely numeric codes.
icd10 = pd.read_csv('ICD_10_Desc.csv', dtype={'CODE': str})
icd9 = pd.read_csv('ICD_9_Desc.csv', dtype={'CODE': str})
meddesc = pd.read_csv('atc_description.csv', dtype={'Code': str})

In [None]:
# Names of the columns holding the long free-text description in each ICD table.
ICD10_DESC_COL = 'LONG DESCRIPTION (VALID ICD-10 FY2024)'
ICD9_DESC_COL = 'LONG DESCRIPTION (VALID ICD-9 FY2024)'


def _add_description_vectors(desc_df, desc_col, model, known_codes, vectors_by_code):
    """Infer a Doc2Vec vector for each described row and group it by 3-character code.

    Rows whose description is null are skipped. The value in the ``CODE``
    column is cut to its first three characters and upper-cased; a row is
    embedded only if that shortened code is contained in ``known_codes``.

    Args:
        desc_df: DataFrame with a ``CODE`` column and the column ``desc_col``.
        desc_col: Name of the column holding the description text.
        model: Trained Doc2Vec model whose ``infer_vector`` is called.
        known_codes: Container tested with ``in`` to decide which codes to keep.
        vectors_by_code: Dict mapping code -> list of vectors; updated in place.
    """
    for raw_code, sentence in zip(desc_df['CODE'], desc_df[desc_col]):
        if pd.isnull(sentence):
            continue
        code = raw_code[:3].upper()
        if code in known_codes:
            vector = model.infer_vector(word_tokenize(sentence))
            vectors_by_code.setdefault(code, []).append(vector)


# Training corpus: all ICD-10 descriptions followed by all ICD-9 descriptions.
allsentences = icd10[ICD10_DESC_COL].tolist() + icd9[ICD9_DESC_COL].tolist()

# Tokenise every non-null description; each token list becomes one document.
dxtokens = [word_tokenize(s) for s in allsentences if not pd.isnull(s)]

dx_tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(dxtokens)]
dxmodel = Doc2Vec(dx_tagged_data, vector_size = 32, window = 2, min_count = 1, epochs = 50)

# NOTE(review): `dxdict` is not defined anywhere in the visible notebook, so a
# fresh kernel raises NameError here. It is only used for membership tests on
# 3-character codes — define or load it in an earlier cell.
diagnosis_desc_dict = {}
_add_description_vectors(icd10, ICD10_DESC_COL, dxmodel, dxdict, diagnosis_desc_dict)
_add_description_vectors(icd9, ICD9_DESC_COL, dxmodel, dxdict, diagnosis_desc_dict)

# Several full codes share one 3-character prefix: represent each prefix by
# the element-wise mean of the vectors collected for it.
diagnosis_desc_dict = {
    code: np.mean(vectors, axis=0)
    for code, vectors in diagnosis_desc_dict.items()
}

len(diagnosis_desc_dict)


In [None]:
# Every ATC description in file order (null entries included), plus its count.
medsentences = meddesc['Description'].tolist()
print(len(medsentences))

# Tokenise the non-null descriptions; each token list is one training document.
medtokens = [
    word_tokenize(sentence)
    for sentence in medsentences
    if not pd.isnull(sentence)
]

# Tag each document with its position in `medtokens`.
med_tagged_data = [TaggedDocument(words, [tag]) for tag, words in enumerate(medtokens)]

medmodel = Doc2Vec(med_tagged_data, vector_size=32, window=2, min_count=1, epochs=50)

# Infer one vector per ATC code that has a description and is present in
# `meddict`; a code appearing on several rows keeps the vector of its last row.
# NOTE(review): `meddict` is not defined in the visible notebook — it has to
# exist in the kernel before this cell runs.
med_desc_dict = {}
for atc_code, description in zip(meddesc['Code'], meddesc['Description']):
    if pd.isnull(description) or atc_code not in meddict:
        continue
    med_desc_dict[atc_code] = medmodel.infer_vector(word_tokenize(description))

len(med_desc_dict)

## Output the Doc2Vec embeddings for ICD and ATC codes

In [None]:
# Persist both embedding dictionaries. The `with` blocks close each file
# deterministically, so the pickles are fully flushed to disk even if a later
# statement fails (the bare `open(...)` handles were never closed).
with open('diagnosis_desc_dict.pkl', 'wb') as dx_file:
    pickle.dump(diagnosis_desc_dict, dx_file)
with open('med_desc_dict.pkl', 'wb') as med_file:
    pickle.dump(med_desc_dict, med_file)