In [4]:
import os
import pandas as pd
import pickle
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


# Fix one shared seed for every random number generator this notebook touches.
seed = 24
# Also exported, as a string, through the PYTHONHASHSEED environment variable.
os.environ["PYTHONHASHSEED"] = str(seed)
# Python's `random`, NumPy's global RNG and torch's default generator all get the same value.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)

In [5]:
import torch
# NOTE(review): this cell re-seeds torch (42) and NumPy (0) with values that differ from
# `seed = 24` applied in the previous cell. When the cells run top to bottom, these later
# calls determine the torch and NumPy seeds in effect; Python's `random` is not re-seeded
# here. Confirm which seed values are actually intended.
torch.manual_seed(42)
np.random.seed(0)


In [6]:
# NOTE(review): hard-coded absolute path to one user's checkout. The relative
# `data_path` used by later cells is resolved against this working directory.
%cd /home/trishad2/PyTrial/

/home/trishad2/PyTrial


In [9]:
# Directory prefix for the input CSV and for the pickled embedding dictionaries written
# by later cells. File names are appended with plain string concatenation, so the
# trailing slash is required.
data_path = 'lung/data/'

In [10]:
# Read the full dataset; the first CSV column becomes the row index.
data = pd.read_csv(data_path + 'full_datav3.csv', index_col=0)


def _cols_with_prefix(prefix):
    """Return the names of the columns in `data` that start with `prefix`, in column order."""
    return [column for column in data.columns if column.startswith(prefix)]


# Column groups are identified purely by their name prefix.
ae_cols = _cols_with_prefix('AE_')
med_cols = _cols_with_prefix('CM_')
treatment_cols = _cols_with_prefix('Treatment_')

# Feature order is treatments, then medications, then AEs; vocab_size follows the same order.
feature_cols = [*treatment_cols, *med_cols, *ae_cols]
vocab_size = [len(group) for group in (treatment_cols, med_cols, ae_cols)]
vocab_size

[3, 13, 83]

In [11]:
def _names_by_index(columns):
    """Map each column's position to its name with the leading '<prefix>_' removed.

    Only the first underscore is treated as the prefix separator (`maxsplit=1`), so a
    name that itself contains underscores is kept whole instead of being truncated at
    its first internal underscore.
    """
    return {index: column.split('_', 1)[1] for index, column in enumerate(columns)}


# Integer id -> display name for each feature group; ids follow the column order.
treatment_id_dict = _names_by_index(treatment_cols)
med_id_dict = _names_by_index(med_cols)
ae_id_dict = _names_by_index(ae_cols)

In [12]:
from transformers import BertModel, BertTokenizer



# Load the BioBERT encoder and its tokenizer from one shared checkpoint id,
# so the two can never point at different checkpoints.
biobert_checkpoint = 'dmis-lab/biobert-base-cased-v1.1'
model = BertModel.from_pretrained(biobert_checkpoint)
tokenizer = BertTokenizer.from_pretrained(biobert_checkpoint)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Display the index -> medication-name mapping built from the 'CM_' columns.
# NOTE(review): the mapping includes a literal 'nan' entry, presumably a missing-value
# indicator column; it is embedded like any other name by the cells below. Verify
# that is intended.
med_id_dict

{0: 'APREPITANT',
 1: 'DEXAMETHASONE',
 2: 'FUROSEMIDE',
 3: 'GRANISETRON',
 4: 'MANNITOL',
 5: 'METHYLPREDNISOLONE SODIUM SUCCINATE',
 6: 'METOCLOPRAMIDE',
 7: 'ONDANSETRON',
 8: 'ONDANSETRON HYDROCHLORIDE',
 9: 'PALONOSETRON',
 10: 'PREDNISOLONE',
 11: 'SODIUM CHLORIDE',
 12: 'nan'}

In [15]:
# Create a new dictionary for embeddings: medication code -> BioBERT embedding of its name
medical_codes_embeddings = {}

# Encode names and get word embeddings.
# This is inference only: keep the encoder in eval mode and run it without autograd,
# so the stored tensors do not carry requires_grad or keep a computation graph alive
# (the graph would otherwise stay in memory and the tensors get pickled as trainable).
model.eval()
with torch.no_grad():
    for code, name in med_id_dict.items():
        # NOTE(review): ids are built from the raw word pieces only; no special tokens
        # are added by these two calls. Confirm that is the intended input format.
        tokens = tokenizer.tokenize(name)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Batch of one sequence.
        outputs = model(torch.tensor([input_ids]))
        # Average over the token dimension (dim=1) to get one vector per name;
        # the leading batch dimension of size 1 is kept.
        word_embedding = outputs.last_hidden_state.mean(dim=1)
        medical_codes_embeddings[code] = word_embedding



In [16]:
# AE code -> BioBERT embedding of the AE name
ae_codes_embeddings = {}

# Encode names and get word embeddings.
# This is inference only: keep the encoder in eval mode and run it without autograd,
# so the stored tensors do not carry requires_grad or keep a computation graph alive
# (the graph would otherwise stay in memory and the tensors get pickled as trainable).
model.eval()
with torch.no_grad():
    for code, name in ae_id_dict.items():
        # NOTE(review): ids are built from the raw word pieces only; no special tokens
        # are added by these two calls. Confirm that is the intended input format.
        tokens = tokenizer.tokenize(name)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Batch of one sequence.
        outputs = model(torch.tensor([input_ids]))
        # Average over the token dimension (dim=1) to get one vector per name;
        # the leading batch dimension of size 1 is kept.
        word_embedding = outputs.last_hidden_state.mean(dim=1)
        ae_codes_embeddings[code] = word_embedding


In [20]:
# Persist the medication embedding dictionary next to the input data.
with open(data_path + 'med_emb_dict.pickle', 'wb') as out_file:
    pickle.dump(medical_codes_embeddings, out_file, pickle.HIGHEST_PROTOCOL)

In [21]:
# Persist the AE embedding dictionary next to the input data.
with open(data_path + 'ae_emb_dict.pickle', 'wb') as out_file:
    pickle.dump(ae_codes_embeddings, out_file, pickle.HIGHEST_PROTOCOL)

In [18]:
# Display the index -> treatment-name mapping built from the 'Treatment_' columns.
treatment_id_dict

{0: 'CISPLATIN', 1: 'COMBINED THERAPY', 2: 'GEMCITABINE'}

In [19]:
# Treatment NAME -> BioBERT embedding of that name.
# NOTE(review): unlike the medication and AE dictionaries, which are keyed by integer
# code, this one is keyed by the treatment name. The keying is left unchanged because
# consumers of the pickled file may rely on it; confirm whether it is intentional.
treatment_codes_embeddings = {}

# Encode names and get word embeddings.
# This is inference only: keep the encoder in eval mode and run it without autograd,
# so the stored tensors do not carry requires_grad or keep a computation graph alive
# (the graph would otherwise stay in memory and the tensors get pickled as trainable).
model.eval()
with torch.no_grad():
    for code, name in treatment_id_dict.items():
        # NOTE(review): ids are built from the raw word pieces only; no special tokens
        # are added by these two calls. Confirm that is the intended input format.
        tokens = tokenizer.tokenize(name)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Batch of one sequence.
        outputs = model(torch.tensor([input_ids]))
        # Average over the token dimension (dim=1) to get one vector per name;
        # the leading batch dimension of size 1 is kept.
        word_embedding = outputs.last_hidden_state.mean(dim=1)
        treatment_codes_embeddings[name] = word_embedding


In [22]:
# Persist the treatment embedding dictionary next to the input data.
with open(data_path + 'treatment_emb_dict.pickle', 'wb') as out_file:
    pickle.dump(treatment_codes_embeddings, out_file, pickle.HIGHEST_PROTOCOL)