# Data Loading and Preprocessing

### Imports: 
1. VO_RXN - All VO concepts with a RxNorm annotation
2. RX_DF_ALL - All *vaccine related* concepts within RxNorm
3. VO_DF_FULL - All VO concepts under Vaccine and Vaccine Component subgroups (includes both with and without RxNorm annotations)
4. VO_DF_APPLY - All VO concepts that do not contain RxNorm annotation

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import pickle

In [140]:
vo_columns = {'ID':'str', 'VO_STR':'str', 'RXN':'Int64'}
rx_columns = {'RXN':'Int64', 'RX_STR':'str','TTY':'category'}

VO_RXN = pd.read_csv('VO_RXN.csv',header=0, names = vo_columns.keys(),dtype=vo_columns)

RX_DF = pd.read_csv('2024_02_05_RXN_Concepts.csv', names=rx_columns.keys(), dtype=rx_columns)

RX_DF = RX_DF[~RX_DF['TTY'].isin(['PSN','SY'])]

RX_DF_ALL = pd.read_csv('RXN_Concepts.csv',header=0, names=rx_columns.keys(), dtype=rx_columns)

In [141]:
RX_DF_ALL

Unnamed: 0,RXN,RX_STR,TTY
0,7288,Neisseria meningitidis,IN
1,8080,pertussis vaccine,IN
2,29501,meningococcal group A polysaccharide,IN
3,29503,meningococcal group C polysaccharide,IN
4,50937,Haemophilus influenzae type b,IN
...,...,...,...
2301,2630660,0.5 ML influenza A virus A/Darwin/9/2021 (H3N2...,SBD
2302,2630661,influenza A virus A/Darwin/9/2021 (H3N2) antig...,SBD
2303,2630662,influenza A virus A/Darwin/9/2021 (H3N2) antig...,SCD
2304,2630663,influenza A virus (H1N1) antigen / influenza A...,SBDF


In [142]:
# VO_DF_FULL = pd.read_csv('VO_DF_FULL.csv',header=0, names=vo_coluRXNmns.keys(), dtype=vo_columns)
# VO_DF_FULL

In [143]:
VO_DF_FULL = pd.read_csv('VO_DF_FULL.csv')
VO_DF_FULL = VO_DF_FULL[['ID','Label','RXN']]
VO_DF_FULL.columns = vo_columns.keys()
# VO_DF_FULL = VO_DF_FULL.astype(vo_columns, errors='ignore')
# VO_DF_FULL.dtypes

In [144]:
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024_02_05_RXN_Concepts.csv  RXN_Concepts.csv	   VO_DF_FULL_TRIMMED.csv
ALL_UMLS_CONCEPTS.csv	     UMLS_CONCEPTS.csv	   VO_RXN[archived].ipynb
models			     UMLS_Vac_Terms.ipynb  VO_RXN.csv
outputs			     VO_DF_FULL.csv	   VO_RXN.ipynb


In [145]:
VO_DF_FULL = pd.read_csv('VO_DF_FULL_TRIMMED.csv', header=0, names=vo_columns.keys(), dtype=vo_columns)
VO_DF_FULL.dtypes

ID        object
VO_STR    object
RXN        Int64
dtype: object

In [146]:
VO_DF_FULL['RXN'] = pd.to_numeric(VO_DF_FULL['RXN'], errors = 'coerce')

In [147]:
VO_DF_FULL

Unnamed: 0,ID,VO_STR,RXN
0,VO_0001854,Feline Rhinotracheitis-Calici-Panleukopenia Ki...,
1,VO_0003859,Imovax Injectable Product,1659733
2,VO_0002354,Bovine Rhinotracheitis-Virus Diarrhea-Parainfl...,
3,VO_0001476,sheep vaccine,
4,VO_0004363,M. tuberculosis DNA vaccine (containing the ES...,
...,...,...,...
5763,VO_0001052,poliovirus vaccine vector,
5764,VO_0003346,Streptococcus pneumoniae serotype 9V capsular ...,798232
5765,VO_0015016,"Haemophilus influenzae type b, capsular polysa...",1658069
5766,VO_0000099,Pcaggs DNA vaccine plasmid,


In [148]:
# VO_DF_FULL.to_csv('VO_DF_FULL_TRIMMED.csv', index=False)

In [149]:
#Testing VO_DF_FULL 
VO_DF_FULL[VO_DF_FULL['ID']=='VO_0003383']

Unnamed: 0,ID,VO_STR,RXN
2142,VO_0003383,"diphtheria toxoid vaccine, inactivated 5 UNT/ML",798305
5509,VO_0003383,"diphtheria toxoid vaccine, inactivated 5 UNT/ML",798305


In [150]:
VO_DF_FULL = VO_DF_FULL[VO_DF_FULL['RXN']!=1005930]

In [151]:
RX_DF_ALL = RX_DF_ALL[RX_DF_ALL['RXN'].isin(VO_DF_FULL['RXN'])]
RX_DF_ALL

Unnamed: 0,RXN,RX_STR,TTY
2,29501,meningococcal group A polysaccharide,IN
3,29503,meningococcal group C polysaccharide,IN
6,114883,Typhim VI,BN
7,139056,COMVAX,BN
11,203439,Havrix,BN
...,...,...,...
1806,2054182,Flumist Quadrivalent 2018-2019,BN
1808,2054270,influenza A virus A/Singapore/INFIMH-16-0019/2...,SCDC
1809,2054271,influenza B virus B/Colorado/06/2017 antigen 1...,SCDC
1811,2054273,influenza A virus A/Singapore/INFIMH-16-0019/2...,SBDC


In [152]:
RX_DF_ALL[RX_DF_ALL['RXN']==2054182]
# 1005930

Unnamed: 0,RXN,RX_STR,TTY
1806,2054182,Flumist Quadrivalent 2018-2019,BN


### Preprocessing: 
The following data preprocessing steps will be conducted:

1. Generate embeddings for VO concepts in VO_DF_FULL
2. Generate embeddings for RxNorm concepts in RX_DF_ALL
3. Convert embeddings dataframe to a lookup dictionary


**NOTE:** There are some RxNorm concepts that have been remapped and such cannot be retrieved. These will be excluded.

**ToDo:** Update remapped/obsolete concepts to their most recent concept

In [153]:
torch.cuda.device_count()

8

In [154]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

In [155]:
# Check if GPU is available
if torch.cuda.is_available():
    # Set the device to GPU
    device = torch.device('cuda:2')
    print('Using GPU:', torch.cuda.get_device_name(device))
else:
    device = torch.device('cpu')
    print('GPU not available, using CPU.')

Using GPU: NVIDIA A100-SXM4-80GB


In [156]:
st_model = SentenceTransformer('tavakolih/all-MiniLM-L6-v2-pubmed-full', device = device)

In [157]:
#Sentence Embedding and preprocessing
VO_DF_FULL['VO_EMB'] = VO_DF_FULL['VO_STR'].map(lambda x: st_model.encode(x,device=device))
RX_DF_ALL['RX_EMB'] = RX_DF_ALL['RX_STR'].map(lambda x: st_model.encode(x,device=device))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VO_DF_FULL['VO_EMB'] = VO_DF_FULL['VO_STR'].map(lambda x: st_model.encode(x,device=device))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RX_DF_ALL['RX_EMB'] = RX_DF_ALL['RX_STR'].map(lambda x: st_model.encode(x,device=device))


In [158]:
del st_model
torch.cuda.empty_cache()

In [159]:
#Create lookup dictionaries for VO_EMB and RX_EMB
VO_LOOKUP = dict(zip(VO_DF_FULL['ID'], VO_DF_FULL['VO_EMB']))
RX_LOOKUP = dict(zip(RX_DF_ALL['RXN'], RX_DF_ALL['RX_EMB']))

In [160]:
# tl = [VO_LOOKUP['VO_0001854'],VO_LOOKUP['VO_0001854']]
# sum([len(x) for x in tl])

In [161]:
#Create lookup dictionaries for VO and RX Labels
VO_LOOKUP_STR = dict(zip(VO_DF_FULL['ID'], VO_DF_FULL['VO_STR']))
RX_LOOKUP_STR = dict(zip(RX_DF_ALL['RXN'], RX_DF_ALL['RX_STR']))

In [162]:
VO_DF_APPLY = VO_DF_FULL[VO_DF_FULL['RXN'].isna()]

In [163]:
VO_DF_APPLY.dtypes

ID        object
VO_STR    object
RXN        Int64
VO_EMB    object
dtype: object

### Generation of unmapped pairs: 
    To generate unmapped, for each mapped concept pair (A,B), let unmapped pairs be (A,X) where X is all of Rxnorm which is not B.
    
1. VO_RXN_LIST - List of existing VO-Rx pairs
2. VO_LIST - Paired VO concepts
3. VO_APPLY_LIST - Unpaired VO concepts
4. RXN_LIST - All RxNorom concepts related to vaccines

Generation of unmapped pairs will be done as using *product* function of itertools package: 

1. NO_VO_RXN_LIST - Artificial unmapped pairs using paired VO concepts and RxNorm concepts that are NOT in known pairs (VO_RXN_LIST) - **TEST**
2. VO_RXN_APPLY_LIST - Possible apply set of VO concepts that do not have RxNorm annotations. Could contain possible mappings/pairings - **APPLY**
3. Converting lists in 1. and 2. above into dataframes (NO_VO_RXN_DF and VO_RXN_APPLY_DF)

In [164]:
VO_RXN = VO_RXN.drop_duplicates()

In [166]:
# Function to check if embeddings are identical
def check_identical(row):
    vo_emb = VO_LOOKUP.get(row['ID'])
    # rx_emb = RX_LOOKUP.get(row['RXN'].astype(str))
    rx_emb = RX_LOOKUP.get(row['RXN'])

    if vo_emb is not None and rx_emb is not None and np.array_equal(vo_emb, rx_emb):
        return 1
    else:
        return 0

In [167]:
# VO_RXN = VO_RXN.loc[VO_RXN['RXN'].astype(str).isin(RX_LOOKUP.keys())]
VO_RXN = VO_RXN.loc[VO_RXN['RXN'].isin(RX_LOOKUP.keys())]

In [168]:
VO_RXN.loc[:,'PAIRED'] = 1
VO_RXN.loc[:,'IDENTICAL'] = VO_RXN.apply(check_identical, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VO_RXN.loc[:,'PAIRED'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VO_RXN.loc[:,'IDENTICAL'] = VO_RXN.apply(check_identical, axis=1)


In [169]:
VO_RXN[['PAIRED','IDENTICAL']].astype(str).apply(pd.Series.value_counts)

Unnamed: 0,PAIRED,IDENTICAL
0,,285
1,868.0,583


In [170]:
VO_RXN[VO_RXN['ID']=='VO_0000003']

Unnamed: 0,ID,VO_STR,RXN,PAIRED,IDENTICAL
739,VO_0000003,ACAM2000,833083,1,1


In [171]:
# Removing identical concept pairs from VO_RXN_DF [TRAIN]
VO_RXN_DF= VO_RXN[VO_RXN['IDENTICAL']==0]

In [172]:
#Removing Obsolete concepts - ToDo: Find new mappings from RxNorm
VO_RXN_DF = VO_RXN_DF[VO_RXN_DF['RXN'].isin(RX_LOOKUP.keys())]

In [None]:
# VO_RXN_LIST = list(zip(VO_RXN['ID'], VO_RXN['RXN']))
# VO_LIST = list(VO_RXN['ID'])
# RXN_LIST_SMALL = list(VO_RXN['RXN'])
# VO_APPLY_LIST = list(VO_DF_APPLY['ID'])
# # RXN_LIST = list(VO_RXN['RXN'])
# RXN_LIST = list(RX_DF_ALL['RXN'])

In [184]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split

In [185]:
train_data, test_data_1 = train_test_split(VO_RXN_DF, test_size=0.3, random_state=42)

In [204]:
test_data_1

Unnamed: 0,ID,RXN,PAIRED
9,VO_0003380,1658239,1
249,VO_0003637,1801186,1
157,VO_0003686,1801163,1
209,VO_0003977,830556,1
75,VO_0003383,798305,1
...,...,...,...
124,VO_0003309,854957,1
216,VO_0003217,854934,1
84,VO_0003782,1657142,1
152,VO_0003799,1986830,1


In [213]:
VO_RXN_LIST = list(zip(VO_RXN_DF['ID'], VO_RXN_DF['RXN']))
VO_LIST = list(test_data_1['ID'])
RXN_LIST_SMALL = list(VO_RXN_DF['RXN'])
VO_APPLY_LIST = list(VO_DF_APPLY['ID'])
# RXN_LIST = list(VO_RXN['RXN'])
RXN_LIST = list(RX_DF_ALL['RXN'])

In [214]:
from itertools import product
# NO_VO_RXN_LIST = list(product(VO_LIST, RXN_LIST)) #This is the entire negative space. Too large
NO_VO_RXN_LIST = list(product(VO_LIST, RXN_LIST_SMALL))
VO_RXN_APPLY_LIST = list(product(VO_APPLY_LIST, RXN_LIST))
# print(len(NO_VO_RXN_LIST))
NO_VO_RXN_LIST = [x for x in NO_VO_RXN_LIST if x not in VO_RXN_LIST]

In [175]:
# len(NO_VO_RXN_LIST)
('VO_0003623', 1005930) in NO_VO_RXN_LIST

False

In [215]:
# len(VO_LIST)*len(RXN_LIST)
len(NO_VO_RXN_LIST)

24421

In [177]:
output_dir = 'outputs/'

In [216]:
import datetime
def cdt():
    return datetime.datetime.now().strftime("%Y_%m_%d")

In [217]:
#Converting NO_VO_RXN_LIST (list of lists) to a dataframe
NO_VO_RXN_DF = pd.DataFrame(NO_VO_RXN_LIST, columns=['ID', 'RXN'])

In [218]:
NO_VO_RXN_DF[NO_VO_RXN_DF['RXN']==1005930]

Unnamed: 0,ID,RXN


In [181]:
#Converting VO_RXN_APPLY_LIST (list of lists) to a dataframe
VO_RXN_APPLY_DF = pd.DataFrame(VO_RXN_APPLY_LIST, columns=['ID', 'RXN'])

In [219]:
NO_VO_RXN_DF['PAIRED'] = 0
NO_VO_RXN_DF['IDENTICAL'] = NO_VO_RXN_DF.apply(check_identical, axis=1)
NO_VO_RXN_DF = NO_VO_RXN_DF[NO_VO_RXN_DF['IDENTICAL']==0]

In [None]:
# # pickle VO_RXN_DF and NO_VO_RXN_DF
# with open(output_dir+'VO_RXN_DF_'+cdt()+'.pkl', 'wb') as f:
#     pickle.dump(VO_RXN_DF, f)

In [None]:
# with open(output_dir+'NO_VO_RXN_DF_'+cdt()+'.pkl', 'wb') as f:
#     pickle.dump(NO_VO_RXN_DF, f)

In [None]:
# with open(output_dir+'VO_RXN_DF_'+cdt()+'.pkl', 'rb') as f:
#   VO_RXN_DF = pickle.load(f)

In [None]:
# with open(output_dir+'NO_VO_RXN_DF_'+cdt()+'.pkl', 'rb') as f:
#   NO_VO_RXN_DF = pickle.load(f)

In [220]:
VO_RXN_DF = VO_RXN_DF[['ID','RXN','PAIRED']].reset_index(drop=True)
NO_VO_RXN_DF = NO_VO_RXN_DF[['ID','RXN','PAIRED']].reset_index(drop=True)

# Autoencoder: Dataset Creation, Training and Evaluating

### Dataset and DataLoader:

**Note:**

* VO_RXN_DF - Existing mappings between VO and RXN used for Training AND Testing
* NO_VO_RXN_DF - Artificial mappings (presumed to be UNMAPPED) used for Testing

In [221]:
test_data = pd.concat([test_data_1, NO_VO_RXN_DF], ignore_index=True)

In [222]:
test_data['PAIRED'].value_counts()

PAIRED
0    24421
1       86
Name: count, dtype: int64

In [223]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, vo_lookup, rx_lookup, device, transform=None):
        self.dataframe = dataframe
        self.vo_lookup = vo_lookup
        self.rx_lookup = rx_lookup
        self.device = device
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sample = self.dataframe.iloc[idx]
        vo_emb = self.vo_lookup.get(sample['ID'])
        rx_emb = self.rx_lookup.get(sample['RXN'])
        emb_items = [vo_emb, rx_emb]
        # print(sample['ID'],sample['RXN'])
        # print(len(vo_emb)+len(rx_emb))
        concept_1 , concept_2 = sample['ID'], sample['RXN']
        emb_len = sum([len(item) for item in emb_items])
        # print(concept_1, concept_2)
        # print(len(vo_emb) , len(rx_emb))
        # features = sample['EMB']
        # features = torch.tensor(np.concatenate([vo_emb, rx_emb])).reshape(1, 768).to(self.device)
        features = torch.tensor(np.concatenate([vo_emb, rx_emb])).reshape(1, emb_len).to(self.device)
        label = sample['PAIRED']
        index = idx
        # features = torch.tensor(features).reshape(1,768).to(device)
        label = torch.tensor(label).to(device)

        if self.transform:
            features = self.transform(features)

        return features, label, index, concept_1, concept_2

In [None]:
test_data

In [189]:
test_data[test_data['RXN']==1005930]

Unnamed: 0,ID,RXN,PAIRED


In [224]:
# Assuming you have a DataFrame 'train_df' and 'test_df' containing your data
train_dataset = CustomDataset(train_data, VO_LOOKUP, RX_LOOKUP, device)
test_dataset = CustomDataset(test_data, VO_LOOKUP, RX_LOOKUP, device)
# test_dataset = CustomDataset(test_data_1, VO_LOOKUP, RX_LOOKUP, device)

# Create DataLoader instances for batching and shuffling
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### Autoencoder:

#### Training:

In [191]:
class Autoencoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size, input_size),
            nn.ReLU()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [192]:
def train_autoencoder(model, train_loader, num_epochs, learning_rate):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_list = []

    for epoch in range(num_epochs):
        running_loss = 0.0
        for data in train_loader:
            inputs, _, _, _, _ = data

            optimizer.zero_grad()
            encoded, outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            loss_list.append(loss.item())

        # Print the average loss for each epoch
        if epoch % 10 == 9:
          print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}')

    threshold = np.percentile(loss_list, 95)
    print(f'Threshold: {threshold}')
    return threshold, loss_list

In [193]:
ae_model = Autoencoder(input_size=768, hidden_size=256).to(device)

In [194]:
threshold, loss_list = train_autoencoder(ae_model, train_loader, num_epochs=100, learning_rate=0.001)

Epoch [10/100], Loss: 0.0018956582352984697
Epoch [20/100], Loss: 0.0017536559898871928
Epoch [30/100], Loss: 0.0017113903595600277
Epoch [40/100], Loss: 0.0016669996839482337
Epoch [50/100], Loss: 0.001668843993684277
Epoch [60/100], Loss: 0.0016523552767466754
Epoch [70/100], Loss: 0.0016353614919353276
Epoch [80/100], Loss: 0.001636049128137529
Epoch [90/100], Loss: 0.0016317009867634624
Epoch [100/100], Loss: 0.0016433454293292016
Threshold: 0.001997831033077091


In [195]:
np.mean(loss_list)

0.0017199153653928078

#### Testing:

In [None]:
def evaluate_autoencoder_gold(model, test_loader,threshold):
    model.eval()
    correct = 0
    total_samples = 0
    misidentified = []
    mis_loss = []
    pred = []
    lab = []

    with torch.no_grad():
        criterion = nn.MSELoss(reduction='none')
        for data in test_loader:
            inputs, label, idx, c1, c2 = data
            # print(inputs.shape)
            # print(c_pair)
            label = label.cpu()
            encoded, outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss_ind = torch.mean(loss, dim=(1, 2)).cpu()
            predicted = torch.from_numpy(np.where(loss_ind > threshold, 0, 1))
            misidentified.extend(idx[predicted!=label].tolist())
            mis_loss.extend(loss_ind[predicted!=label].tolist())
            pred.extend(predicted[predicted!=label].tolist())
            lab.extend(label[predicted!=label].tolist())
            # print(predicted)
            # print(label)
            correct += (predicted == label).sum().item()
            total_samples += label.size(0)
        accuracy = correct / total_samples
        print(f'Accuracy: {accuracy:.4f}')

    return correct, total_samples, misidentified, mis_loss , pred, lab

In [196]:
from sklearn.metrics import precision_recall_fscore_support

def evaluate_autoencoder_gold_new(model, test_loader, threshold):
    model.eval()
    correct = 0
    total_samples = 0
    misidentified = []
    mis_loss = []
    mis_pred = []
    mis_lab = []
    true_labels = []
    pred_labels = []

    with torch.no_grad():
        criterion = nn.MSELoss(reduction='none')
        for data in test_loader:
            inputs, label, idx, c1, c2 = data
            label = label.cpu()
            encoded, outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss_ind = torch.mean(loss, dim=(1, 2)).cpu()
            predicted = torch.from_numpy(np.where(loss_ind > threshold, 0, 1))
            pred_labels.extend(predicted.tolist())
            
            # Misidentified Section
            misidentified.extend(idx[predicted!=label].tolist())
            mis_loss.extend(loss_ind[predicted!=label].tolist())
            mis_pred.extend(predicted[predicted!=label].tolist())
            mis_lab.extend(label[predicted!=label].tolist())
            
            true_labels.extend(label.tolist())
            correct += (predicted == label).sum().item()
            total_samples += label.size(0)

        accuracy = correct / total_samples
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average=None)

        print(f'Accuracy: {accuracy:.4f}')
        print(f'Precision for class 0: {precision[0]:.4f}')
        print(f'Precision for class 1: {precision[1]:.4f}')
        print(f'Recall for class 0: {recall[0]:.4f}')
        print(f'Recall for class 1: {recall[1]:.4f}')
        print(f'F1 Score for class 0: {f1[0]:.4f}')
        print(f'F1 Score for class 1: {f1[1]:.4f}')

    return correct, total_samples, misidentified, mis_loss, pred_labels, true_labels


In [225]:
from sklearn.metrics import precision_recall_fscore_support

def evaluate_autoencoder_gold_batchv2(model, test_loader, threshold):
    model.eval()
    correct = 0
    total_samples = 0
    true_labels = []
    pred_labels = []
    loss_list = []
    index = []

    with torch.no_grad():
        criterion = nn.MSELoss(reduction='none')
        for data in test_loader:
            inputs, label, idx, c1, c2 = data
            label = label.cpu()
            encoded, outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss_ind = torch.mean(loss, dim=(1, 2)).cpu()
            loss_list.extend(loss_ind.tolist())
            predicted = torch.from_numpy(np.where(loss_ind > threshold, 0, 1))
            pred_labels.extend(predicted.tolist())
            index.extend(idx.tolist())
            # # Misidentified Section
            # misidentified.extend(idx[predicted!=label].tolist())
            # mis_loss.extend(loss_ind[predicted!=label].tolist())
            # mis_pred.extend(predicted[predicted!=label].tolist())
            # mis_lab.extend(label[predicted!=label].tolist())
            
            true_labels.extend(label.tolist())
            correct += (predicted == label).sum().item()
            total_samples += label.size(0)

        accuracy = correct / total_samples
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average=None)

        print(f'Accuracy: {accuracy:.4f}')
        print(f'Precision for class 0: {precision[0]:.4f}')
        print(f'Precision for class 1: {precision[1]:.4f}')
        print(f'Recall for class 0: {recall[0]:.4f}')
        print(f'Recall for class 1: {recall[1]:.4f}')
        print(f'F1 Score for class 0: {f1[0]:.4f}')
        print(f'F1 Score for class 1: {f1[1]:.4f}')

    return correct, total_samples, pred_labels, true_labels, loss_list, index


In [None]:
# VO_LOOKUP['VO_0003623']

In [None]:
# RX_LOOKUP[1005930]

In [None]:
# RX_LOOKUP.keys()

In [None]:
correct, total_samples, misidentified, miss_loss, pred, lab = evaluate_autoencoder_gold_new(ae_model, test_loader,threshold=threshold)

In [226]:
correct, total_samples, pred_labels, true_labels, loss_list, index = evaluate_autoencoder_gold_batchv2(ae_model, test_loader,threshold=threshold)

Accuracy: 0.0233
Precision for class 0: 0.9980
Precision for class 1: 0.0035
Recall for class 0: 0.0199
Recall for class 1: 0.9884
F1 Score for class 0: 0.0391
F1 Score for class 1: 0.0071


In [None]:
# correct, total_samples, misidentified, miss_loss, pred, lab = evaluate_autoencoder_gold(ae_model, test_loader,threshold=threshold)

In [227]:
correct, total_samples

(572, 24507)

In [228]:
res_dataset = test_data.reset_index().iloc[index]
res_dataset['LOSS'] = loss_list
res_dataset

Unnamed: 0,index,ID,RXN,PAIRED,LOSS
0,0,VO_0003380,1658239,1,0.001643
1,1,VO_0003637,1801186,1,0.001538
2,2,VO_0003686,1801163,1,0.001578
3,3,VO_0003977,830556,1,0.001618
4,4,VO_0003383,798305,1,0.002013
...,...,...,...,...,...
24502,24502,VO_0003448,1427052,0,0.001922
24503,24503,VO_0003448,1664639,0,0.002027
24504,24504,VO_0003448,1812935,0,0.001995
24505,24505,VO_0003448,1657331,0,0.001973


In [229]:
res_dataset = res_dataset.sort_values(by=['ID', 'LOSS'], ascending=[True, True])
res_dataset

Unnamed: 0,index,ID,RXN,PAIRED,LOSS
23405,23405,VO_0003217,854966,0,0.001737
23492,23492,VO_0003217,854967,0,0.001746
23508,23508,VO_0003217,854948,0,0.001752
23473,23473,VO_0003217,854940,0,0.001753
23403,23403,VO_0003217,854960,0,0.001754
...,...,...,...,...,...
6329,6329,VO_0015068,1812935,0,0.001912
6226,6226,VO_0015068,833086,0,0.001916
6088,6088,VO_0015068,1986826,0,0.001957
6328,6328,VO_0015068,1664639,0,0.001996


In [230]:
res_dataset['VO_STR'] = res_dataset['ID'].map(VO_LOOKUP_STR)
res_dataset['RX_STR'] = res_dataset['RXN'].map(RX_LOOKUP_STR)

In [231]:
res_dataset

Unnamed: 0,index,ID,RXN,PAIRED,LOSS,VO_STR,RX_STR
23405,23405,VO_0003217,854966,0,0.001737,pneumococcal capsular polysaccharide type 11A ...,Streptococcus pneumoniae type 6B capsular poly...
23492,23492,VO_0003217,854967,0,0.001746,pneumococcal capsular polysaccharide type 11A ...,Streptococcus pneumoniae type 6B capsular poly...
23508,23508,VO_0003217,854948,0,0.001752,pneumococcal capsular polysaccharide type 11A ...,Streptococcus pneumoniae type 19F capsular pol...
23473,23473,VO_0003217,854940,0,0.001753,pneumococcal capsular polysaccharide type 11A ...,Streptococcus pneumoniae type 15B capsular pol...
23403,23403,VO_0003217,854960,0,0.001754,pneumococcal capsular polysaccharide type 11A ...,Streptococcus pneumoniae type 33F capsular pol...
...,...,...,...,...,...,...,...
6329,6329,VO_0015068,1812935,0,0.001912,Neisseria meningitidis serogroup Y capsular po...,Vibrio cholerae CVD 103-HGR strain live antigen
6226,6226,VO_0015068,833086,0,0.001916,Neisseria meningitidis serogroup Y capsular po...,"smallpox vaccine live, New York City Board of ..."
6088,6088,VO_0015068,1986826,0,0.001957,Neisseria meningitidis serogroup Y capsular po...,Shingrix
6328,6328,VO_0015068,1664639,0,0.001996,Neisseria meningitidis serogroup Y capsular po...,Flumist 2015-2016


In [232]:
res_dataset[['ID', 'VO_STR','RXN','RX_STR','PAIRED','LOSS']].to_csv(output_dir+'ALL_PREDS_1_'+cdt()+'.csv')

#### Result Evaluation:

In [108]:
test_data.reset_index().iloc[misidentified]['PAIRED'].value_counts()

PAIRED
0    463787
1         1
Name: count, dtype: int64

In [109]:
test_data.reset_index().iloc[misidentified]

Unnamed: 0,index,ID,RXN,PAIRED
4,4,VO_0003383,798305,1
86,86,VO_0003623,1292433,0
87,87,VO_0003623,2050794,0
88,88,VO_0003623,762597,0
89,89,VO_0003623,1928970,0
...,...,...,...,...
752609,752609,VO_0003493,1658710,0
752610,752610,VO_0003493,1657128,0
752611,752611,VO_0003493,1657223,0
752612,752612,VO_0003493,1946983,0


Check if in the test set there are correctly predicted "Identical" pairs. Also check the training set for this proportion.
Todo: Add negative samples to both training and testing. Stratify on the type of mapping (identical or not)
Neg >>> Pos

In [None]:
# save model
torch.save(ae_model.state_dict(), output_dir+'model_'+cdt()+'.pt')

In [110]:
missed_dataset = test_data.reset_index().iloc[misidentified]

In [111]:
missed_dataset['LOSS'] = miss_loss
missed_dataset

Unnamed: 0,index,ID,RXN,PAIRED,LOSS
4,4,VO_0003383,798305,1,0.002027
86,86,VO_0003623,1292433,0,0.001835
87,87,VO_0003623,2050794,0,0.001950
88,88,VO_0003623,762597,0,0.001797
89,89,VO_0003623,1928970,0,0.001836
...,...,...,...,...,...
752609,752609,VO_0003493,1658710,0,0.001784
752610,752610,VO_0003493,1657128,0,0.001916
752611,752611,VO_0003493,1657223,0,0.001867
752612,752612,VO_0003493,1946983,0,0.001838


In [112]:
missed_dataset['PAIRED'].value_counts()

PAIRED
0    463787
1         1
Name: count, dtype: int64

In [113]:
test_data['PAIRED'].value_counts()

PAIRED
0    752528
1        86
Name: count, dtype: int64

In [114]:
train_data['PAIRED'].value_counts()

PAIRED
1    199
Name: count, dtype: int64

In [115]:
missed_dataset['VO_STR'] = missed_dataset['ID'].map(VO_LOOKUP_STR)
missed_dataset['RX_STR'] = missed_dataset['RXN'].map(RX_LOOKUP_STR)

In [116]:
missed_dataset

Unnamed: 0,index,ID,RXN,PAIRED,LOSS,VO_STR,RX_STR
4,4,VO_0003383,798305,1,0.002027,"diphtheria toxoid vaccine, inactivated 5 UNT/ML","diphtheria toxoid vaccine, inactivated 13.4 UN..."
86,86,VO_0003623,1292433,0,0.001835,Fluzone Quadrivalent 2015-2016 vaccine Prefill...,varicella-zoster virus vaccine live (Oka-Merck...
87,87,VO_0003623,2050794,0,0.001950,Fluzone Quadrivalent 2015-2016 vaccine Prefill...,Afluria Quadrivalent 2018-2019 Injectable Product
88,88,VO_0003623,762597,0,0.001797,Fluzone Quadrivalent 2015-2016 vaccine Prefill...,Salmonella typhi Ty21a live antigen Delayed Re...
89,89,VO_0003623,1928970,0,0.001836,Fluzone Quadrivalent 2015-2016 vaccine Prefill...,Fluvirin 2017-2018
...,...,...,...,...,...,...,...
752609,752609,VO_0003493,1658710,0,0.001784,influenza B virus B/Phuket/3073/2013 antigen 5...,influenza B virus B/Phuket/3073/2013 antigen 0...
752610,752610,VO_0003493,1657128,0,0.001916,influenza B virus B/Phuket/3073/2013 antigen 5...,influenza A virus (H1N1) antigen
752611,752611,VO_0003493,1657223,0,0.001867,influenza B virus B/Phuket/3073/2013 antigen 5...,influenza A virus A/Christchurch/16/2010 (H1N1...
752612,752612,VO_0003493,1946983,0,0.001838,influenza B virus B/Phuket/3073/2013 antigen 5...,influenza B virus B/Phuket/3073/2013 antigen 1...


In [117]:
missed_dataset['PAIRED'].value_counts()

PAIRED
0    463787
1         1
Name: count, dtype: int64

In [None]:
# missed_dataset[['ID', 'VO_STR','RXN','RX_STR','PAIRED','LOSS']].to_csv(output_dir+'MISSED_PREDS_'+cdt()+'.csv')

### Notes:  
1. Remove identical values from the training set ✅
2. Incorporating unmapped pairs for testing ✅
3. Incorporating unmapped pairs for training + testing 
4. Train on similar but not related unmapped pairs (adacel vs infanrix) 
5. Testing needs to incorporate all vaccine RXNORM concepts (not just the ones present in VO) ✅
6. Testing needs to incorporate all vaccine VO concepts ✅
7. Sort predictions as:
    1. Existing
    2. New

# LLM Loading and Inference:

### Llama-2-7b-hf

In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
save_path_7b = 'models/llama2-7b-hf/'

In [None]:
# Load model directly

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

model.save_pretrained(save_path_7b)
tokenizer.save_pretrained(save_path_7b)

In [None]:
#Loading saved model
tokenizer = AutoTokenizer.from_pretrained(save_path_7b)
model_7b = AutoModelForCausalLM.from_pretrained(save_path_7b).to(device)

In [None]:
pipeline_7b = transformers.pipeline(
    task="text-generation",
    model=model_7b,
    torch_dtype=torch.float16,
    tokenizer=tokenizer, 
    device = device,
    # return_full_text = False,   
)

### Prompt Engineering:

In [None]:
def get_response(prompt:str,pipeline,tokenizer):
    sequences = pipeline(
    # prompt+'\n',
    prompt,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=1000,
    )
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")
    return sequences

In [None]:
prompt = """<s>[INST] <<SYS>> 
Given that in the principles of ontology mapping, a concept should be mapped to the most similar concept. 
You are a vaccine domain specialist. Your task is to match the target concept to a concept in the candidate vocabulary. 
If a sufficient match is not present in the candidates you can suggest that a new concept be created to match the target concept.

<</SYS>>

[question]: For the target concept '0.5 ML influenza A virus A/Darwin/6/2021 (H3N2) antigen 0.09 MG/ML / influenza A virus A/Wisconsin/588/2019 (H1N1) antigen 0.09 MG/ML / influenza B virus B/Austria/1359417/2021 antigen 0.09 MG/ML / influenza B virus B/Phuket/3073/2013 antigen 0.09 MG/ML Prefilled Syringe [Flublok Quadrivalent 2022-2023]' what would be the best match ? 
Note that all provided options are closely related to the target concept. You need to find the closest possible match or alternatively suggest that a new concept be created
1. 'Flublok Quadrivalent 2018-2019 vaccine 0.5 ML Prefilled Syringe'
2. 'Flublok Quadrivalent 2017-2018 Injectable Product'
3. 'influenza A virus A/Michigan/45/2015 (H1N1) antigen 0.09 MG/ML / influenza A virus A/Singapore/INFIMH-16-0019/2016 (H3N2) antigen 0.09 MG/ML / influenza B virus B/Maryland/15/2016 antigen 0.09 MG/ML / influenza B virus B/Phuket/3073/2013 antigen 0.09 MG/ML [Flublok Quadrivalent 2018-2019]'
4. 'Fluarix Quadrivalent'
"[\INST]"
[answer]: 
"""

In [None]:
test_prompt = """<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

How do I count from 1 to 10 in spanish? [/INST]
"""

In [None]:
# get_response(test_prompt, pipeline_7b)

In [None]:
# get_response(prompt, pipeline_7b)

In [None]:
sequences = pipeline(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


In [None]:
sequences = pipeline_7b(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


In [None]:
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_7b,
    torch_dtype=torch.float16,
    tokenizer=tokenizer, 
    device = device,
    return_full_text = True,
    
)

In [None]:
prompt = f"<s>[INST] <<SYS>>Answer the question based on the context below. " + \
                "\n[context]: " + context + \
                "\n[question]: " + df.question[i] + \
                "[\INST]"

### Llama-2-7b-chat-hf

In [None]:
!ls

In [None]:
save_path_7b_chat = 'models/llama2-7b-chat-hf/'

In [None]:
!mkdir $save_path_7b_chat

In [None]:
# Load model directly

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

model.save_pretrained(save_path_7b_chat)
tokenizer.save_pretrained(save_path_7b_chat)

In [None]:
#Loading saved model
tokenizer_7b_chat = AutoTokenizer.from_pretrained(save_path_7b_chat)
model_7b_chat = AutoModelForCausalLM.from_pretrained(save_path_7b_chat,device_map="auto")
# .to(device)

In [None]:
pipeline_7b_chat = transformers.pipeline(
    task="text-generation",
    model=model_7b_chat,
    torch_dtype=torch.float16,
    tokenizer=tokenizer_7b_chat, 
    # device = device,
    # device_map = "auto"
    # return_full_text = False,   
)

In [None]:
# get_response(test_prompt, pipeline_7b_chat)

In [None]:
prompt

In [None]:
get_response(prompt, pipeline_7b_chat,tokenizer_7b_chat)

In [234]:
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024_02_05_RXN_Concepts.csv  RXN_Concepts.csv	   VO_DF_FULL_TRIMMED.csv
ALL_UMLS_CONCEPTS.csv	     UMLS_CONCEPTS.csv	   VO_RXN[archived].ipynb
models			     UMLS_Vac_Terms.ipynb  VO_RXN.csv
outputs			     VO_DF_FULL.csv	   VO_RXN.ipynb
