In [10]:
%load_ext nb_black

import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from datetime import datetime
from pprint import pprint
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Absolute path of the raw triplets TSV that the data-loading cell reads.
DATA = "/home/I6356345/project/data/triplets.tsv"
# Destination TSV for the tokenized DataFrame written later in the notebook.
DATA_TOKENIZED_TO = "/home/I6356345/project/data/tokenized_100k.tsv"
# Shared seed: used for RNG seeding, row sampling and model construction.
SEED = 566

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [2]:
# Seed Python, NumPy and PyTorch (plus CUDA when present) with the shared SEED,
# then switch cuDNN to its deterministic, non-benchmarking configuration.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
if torch.cuda.is_available():
    for _seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        _seed_cuda(SEED)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

<IPython.core.display.Javascript object>

## Read and tokenize dataset


In [3]:
def tokenize_columns(df, columns):
    """Give every distinct value in ``columns`` an integer ID and tokenize the rows.

    A single vocabulary is shared by all listed columns. IDs start at 1 (0 is
    left unused so it can be reserved, e.g. for padding) and are assigned in
    order of first appearance, scanning the columns in the order given.

    ``df`` is modified in place and also returned. For each column ``c`` a
    column named ``tokenized_<c lower-cased, spaces replaced by "_">`` is added,
    holding a one-element list ``[token_id]`` per row. An ``input_sequence``
    column holding ``tokenized_concept + tokenized_property`` is added as well.

    Args:
        df: DataFrame containing every column named in ``columns``.
        columns: Column names to tokenize. They must produce the
            ``tokenized_concept`` and ``tokenized_property`` columns, i.e.
            include "Concept" and "Property".

    Returns:
        ``(df, vocab)`` where ``df`` is the same (mutated) object that was
        passed in and ``vocab`` maps each distinct value to its token ID.

    Raises:
        KeyError: If a listed column is missing from ``df`` or the
            ``tokenized_concept`` / ``tokenized_property`` columns do not exist.
    """
    # Build the shared vocabulary. Because IDs are dense and start at 1, the
    # next free ID is always len(vocab) + 1.
    vocab = {}
    for column in columns:
        for value in df[column].tolist():
            if value not in vocab:
                vocab[value] = len(vocab) + 1

    # Each value becomes a one-element list so sequences can be concatenated.
    # An explicit object Series keeps the lists intact and also works when the
    # frame has no rows.
    for column in columns:
        token_lists = [[vocab[value]] for value in df[column].tolist()]
        df[f'tokenized_{column.lower().replace(" ", "_")}'] = pd.Series(
            token_lists, index=df.index, dtype=object
        )

    # Concatenate concept and property tokens per row. Plain iteration avoids
    # the slow row-wise DataFrame.apply, which additionally returns a DataFrame
    # (not a Series) for an empty frame and then cannot be assigned to a column.
    input_sequences = [
        concept_tokens + property_tokens
        for concept_tokens, property_tokens in zip(
            df['tokenized_concept'], df['tokenized_property']
        )
    ]
    df['input_sequence'] = pd.Series(input_sequences, index=df.index, dtype=object)

    return df, vocab


class TripletDataset(Dataset):
    """Map-style dataset over a tokenized triplet DataFrame.

    The ``input_sequence`` and ``tokenized_related_concept`` columns of ``df``
    are copied into Python lists once at construction. Each item is a pair of
    ``torch.long`` tensors ``(input, target)`` built from the row at that
    list position.
    """

    def __init__(self, df):
        # Materialise both columns up front so item access never touches pandas.
        inputs_column = df['input_sequence']
        targets_column = df['tokenized_related_concept']
        self.inputs = inputs_column.tolist()
        self.targets = targets_column.tolist()

    def __len__(self):
        """Number of samples, i.e. the number of stored input sequences."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return ``(input_tensor, target_tensor)`` for position ``idx``."""
        return (
            torch.tensor(self.inputs[idx], dtype=torch.long),
            torch.tensor(self.targets[idx], dtype=torch.long),
        )


class GPTLikeModel(nn.Module):
    """Small transformer mapping token-ID sequences to per-position vocabulary logits.

    Architecture: token embedding plus a learned positional parameter
    (initialised to zeros), followed by ``num_layers``
    ``nn.TransformerDecoderLayer`` blocks and a linear projection back to
    ``vocab_size``. Each decoder layer receives the running activations as
    both ``tgt`` and ``memory`` and no attention mask is passed, so every
    position can attend to every other position of the same sample.

    Args:
        vocab_size: Number of embedding rows / output logits.
        d_model: Embedding and hidden width.
        n_heads: Attention heads per layer.
        num_layers: Number of decoder layers.
        max_seq_len: Longest sequence the positional parameter covers.
        seed: Seed applied to the global torch RNG(s) before the weights are
            created, making initialisation repeatable. This is a global side
            effect: it resets torch's random state for the whole process.
        batch_first: Layout handed to the decoder layers. ``forward`` treats
            inputs as ``(batch, seq)``, so the default ``True`` makes attention
            run over the sequence axis. Earlier revisions used PyTorch's
            default (``False``), which made attention run across the batch
            axis and mix unrelated samples. Pass ``False`` only to reproduce
            checkpoints trained with that layout; such checkpoints still load
            under the default (parameter shapes are identical) but will behave
            differently.
    """

    def __init__(
        self,
        vocab_size,
        d_model,
        n_heads,
        num_layers,
        max_seq_len,
        seed=42,
        batch_first=True,
    ):
        super().__init__()
        # Seed before any parameter is created so initial weights are reproducible.
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)

        # Submodules are created in the same order as before, so a given seed
        # yields the same initial weights.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_len, d_model))

        self.transformer_layers = nn.ModuleList(
            [
                nn.TransformerDecoderLayer(
                    d_model=d_model, nhead=n_heads, batch_first=batch_first
                )
                for _ in range(num_layers)
            ]
        )

        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Compute logits for a batch of token-ID sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the length of the positional
                parameter (``max_seq_len`` at construction).
        """
        seq_len = x.size(1)
        # Read the limit from the parameter itself (not a new attribute) so
        # instances unpickled from older runs keep working.
        max_len = self.positional_encoding.size(1)
        if seq_len > max_len:
            # Without this check the slice below is silently shortened and the
            # addition either fails with an opaque shape error or, when
            # max_len == 1, broadcasts one position over the whole sequence.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_len}"
            )

        # Token embedding plus the learned per-position offset.
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]

        # The activations serve as both target and memory of each decoder
        # layer; no causal or padding mask is applied.
        for layer in self.transformer_layers:
            x = layer(x, x)

        # Project hidden states to vocabulary logits.
        return self.fc_out(x)


<IPython.core.display.Javascript object>

In [4]:
# Read the full tab-separated triplet table, then keep a reproducible
# 100k-row random subsample (seeded with SEED) for the rest of the notebook.
data = pd.read_csv(DATA, delimiter="\t")
df = data.sample(random_state=SEED, n=100_000)
df.head()

Unnamed: 0,Concept,Property,Related Concept
1473772,ECG: U wave exaggerated,case_significance_id,900000000000017005
3289696,Talazoparib (as talazoparib tosylate) 1 mg ora...,definition_status_id,900000000000073002
471179,Pentostatin-containing product,definition_status_id,900000000000073002
670735,Fistulectomy of rectum,type_id,900000000000013009
693462,Excision of neurofibroma of cutaneous nerve,has_direct_procedure_site,Peripheral nerve structure


<IPython.core.display.Javascript object>

In [5]:
# Tokenize the three triplet columns against one shared vocabulary.
# tokenize_columns adds its columns to `df` in place and returns that same
# frame, so `df_tokenized` and `df` refer to one object.
columns_to_tokenize = ["Concept", "Property", "Related Concept"]
df_tokenized, vocab = tokenize_columns(df, columns_to_tokenize)

display(df_tokenized.head())
print("First 10 rows of Vocabulary:")
pprint({key: vocab[key] for key in list(vocab)[:10]})

Unnamed: 0,Concept,Property,Related Concept,tokenized_concept,tokenized_property,tokenized_related_concept,input_sequence
1473772,ECG: U wave exaggerated,case_significance_id,900000000000017005,[1],[88028],[88218],"[1, 88028]"
3289696,Talazoparib (as talazoparib tosylate) 1 mg ora...,definition_status_id,900000000000073002,[2],[88029],[88219],"[2, 88029]"
471179,Pentostatin-containing product,definition_status_id,900000000000073002,[3],[88029],[88219],"[3, 88029]"
670735,Fistulectomy of rectum,type_id,900000000000013009,[4],[88030],[88220],"[4, 88030]"
693462,Excision of neurofibroma of cutaneous nerve,has_direct_procedure_site,Peripheral nerve structure,[5],[88031],[88221],"[5, 88031]"


First 10 rows of Vocabulary:
{'Acute Q wave infarction - anterolateral': 6,
 'Blood group antibody P^k^': 8,
 'Cluster of differentiation antigen 45 R': 10,
 'ECG: U wave exaggerated': 1,
 'Excision of neurofibroma of cutaneous nerve': 5,
 'Fistulectomy of rectum': 4,
 'Ondansetron-containing product in parenteral dose form': 9,
 'Pentostatin-containing product': 3,
 'Stromeyer-Little operation hepatotomy': 7,
 'Talazoparib (as talazoparib tosylate) 1 mg oral capsule': 2}


<IPython.core.display.Javascript object>

In [11]:
# Persist the tokenized sample as a tab-separated file, without the index column.
df.to_csv(DATA_TOKENIZED_TO, index=False, sep="\t")

<IPython.core.display.Javascript object>

# MODEL

## MODEL.SAVE

In [15]:
# Shell escape: list the contents of the model output directory.
! ls ../../../data/out_models

models_20241023_101847_init_trans.pkl
models_20241023_101900_init_trans.pkl
models_20241023_102044_init_trans.pkl
models_20241025_153126_init_trans.pkl
models_20241027_223358_init_trans.pkl
models_20241030_002102_50_100k_trans.pkl
models_20241030_020035_50_100k_trans.pkl
models_20241030_035655_50_100k_trans.pkl
models_20241030_061020_50_100k_trans.pkl
models_20241030_084050_50_100k_trans.pkl
models_20241030_112648_50_100k_trans.pkl
models_20241105_062400_50_100k_trans.pkl
models_20241105_183211_init_trans.pkl
models_20241105_222648_50_100k_trans.pkl
models_20241106_160815_50_100k_trans.pkl
models_20241107_120404_50_100k_trans.pkl


<IPython.core.display.Javascript object>

In [18]:
# Load the pickled collection of models and print the entry stored under key
# 100000. NOTE(review): pickle.load can execute arbitrary code, so only open
# files from a trusted source. Unpickling presumably also needs GPTLikeModel
# to be defined in this session already - confirm.
models_pickle_path = "../../../data/out_models/models_20241030_112648_50_100k_trans.pkl"
with open(models_pickle_path, "rb") as file:
    models = pickle.load(file)
print(models[100000])

GPTLikeModel(
  (embedding): Embedding(112457, 128)
  (transformer_layers): ModuleList(
    (0): TransformerDecoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (multihead_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (linear1): Linear(in_features=128, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=128, bias=True)
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (dropout3): Dropout(p=0.1, inplace=False)
    )
  )
  (fc_out): Linear(in_features=12

<IPython.core.display.Javascript object>

In [19]:
# Export only the weights (state_dict) of the model stored under key 100000,
# so it can be restored later without unpickling the whole object.
weights_100k = models[100000].state_dict()
torch.save(weights_100k, "../../../data/out_models/model_20241030_112648_100k.pt")

<IPython.core.display.Javascript object>

## MODEL.LOAD

In [23]:
# Rebuild the architecture and restore the exported weights.
# The hyperparameters below must match the checkpoint; load_state_dict raises
# on any shape mismatch (e.g. if `vocab` was built from a different sample).
vocab_size = len(vocab) + 1  # token IDs start at 1, so +1 also covers the unused ID 0
d_model = 128  # Embedding size
n_heads = 4  # Number of attention heads
num_layers = 1  # Number of transformer layers
max_seq_len = 2  # Maximum sequence length (concept + property)
batch_size = 64
lr = 0.001
model = GPTLikeModel(vocab_size, d_model, n_heads, num_layers, max_seq_len, seed=SEED)
model.load_state_dict(
    torch.load(
        "../../../data/out_models/model_20241030_112648_100k.pt",
        # The model above is created on the CPU. Mapping the stored tensors to
        # the CPU lets a checkpoint saved from CUDA tensors load on a machine
        # without a GPU instead of failing during deserialization.
        map_location="cpu",
    )
)
model

GPTLikeModel(
  (embedding): Embedding(112457, 128)
  (transformer_layers): ModuleList(
    (0): TransformerDecoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (multihead_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (linear1): Linear(in_features=128, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=128, bias=True)
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (dropout3): Dropout(p=0.1, inplace=False)
    )
  )
  (fc_out): Linear(in_features=12

<IPython.core.display.Javascript object>