In [30]:
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader

DATA = "/home/I6356345/project/data/triplets.tsv"

1. Custom Tokenization: Column Value = One Token

You need to build a tokenizer where each unique column value (entire concept, property, or related concept) is assigned a unique token ID. Here’s how to do that:

Step 1: Build a Vocabulary from Column Values
We'll create a dictionary where each unique value in the Concept, Property, and Related Concept columns gets assigned a unique token.

In [31]:
# Load TSV data
data = pd.read_csv(DATA, sep="\t")
# data

In [64]:
df = data.sample(n=100)
df = data[:10000]
# df=data
df.head()

Unnamed: 0,Concept,Property,Related Concept,tokenized_concept,tokenized_property,tokenized_related_concept,input_sequence
0,"1,4-alpha-Glucan branching enzyme",originals,"1,4-alpha-Glucan branching enzyme",[1],[1001],[1],"[1, 1001]"
1,Methylphenyltetrahydropyridine,originals,Methylphenyltetrahydropyridine,[2],[1001],[2],"[2, 1001]"
2,1-naphthylamine,originals,1-naphthylamine,[3],[1001],[3],"[3, 1001]"
3,17-hydroxycorticosteroid,originals,17-hydroxycorticosteroid,[4],[1001],[4],"[4, 1001]"
4,17-ketosteroid,originals,17-ketosteroid,[5],[1001],[5],"[5, 1001]"


In [65]:
# Create an empty vocabulary
vocab = {}
token_counter = 1  # Start token IDs from 1 (you can reserve 0 for padding if needed)

# Function to add unique column values to the vocab
def add_to_vocab(value):
    global token_counter
    if value not in vocab:
        vocab[value] = token_counter
        token_counter += 1

# Add all unique values from Concept, Property, and Related Concept to the vocabulary
df['Concept'].apply(add_to_vocab)
df['Property'].apply(add_to_vocab)
df['Related Concept'].apply(add_to_vocab)

# Let's check the created vocabulary
# print(vocab)

0       None
1       None
2       None
3       None
4       None
        ... 
9995    None
9996    None
9997    None
9998    None
9999    None
Name: Related Concept, Length: 10000, dtype: object

Step 2: Tokenize Data Based on the Vocabulary

Once you have the vocabulary, you can now map each column value to its corresponding token ID:

In [66]:
# Function to tokenize a column value based on the vocab
def tokenize(value):
    return [vocab[value]]  # Return token ID as a list to keep compatibility with batch processing

# Tokenize the Concept, Property, and Related Concept columns
df['tokenized_concept'] = df['Concept'].apply(tokenize)
df['tokenized_property'] = df['Property'].apply(tokenize)
df['tokenized_related_concept'] = df['Related Concept'].apply(tokenize)

# Combine tokenized concept and property into a single input sequence
df['input_sequence'] = df.apply(lambda row: row['tokenized_concept'] + row['tokenized_property'], axis=1)

# Show the tokenized data
print(df[['input_sequence', 'tokenized_related_concept']].head())


  input_sequence tokenized_related_concept
0      [1, 9999]                       [1]
1      [2, 9999]                       [2]
2      [3, 9999]                       [3]
3      [4, 9999]                       [4]
4      [5, 9999]                       [5]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pand

Step 3: Prepare Dataset for Training
Now you can create a Dataset class as before, but each "input sequence" will be a token representing the entire Concept and Property, and the target sequence will be the token representing the Related Concept.

In [67]:
class TripletDataset(Dataset):
    def __init__(self, df):
        self.inputs = df['input_sequence'].tolist()
        self.targets = df['tokenized_related_concept'].tolist()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_sequence = torch.tensor(self.inputs[idx], dtype=torch.long)
        target_sequence = torch.tensor(self.targets[idx], dtype=torch.long)
        return input_sequence, target_sequence

# Create dataset and data loader
dataset = TripletDataset(df)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)


4. Building and Training the Transformer Model
Now that the data is properly tokenized, the transformer model and training loop will remain mostly the same. Here's a reminder of how you set up the model:

In [68]:
import torch
import torch.nn as nn
import torch.optim as optim

class GPTLikeModel(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, num_layers, max_seq_len):
        super(GPTLikeModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_len, d_model))

        self.transformer_layers = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads) for _ in range(num_layers)
        ])
        
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # Embedding and positional encoding
        seq_len = x.size(1)
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]

        # Pass through each transformer decoder layer
        for layer in self.transformer_layers:
            x = layer(x, x)  # Decoder takes input twice in GPT-like models

        # Output layer
        logits = self.fc_out(x)
        return logits

# Define model hyperparameters
vocab_size = len(vocab) + 1  # Include 1 for padding (if needed)
d_model = 128  # Embedding size
n_heads = 4  # Number of attention heads
num_layers = 1  # Number of transformer layers
max_seq_len = 2  # Maximum sequence length (concept + property)

# Initialize the model
model = GPTLikeModel(vocab_size, d_model, n_heads, num_layers, max_seq_len)

# Move the model to GPU if available
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


5. Training the Model
Once the model is defined, you can begin training it. Below is the training loop that processes the input sequences and target sequences through the model and updates the model's parameters using backpropagation.

In [71]:
# Training and testing loop (memorization task)
epochs = 20
model.train()

for epoch in range(epochs):
    total_loss = 0
    model.train()  # Set model to training mode

    # Training on the same data
    for batch in tqdm(train_loader):
        inputs, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(inputs)

        # We only care about the first token in the output sequence
        outputs = outputs[:, 0, :]  # Shape becomes: (batch_size, vocab_size)

        targets = targets.view(-1)  # Flatten the targets

        # Compute loss
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Training Loss: {total_loss / len(train_loader)}")

    # Testing on the same data (memorization check)
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(train_loader):  # Testing on the same dataset
            inputs, targets = batch
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs)
            outputs = outputs[:, 0, :]  # Only take the first token prediction
            predicted = torch.argmax(outputs, dim=1)

            total += targets.size(0)
            correct += (predicted == targets.view(-1)).sum().item()
#             print(total, correct)

    accuracy = 100 * correct / total
    print(f"Epoch {epoch + 1}, Memorization Accuracy: {accuracy:.5f}%")


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 147.08it/s]


Epoch 1, Training Loss: 4.206340262413025


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 643.56it/s]


Epoch 1, Memorization Accuracy: 46.93000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 144.67it/s]


Epoch 2, Training Loss: 3.855147563934326


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 644.83it/s]


Epoch 2, Memorization Accuracy: 54.50000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 146.39it/s]


Epoch 3, Training Loss: 3.4911270669937133


100%|██████████████████████████████████████| 1250/1250 [00:02<00:00, 560.88it/s]


Epoch 3, Memorization Accuracy: 61.08000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 145.31it/s]


Epoch 4, Training Loss: 3.086393120288849


100%|██████████████████████████████████████| 1250/1250 [00:02<00:00, 589.85it/s]


Epoch 4, Memorization Accuracy: 71.58000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 143.45it/s]


Epoch 5, Training Loss: 2.7193598182678222


100%|██████████████████████████████████████| 1250/1250 [00:02<00:00, 533.97it/s]


Epoch 5, Memorization Accuracy: 77.75000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 147.61it/s]


Epoch 6, Training Loss: 2.3468707821846007


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 627.61it/s]


Epoch 6, Memorization Accuracy: 83.07000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 148.04it/s]


Epoch 7, Training Loss: 1.9590189062356949


100%|██████████████████████████████████████| 1250/1250 [00:02<00:00, 587.74it/s]


Epoch 7, Memorization Accuracy: 87.55000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 140.75it/s]


Epoch 8, Training Loss: 1.6330068887233735


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 646.06it/s]


Epoch 8, Memorization Accuracy: 91.32000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 145.44it/s]


Epoch 9, Training Loss: 1.3227414251685143


100%|██████████████████████████████████████| 1250/1250 [00:02<00:00, 607.26it/s]


Epoch 9, Memorization Accuracy: 93.16000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 145.70it/s]


Epoch 10, Training Loss: 1.1345771447300912


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 695.25it/s]


Epoch 10, Memorization Accuracy: 95.01000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 149.67it/s]


Epoch 11, Training Loss: 0.9183353682994843


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 703.20it/s]


Epoch 11, Memorization Accuracy: 95.50000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 145.25it/s]


Epoch 12, Training Loss: 0.7772527645885945


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 715.34it/s]


Epoch 12, Memorization Accuracy: 96.48000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 142.30it/s]


Epoch 13, Training Loss: 0.6665569043725729


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 731.25it/s]


Epoch 13, Memorization Accuracy: 97.03000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 145.58it/s]


Epoch 14, Training Loss: 0.5648019399374723


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 712.43it/s]


Epoch 14, Memorization Accuracy: 97.58000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 149.88it/s]


Epoch 15, Training Loss: 0.5137719763278961


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 705.40it/s]


Epoch 15, Memorization Accuracy: 97.79000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 148.49it/s]


Epoch 16, Training Loss: 0.4435343907639384


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 691.12it/s]


Epoch 16, Memorization Accuracy: 98.19000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 147.61it/s]


Epoch 17, Training Loss: 0.4109196829227731


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 682.75it/s]


Epoch 17, Memorization Accuracy: 98.50000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 150.16it/s]


Epoch 18, Training Loss: 0.4019919766925275


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 661.38it/s]


Epoch 18, Memorization Accuracy: 98.46000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 144.81it/s]


Epoch 19, Training Loss: 0.3497099509354681


100%|██████████████████████████████████████| 1250/1250 [00:02<00:00, 591.33it/s]


Epoch 19, Memorization Accuracy: 98.63000%


100%|██████████████████████████████████████| 1250/1250 [00:08<00:00, 144.42it/s]


Epoch 20, Training Loss: 0.33248605472203346


100%|██████████████████████████████████████| 1250/1250 [00:01<00:00, 664.41it/s]

Epoch 20, Memorization Accuracy: 98.65000%





In [24]:
predicted

tensor([7208, 7208, 7208, 7208, 7208, 7208, 7208, 7208], device='cuda:0')