# Imports, constants

In [11]:
%load_ext nb_black

# imports
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from datetime import datetime
from pprint import pprint
from tqdm import tqdm
from copy import deepcopy
import itertools
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


# constants
DATA = "../../../data/triplets.tsv"
SEED = 566
# Results and models file paths
RESULTS_FILE = (
    "../../../data/out_metrics/results_{timestamp}_lay_act_{config_index}.pkl"
)
MODELS_FILE = "../../../data/out_models/models_{timestamp}_lay_act_{config_index}.pkl"


# fix random seeds for reproducibility
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

# All Functions

In [12]:
# ===============================
# For data
# ===============================


def tokenize_columns(df_in, columns):
    df = deepcopy(df_in)
    # Create an empty vocabulary
    vocab = {}
    token_counter = (
        1  # Start token IDs from 1 (you can reserve 0 for padding if needed)
    )

    # Function to add unique column values to the vocab
    def add_to_vocab(value):
        nonlocal token_counter
        if value not in vocab:
            vocab[value] = token_counter
            token_counter += 1

    # Add all unique values from the specified columns to the vocabulary
    for column in columns:
        df[column].apply(add_to_vocab)

    # Function to tokenize a column value based on the vocab
    def tokenize(value):
        return [
            vocab[value]
        ]  # Return token ID as a list to keep compatibility with batch processing

    # Tokenize the specified columns
    for column in columns:
        df[f'tokenized_{column.lower().replace(" ", "_")}'] = df[column].apply(tokenize)

    # Combine tokenized concept and property into a single input sequence
    df["input_sequence"] = df.apply(
        lambda row: row["tokenized_concept"] + row["tokenized_property"], axis=1
    )

    return df, vocab


class TripletDataset(Dataset):
    def __init__(self, df):
        self.inputs = df["input_sequence"].tolist()
        self.targets = df["tokenized_related_concept"].tolist()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_sequence = torch.tensor(self.inputs[idx], dtype=torch.long)
        target_sequence = torch.tensor(self.targets[idx], dtype=torch.long)
        return input_sequence, target_sequence


# ===============================
# Model
# ===============================


# Define the custom activation functions
def get_activation_function(name):
    if name == "default":
        return nn.ReLU()
    elif name == "GELU":
        return nn.GELU()
    elif name == "RAF":
        return nn.RReLU()
    elif name == "softmax":
        return nn.Softmax(dim=-1)
    else:
        raise ValueError(f"Unknown activation function: {name}")


# Modified model class to include dynamic activation function and adaptable hidden size
class GPTLikeModel(nn.Module):
    def __init__(
        self,
        vocab_size,
        d_model,
        n_heads,
        num_layers,
        max_seq_len,
        activation_fn,
        seed=42,
    ):
        super(GPTLikeModel, self).__init__()
        # Fix random seed for reproducibility
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_len, d_model))

        self.transformer_layers = nn.ModuleList(
            [
                nn.TransformerDecoderLayer(
                    d_model=d_model, nhead=n_heads, activation=activation_fn
                )
                for _ in range(num_layers)
            ]
        )

        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # Embedding and positional encoding
        seq_len = x.size(1)
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]

        # Pass through each transformer decoder layer
        for layer in self.transformer_layers:
            x = layer(x, x)  # Decoder takes input twice in GPT-like models

        # Output layer
        logits = self.fc_out(x)
        return logits


# other logic


# Define a constant for the filename format
# Example function to save results
def save_results(results, filename):
    with open(filename, "wb") as f:
        pickle.dump(results, f)
    print(f"Results saved to {filename}")

<IPython.core.display.Javascript object>

## Read and tokenize dataset


In [13]:
# Load TSV data
data = pd.read_csv(DATA, sep="\t")
df = data.sample(n=200000, random_state=SEED)
columns_to_tokenize = ["Concept", "Property", "Related Concept"]
df_tokenized, vocab = tokenize_columns(df, columns_to_tokenize)

<IPython.core.display.Javascript object>

## Train and save

In [15]:
# Experiment parameters
n_values = [50000, 100000]
activation_functions = ["default", "GELU", "RAF", "softmax"]
n_layers_values = [1, 2, 4]

# Model hyperparameters
vocab_size = len(vocab) + 1  # Include 1 for padding (if needed)
d_model = 128  # Embedding size
n_heads = 4  # Number of attention heads
max_seq_len = 2  # Maximum sequence length (concept + property)
batch_size = 128
lr = 0.001
epochs = 5

# Calculate the number of parameters to keep the total number constant
base_num_layers = 1
base_d_model = d_model
base_num_params = base_d_model * base_num_layers

# Prepare the list of all possible configurations using Cartesian product
configurations = list(
    itertools.product(n_values, activation_functions, n_layers_values)
)

<IPython.core.display.Javascript object>

In [4]:
# Save configurations to a file
config_filename = "../../../data/configs/experiment_configs.pkl"
os.makedirs(os.path.dirname(config_filename), exist_ok=True)
with open(config_filename, "wb") as f:
    pickle.dump(configurations, f)
    print(f"Configurations saved to {config_filename}")

print(len(configurations))

# Load configurations and determine starting point
start_index, end_index = int(0), int(2)

results = defaultdict(list)
final_models = {}
num_iterations = 1

# Iterate through configurations starting from the specified index
for config_index in range(start_index, end_index):
    n, activation_fn_name, n_layers = configurations[config_index]
    adjusted_d_model = int(base_num_params / n_layers)
    activation_fn = get_activation_function(activation_fn_name)

    for iteration in range(num_iterations):
        results_for_n = []  # Create a separate list for each n value
        print(
            f"Training with n={n}, activation={activation_fn_name}, layers={n_layers}, iteration={iteration + 1}"
        )
        iteration_seed = SEED + iteration

        # Prepare dataset and data loader
        dataset = TripletDataset(df_tokenized.sample(n=n, random_state=iteration_seed))
        train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Initialize the model with adjusted d_model and activation function
        model = GPTLikeModel(
            vocab_size,
            adjusted_d_model,
            n_heads,
            n_layers,
            max_seq_len,
            activation_fn,
            seed=iteration_seed,
        )

        # Move the model to GPU if available
        device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        # Define the optimizer and loss function
        optimizer = optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()

        # RUN LAUNCHING
        model.train()

        for epoch in range(epochs):
            total_loss = 0
            model.train()  # Set model to training mode

            # Training on the same data
            for batch in tqdm(train_loader):
                inputs, targets = batch
                inputs = inputs.to(device)
                targets = targets.to(device)

                # Forward pass
                optimizer.zero_grad()  # can be placed anywhere before loss.backward
                outputs = model(inputs)

                # We only care about the first token in the output sequence
                outputs = outputs[:, 0, :]  # Shape becomes: (batch_size, vocab_size)

                targets = targets.view(-1)  # Flatten the targets

                # Compute loss
                loss = criterion(outputs, targets)

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            print(f"Epoch {epoch + 1}, Training Loss: {total_loss / len(train_loader)}")
            if epoch % 2 == 0:
                continue  # skip 0,2,4... (e.g. if n_iters=100, so we plot 1,3,..99 (99th is the last))
            # Testing on the same data (memorization check)
            model.eval()  # Set model to evaluation mode
            correct = 0
            total = 0
            with torch.no_grad():
                for batch in tqdm(train_loader):  # Testing on the same dataset
                    inputs, targets = batch
                    inputs = inputs.to(device)
                    targets = targets.to(device)

                    outputs = model(inputs)
                    outputs = outputs[:, 0, :]  # Only take the first token prediction
                    predicted = torch.argmax(outputs, dim=1)

                    total += targets.size(0)
                    correct += (predicted == targets.view(-1)).sum().item()
            #             print(total, correct)

            accuracy = 100 * correct / total
            print(f"Epoch {epoch + 1}, Memorization Accuracy: {accuracy:.5f}%")
            results_for_n.append(accuracy)
        # Save the final model for this iteration
        final_models[(n, activation_fn_name, n_layers, iteration)] = model
        # Save all accuracies for the current n value
        results[(n, activation_fn_name, n_layers)].append(results_for_n)

    # Add timestamp to filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    indexes = f"{start_index}_{end_index}"
    # Pickle the results and final models dictionaries
    save_results(
        results,
        RESULTS_FILE.format(**{"timestamp": timestamp, "config_index": indexes}),
    )
    save_results(
        final_models,
        MODELS_FILE.format(**{"timestamp": timestamp, "config_index": indexes}),
    )

Configurations saved to ../../../data/configs/experiment_configs.pkl
24
Training with n=50000, activation=default, layers=1, iteration=1


100%|█████████████████████████████████████████| 391/391 [00:08<00:00, 46.02it/s]


Epoch 1, Training Loss: 8.201990501959916


100%|█████████████████████████████████████████| 391/391 [00:08<00:00, 48.50it/s]


Epoch 2, Training Loss: 6.982691484339097


100%|████████████████████████████████████████| 391/391 [00:01<00:00, 274.74it/s]


Epoch 2, Memorization Accuracy: 10.00000%


100%|█████████████████████████████████████████| 391/391 [00:08<00:00, 48.52it/s]


Epoch 3, Training Loss: 6.752909404237557


100%|█████████████████████████████████████████| 391/391 [00:08<00:00, 48.42it/s]


Epoch 4, Training Loss: 6.193129549246005


100%|████████████████████████████████████████| 391/391 [00:01<00:00, 272.98it/s]


Epoch 4, Memorization Accuracy: 13.14400%


100%|█████████████████████████████████████████| 391/391 [00:08<00:00, 48.48it/s]


Epoch 5, Training Loss: 4.832230626469683
Results saved to ../../../data/out_metrics/results_20241118_221136_lay_act_0_2.pkl
Results saved to ../../../data/out_models/models_20241118_221136_lay_act_0_2.pkl
Training with n=50000, activation=default, layers=2, iteration=1


100%|█████████████████████████████████████████| 391/391 [00:06<00:00, 58.06it/s]


Epoch 1, Training Loss: 8.292995662640427


100%|█████████████████████████████████████████| 391/391 [00:06<00:00, 58.05it/s]


Epoch 2, Training Loss: 6.964269340495624


100%|████████████████████████████████████████| 391/391 [00:01<00:00, 256.54it/s]


Epoch 2, Memorization Accuracy: 9.93800%


100%|█████████████████████████████████████████| 391/391 [00:06<00:00, 58.09it/s]


Epoch 3, Training Loss: 6.798611595807478


100%|█████████████████████████████████████████| 391/391 [00:06<00:00, 58.12it/s]


Epoch 4, Training Loss: 6.653622833359272


100%|████████████████████████████████████████| 391/391 [00:01<00:00, 256.86it/s]


Epoch 4, Memorization Accuracy: 12.16200%


100%|█████████████████████████████████████████| 391/391 [00:06<00:00, 58.11it/s]


Epoch 5, Training Loss: 6.263756831283764
Results saved to ../../../data/out_metrics/results_20241118_221213_lay_act_0_2.pkl
Results saved to ../../../data/out_models/models_20241118_221213_lay_act_0_2.pkl


<IPython.core.display.Javascript object>

In [25]:
import pickle

with open(
    "../../../data/out_models/models_20241119_083532_lay_act_0_6.pkl", "rb"
) as file:
    results = dict(pickle.load(file))



<IPython.core.display.Javascript object>

In [26]:
results

{(50000,
  'default',
  1,
  0): GPTLikeModel(
   (embedding): Embedding(200353, 128)
   (transformer_layers): ModuleList(
     (0): TransformerDecoderLayer(
       (self_attn): MultiheadAttention(
         (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
       )
       (multihead_attn): MultiheadAttention(
         (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
       )
       (linear1): Linear(in_features=128, out_features=2048, bias=True)
       (dropout): Dropout(p=0.1, inplace=False)
       (linear2): Linear(in_features=2048, out_features=128, bias=True)
       (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
       (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
       (norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
       (dropout1): Dropout(p=0.1, inplace=False)
       (dropout2): Dropout(p=0.1, inplace=False)
       (dropout3): Dropout(p=0.1, inplace

<IPython.core.display.Javascript object>

In [16]:
# Save configurations to a file
config_filename = "../../../data/configs/experiment_configs.pkl"
os.makedirs(os.path.dirname(config_filename), exist_ok=True)
with open(config_filename, "wb") as f:
    pickle.dump(configurations, f)
    print(f"Configurations saved to {config_filename}")

print(len(configurations))

# Load configurations and determine starting point
start_index, end_index = int(0), int(2)

results = defaultdict(list)
final_models = {}
num_iterations = 1

# Iterate through configurations starting from the specified index
for config_index in range(start_index, end_index):
    n, activation_fn_name, n_layers = configurations[config_index]
    adjusted_d_model = int(base_num_params / n_layers)
    activation_fn = get_activation_function(activation_fn_name)

    for iteration in range(num_iterations):
        break

Configurations saved to ../../../data/configs/experiment_configs.pkl
24


<IPython.core.display.Javascript object>

In [34]:
model = results[(50000, "default", 1, 1)]
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model

GPTLikeModel(
  (embedding): Embedding(200353, 128)
  (transformer_layers): ModuleList(
    (0): TransformerDecoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (multihead_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (linear1): Linear(in_features=128, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=128, bias=True)
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (dropout3): Dropout(p=0.1, inplace=False)
      (activation): ReLU()
    )
  )
  (fc_

<IPython.core.display.Javascript object>

In [36]:
# Testing on the same data (memorization check)
model.eval()  # Set model to evaluation mode
correct = 0
total = 0
with torch.no_grad():
    for batch in tqdm(train_loader):  # Testing on the same dataset
        inputs, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)

        outputs = model(inputs)
        outputs = outputs[:, 0, :]  # Only take the first token prediction
        predicted = torch.argmax(outputs, dim=1)

        total += targets.size(0)
        correct += (predicted == targets.view(-1)).sum().item()
#             print(total, correct)

accuracy = 100 * correct / total
print(f"Epoch {epoch + 1}, Memorization Accuracy: {accuracy}%")

100%|████████████████████████████████████████| 782/782 [00:02<00:00, 368.23it/s]

Epoch 2, Memorization Accuracy: 0.0%





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>