<a href="https://colab.research.google.com/github/tayademadhuri/modified_transformer_with_moe_adapter_squad/blob/main/modified_transformer_with_moe_adapter_squad_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import default_data_collator

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import evaluate
from datasets import load_dataset

In [None]:
# Load the pre-trained BERT checkpoint and its tokenizer from the Hugging Face Hub.
# The recorded output for this cell reports that the question-answering head
# (qa_outputs.weight / qa_outputs.bias) is newly initialised for this checkpoint,
# so the model has to be fine-tuned before its predictions are meaningful.
model_name = "bert-base-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
# Access and Print Original FFN Weights (Wk, Wv)
def print_ffn_weights(layer_idx, target_model=None):
    """Print the two feed-forward (FFN) projection matrices of one encoder layer.

    "Wk" is the weight of the first FFN projection (``intermediate.dense``) and
    "Wv" is the weight of the second projection (``output.dense``). The previous
    version printed ``intermediate.dense.bias`` under the "Wv" label, which is a
    bias vector rather than the value matrix.

    Args:
        layer_idx: Index into ``model.bert.encoder.layer``.
        target_model: Model to inspect. When omitted, the notebook-level
            ``model`` variable is used, as before.
    """
    if target_model is None:
        target_model = model  # notebook global loaded in an earlier cell
    encoder_layer = target_model.bert.encoder.layer[layer_idx]
    print(f"Original FFN Weights for Layer {layer_idx}:")
    print("Wk (FFN weight matrix):", encoder_layer.intermediate.dense.weight.data)
    print("Wv (FFN value matrix):", encoder_layer.output.dense.weight.data)

# Encoder layer used as the running example; the same index is reused further
# down in this cell when the adapter is attached and its weights are printed.
layer_idx = 11  # last layer (0-based index), example
print_ffn_weights(layer_idx)

# Define MoE Adapter Layer as Per Algorithm
class MoEAdapterLayer(nn.Module):
    """Sparse mixture-of-experts bottleneck adapter with a residual connection.

    Every expert is a two-layer bottleneck ``WB(ReLU(WA(h)))``. A linear router
    scores the experts for each input vector, the ``top_k`` highest-scoring
    experts are evaluated, their outputs are summed and the input is added back.

    Args:
        input_dim: Size of the last dimension of the input (and of the output).
        expert_dim: Bottleneck width of each expert.
        num_experts: Number of experts.
        top_k: Number of experts evaluated per input vector.
        weight_by_gate: When True, each selected expert's output is multiplied
            by its router probability before summation. The default (False)
            keeps the original unweighted sum; note that in that mode only the
            top-k *indices* of the router are used, so the router weights
            receive no gradient from the adapter output.

    Raises:
        ValueError: If ``top_k`` is not in ``[1, num_experts]``.
    """

    def __init__(self, input_dim, expert_dim=256, num_experts=4, top_k=2, weight_by_gate=False):
        super().__init__()
        if not 1 <= top_k <= num_experts:
            raise ValueError(
                f"top_k must be between 1 and num_experts ({num_experts}), got {top_k}"
            )
        self.experts_WA = nn.ModuleList([nn.Linear(input_dim, expert_dim) for _ in range(num_experts)])
        self.experts_WB = nn.ModuleList([nn.Linear(expert_dim, input_dim) for _ in range(num_experts)])
        self.router_Wg = nn.Linear(input_dim, num_experts)
        self.top_k = top_k
        self.weight_by_gate = weight_by_gate

    def forward(self, h):
        """Apply the adapter to ``h`` of shape ``(..., input_dim)``.

        Any number of leading dimensions is accepted (e.g. ``(batch, dim)`` or
        ``(batch, seq_len, dim)``); routing is done independently for every
        vector along the last dimension. Returns a tensor shaped like ``h``.
        """
        original_shape = h.shape
        # Flatten all leading dimensions so each row is one routed vector.
        tokens = h.reshape(-1, original_shape[-1])

        gating_probs = torch.softmax(self.router_Wg(tokens), dim=-1)  # Step: Wg(h)
        top_probs, top_idx = gating_probs.topk(self.top_k, dim=-1)  # top K experts per vector

        # zeros_like keeps the device *and* dtype of the input.
        activation = torch.zeros_like(tokens)

        # Evaluate each expert once on all vectors routed to it instead of
        # looping over vectors one at a time.
        for expert_idx, (expert_WA, expert_WB) in enumerate(zip(self.experts_WA, self.experts_WB)):
            routed = top_idx == expert_idx  # (num_vectors, top_k)
            token_idx = routed.any(dim=-1).nonzero(as_tuple=True)[0]
            if token_idx.numel() == 0:
                continue
            ai = torch.relu(expert_WA(tokens[token_idx]))  # Sparse activation: ReLU(h * WAi)
            expert_out = expert_WB(ai)  # WB(ai)
            if self.weight_by_gate:
                # topk indices are distinct per row, so at most one slot matches.
                gate = (top_probs * routed).sum(dim=-1, keepdim=True)[token_idx]
                expert_out = expert_out * gate
            # Out-of-place accumulation keeps autograd bookkeeping simple.
            activation = activation.index_add(0, token_idx, expert_out.to(activation.dtype))

        # Residual connection, then restore the caller's shape.
        return (activation + tokens).reshape(original_shape)


def _moe_adapter_pre_hook(module, args):
    """Forward pre-hook: route the module's first positional input through its adapter."""
    return (module.moe_adapter(args[0]),) + tuple(args[1:])


# Add MoE Adapter to FFN Layer
def add_moe_adapter_to_ffn(model, layer_idx):
    """Attach a MoEAdapterLayer to one encoder layer's FFN and wire it into the forward pass.

    The adapter is stored as ``intermediate.moe_adapter`` (so its parameters are
    part of the model's ``state_dict``). Storing the attribute alone does not
    make the layer call it, so a forward pre-hook is registered on the
    ``intermediate`` module that passes the FFN input through the adapter before
    ``intermediate.dense`` sees it. Calling this function again for the same
    layer replaces the adapter and its hook instead of stacking a second hook.

    NOTE: hooks are not part of a saved checkpoint; after reloading a model
    from disk this function has to be called again (and the adapter weights
    restored) for the adapter to take effect.

    Args:
        model: Model exposing ``model.bert.encoder.layer``.
        layer_idx: Index of the encoder layer to modify.

    Returns:
        The newly created MoEAdapterLayer.
    """
    intermediate = model.bert.encoder.layer[layer_idx].intermediate
    ffn_layer = intermediate.dense
    input_dim = ffn_layer.in_features
    # Create the adapter on the same device / dtype as the layer it augments.
    adapter_layer = MoEAdapterLayer(input_dim).to(
        device=ffn_layer.weight.device, dtype=ffn_layer.weight.dtype
    )

    previous_hook = getattr(intermediate, "_moe_adapter_hook", None)
    if previous_hook is not None:
        previous_hook.remove()

    intermediate.moe_adapter = adapter_layer
    intermediate._moe_adapter_hook = intermediate.register_forward_pre_hook(_moe_adapter_pre_hook)
    print(f"Adapter with MoE added to FFN layer {layer_idx}.")
    return adapter_layer

# Apply adapter to layer 11 as an example
add_moe_adapter_to_ffn(model, layer_idx)

# Print Modified FFN Weights (WA, WB)
# These are the weights of the first expert (index 0) of the newly created
# adapter; the layer's existing intermediate.dense weights are not changed by
# add_moe_adapter_to_ffn.
print("FFN weights after adding adapter:")
print(model.bert.encoder.layer[layer_idx].intermediate.moe_adapter.experts_WA[0].weight.data)
print(model.bert.encoder.layer[layer_idx].intermediate.moe_adapter.experts_WB[0].weight.data)

# Save Modified Model
# NOTE(review): the recorded output of the later cells that reload this directory
# with AutoModelForQuestionAnswering.from_pretrained lists the moe_adapter.*
# weights as "not used", i.e. the reloaded model does not contain the adapter.
model.save_pretrained("modified_transformer_with_moe_adapter")
tokenizer.save_pretrained("modified_transformer_with_moe_adapter")

Original FFN Weights for Layer 11:
Wk (FFN weight matrix): tensor([[ 0.0587,  0.0547, -0.0215,  ...,  0.0323,  0.0472, -0.0258],
        [-0.0112, -0.0313, -0.0500,  ...,  0.0239,  0.0018,  0.0107],
        [ 0.0021,  0.0415, -0.0370,  ...,  0.0286,  0.0118, -0.0310],
        ...,
        [ 0.0422,  0.0793,  0.0202,  ...,  0.0414, -0.0607,  0.0131],
        [-0.0060,  0.0452,  0.0393,  ..., -0.0117,  0.0522,  0.0407],
        [-0.0437, -0.0512,  0.0376,  ...,  0.0037,  0.0824, -0.0069]])
Wv (FFN value matrix): tensor([-0.0976, -0.0618, -0.0515,  ..., -0.1151, -0.0466, -0.1224])
Adapter with MoE added to FFN layer 11.
FFN weights after adding adapter:
tensor([[ 0.0255,  0.0288,  0.0107,  ..., -0.0071,  0.0256, -0.0330],
        [ 0.0083, -0.0199,  0.0309,  ..., -0.0268,  0.0170,  0.0208],
        [-0.0213,  0.0029,  0.0253,  ...,  0.0227,  0.0066,  0.0313],
        ...,
        [-0.0140, -0.0336, -0.0017,  ...,  0.0177,  0.0061,  0.0077],
        [-0.0110, -0.0109,  0.0247,  ..., -0.035

('modified_transformer_with_moe_adapter/tokenizer_config.json',
 'modified_transformer_with_moe_adapter/special_tokens_map.json',
 'modified_transformer_with_moe_adapter/vocab.txt',
 'modified_transformer_with_moe_adapter/added_tokens.json',
 'modified_transformer_with_moe_adapter/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForQuestionAnswering

# Load modified model and tokenizer
# NOTE(review): the warning recorded below this cell reports that every
# bert.encoder.layer.11.intermediate.moe_adapter.* weight in the checkpoint was
# "not used" when initializing BertForQuestionAnswering. The object bound to
# `model` here therefore has no MoE adapter; it would have to be re-attached
# (and its weights restored) after loading for later cells to exercise it.
model_path = "modified_transformer_with_moe_adapter"
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Some weights of the model checkpoint at modified_transformer_with_moe_adapter were not used when initializing BertForQuestionAnswering: ['bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.0.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.0.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.1.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.1.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.2.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.2.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.3.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.3.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.0.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.0.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.1.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.1.weight', 'bert.encoder.layer.11.intermediate

In [None]:
import evaluate
from datasets import load_dataset

# Load the SQuAD dataset (no preprocessing happens in this cell).
# The recorded output shows a train split of 87599 examples and a validation
# split of 10570 examples.
dataset = load_dataset("squad")

# Use the evaluate library to load the SQuAD metric
# NOTE(review): `metric` is not used by any cell shown in this notebook.
metric = evaluate.load("squad")

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs into fixed-length model inputs.

    Uses the notebook-level ``tokenizer``. Each pair is truncated and padded to
    384 tokens and yields exactly one feature: the sliding window is disabled
    (``stride=0``) and overflowing tokens are not returned.

    Args:
        examples: Batch with ``"question"`` and ``"context"`` columns.

    Returns:
        The tokenizer output for the batch.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=384,
        stride=0,  # no overlapping windows
        return_overflowing_tokens=False,  # one feature per example
        padding="max_length",
    )
    return tokenizer(examples["question"], examples["context"], **tokenizer_options)


tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
def count_trainable_params(model):
    """Return the total element count of all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def count_activated_neurons(model, inputs):
    """Count positive pre-activations of the MoE adapters' WA projections for one forward pass.

    A forward hook is placed on every ``experts_WA`` linear layer of every
    encoder layer that has a ``moe_adapter`` attribute. Each time one of those
    layers runs, the number of strictly positive outputs (the units a following
    ReLU would keep) is added to the total.

    Changes from the previous version:
      * layers without ``moe_adapter`` are skipped instead of raising
        AttributeError (the adapter is only attached to selected layers);
      * all experts are monitored, not only expert 0;
      * hooks are removed in a ``finally`` block so they cannot leak if the
        forward pass raises;
      * the forward pass runs under ``torch.no_grad()`` because only counts are
        needed.

    Args:
        model: Model exposing ``model.bert.encoder.layer``; called as ``model(**inputs)``.
        inputs: Keyword arguments for the forward pass.

    Returns:
        Total number of positive WA outputs observed (int).

    Raises:
        ValueError: If no encoder layer has a ``moe_adapter`` attached.
    """
    activation_counts = 0

    def hook_fn(module, input, output):
        nonlocal activation_counts
        activation_counts += (output > 0).sum().item()  # positive outputs survive the ReLU

    hooks = []
    try:
        for layer in model.bert.encoder.layer:
            adapter = getattr(layer.intermediate, "moe_adapter", None)
            if adapter is None:
                continue
            for expert_WA in adapter.experts_WA:
                hooks.append(expert_WA.register_forward_hook(hook_fn))

        if not hooks:
            raise ValueError("model has no moe_adapter attached to any encoder layer")

        # Forward pass to count activations
        with torch.no_grad():
            _ = model(**inputs)
    finally:
        for hook in hooks:
            hook.remove()

    return activation_counts

# Example usage: count the trainable parameters of the currently loaded `model`.
# Only the parameter count is computed here; count_activated_neurons is not called.
trainable_params = count_trainable_params(model)
print(f"Number of trainable parameters: {trainable_params}")

Number of trainable parameters: 108893186


In [None]:
import torch

# Move model to GPU and use torch.cuda.memory_allocated to check memory usage
# NOTE(review): memory_before is read *after* the model has been moved to the
# GPU, so the difference printed below does not include the model weights.
model.to("cuda")
memory_before = torch.cuda.memory_allocated()

# Select only the required input features for the model
# (first 8 validation examples; other columns are dropped)
inputs = {k: v for k, v in tokenized_dataset["validation"][:8].items() if k in ["input_ids", "attention_mask", "token_type_ids"]}

# Convert the lists to PyTorch tensors
inputs = {k: torch.tensor(v).to("cuda") for k, v in inputs.items()}

# NOTE(review): the forward pass is not wrapped in torch.no_grad(), so the
# measured difference presumably includes tensors kept alive for autograd as
# well as the inputs and outputs — confirm this is the quantity intended by
# the "Memory used by the model" label.
outputs = model(**inputs)  # Run a sample batch
memory_after = torch.cuda.memory_allocated()
print(f"Memory used by the model: {(memory_after - memory_before) / (1024 ** 2):.2f} MB")

Memory used by the model: 1760.99 MB


In [None]:
# Value of torch.cuda.memory_allocated() read after model.to("cuda") and before
# the sample forward pass.
memory_before

436713984

In [None]:
# Value of torch.cuda.memory_allocated() read after the sample forward pass.
memory_after

2283248128

# Check the memory usage with bert-base-uncased

In [None]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForQuestionAnswering

# Load the unmodified baseline model and tokenizer (bert-base-uncased).
# This rebinds the notebook-level `model` and `tokenizer` variables.
model_path = "bert-base-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import evaluate
from datasets import load_dataset

# Load the SQuAD dataset (no preprocessing happens in this cell).
# This repeats the dataset/metric loading done in an earlier cell and rebinds
# the same `dataset` and `metric` names.
dataset = load_dataset("squad")

# Use the evaluate library to load the SQuAD metric
metric = evaluate.load("squad")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs into fixed-length model inputs.

    Uses the notebook-level ``tokenizer`` (looked up when the function is
    called). Each pair is truncated and padded to 384 tokens and yields exactly
    one feature: the sliding window is disabled (``stride=0``) and overflowing
    tokens are not returned.

    Args:
        examples: Batch with ``"question"`` and ``"context"`` columns.

    Returns:
        The tokenizer output for the batch.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=384,
        stride=0,  # no overlapping windows
        return_overflowing_tokens=False,  # one feature per example
        padding="max_length",
    )
    return tokenizer(examples["question"], examples["context"], **tokenizer_options)


tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
def count_trainable_params(model):
    """Return the total element count of all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def count_activated_neurons(model, inputs):
    """Count positive pre-activations of the MoE adapters' WA projections for one forward pass.

    A forward hook is placed on every ``experts_WA`` linear layer of every
    encoder layer that has a ``moe_adapter`` attribute. Each time one of those
    layers runs, the number of strictly positive outputs (the units a following
    ReLU would keep) is added to the total.

    Changes from the previous version:
      * layers without ``moe_adapter`` are skipped instead of raising
        AttributeError (the adapter is only attached to selected layers);
      * all experts are monitored, not only expert 0;
      * hooks are removed in a ``finally`` block so they cannot leak if the
        forward pass raises;
      * the forward pass runs under ``torch.no_grad()`` because only counts are
        needed.

    Args:
        model: Model exposing ``model.bert.encoder.layer``; called as ``model(**inputs)``.
        inputs: Keyword arguments for the forward pass.

    Returns:
        Total number of positive WA outputs observed (int).

    Raises:
        ValueError: If no encoder layer has a ``moe_adapter`` attached.
    """
    activation_counts = 0

    def hook_fn(module, input, output):
        nonlocal activation_counts
        activation_counts += (output > 0).sum().item()  # positive outputs survive the ReLU

    hooks = []
    try:
        for layer in model.bert.encoder.layer:
            adapter = getattr(layer.intermediate, "moe_adapter", None)
            if adapter is None:
                continue
            for expert_WA in adapter.experts_WA:
                hooks.append(expert_WA.register_forward_hook(hook_fn))

        if not hooks:
            raise ValueError("model has no moe_adapter attached to any encoder layer")

        # Forward pass to count activations
        with torch.no_grad():
            _ = model(**inputs)
    finally:
        for hook in hooks:
            hook.remove()

    return activation_counts

# Example usage: count the trainable parameters of the currently loaded `model`
# (bert-base-uncased in this section).
# NOTE(review): the recorded result, 108893186, is identical to the count
# recorded earlier for the model reloaded from
# "modified_transformer_with_moe_adapter", which is consistent with that
# reloaded model not containing any adapter parameters.
trainable_params = count_trainable_params(model)
print(f"Number of trainable parameters: {trainable_params}")

Number of trainable parameters: 108893186


In [None]:
# NOTE(review): no cell shown in this section reassigns memory_after after the
# bert-base-uncased model is loaded; the value displayed is the same as the one
# recorded for the earlier measurement, so it does not describe the baseline model.
memory_after

2283248128

In [None]:
# NOTE(review): no cell shown in this section reassigns memory_before after the
# bert-base-uncased model is loaded; the value displayed is the same as the one
# recorded for the earlier measurement, so it does not describe the baseline model.
memory_before

436713984

# Use the new model on the SQuAD dataset

In [None]:
# 1. Install dependencies (Uncomment if not installed)
# !pip install transformers datasets

import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset

# 2. Load SQuAD dataset
dataset = load_dataset("squad")

# 3. Initialize tokenizer and model
# NOTE(review): the warning recorded below this cell again lists the
# bert.encoder.layer.11.intermediate.moe_adapter.* checkpoint weights as
# "not used", so the model that gets trained here does not include the MoE adapter.
model_name = "modified_transformer_with_moe_adapter"
#model_name = "distilbert-base-uncased"  # or any other model like "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# 4. Preprocess data for question answering
def preprocess(example):
    """Tokenize a batch of SQuAD examples and label every feature with answer token positions.

    Long contexts are split into overlapping windows (``max_length=384``,
    ``stride=128``), so one example can yield several features. For each
    feature, ``start_positions`` / ``end_positions`` hold the token indices of
    the first answer, or the [CLS] index when the answer is not fully inside
    that feature's context window (or the example has no answer).

    Fixes relative to the previous version:
      * The answer search is limited to context tokens (``sequence_ids == 1``).
        Before, the backward scan started at the last position of the padded
        sequence, which is a padding or [SEP] token with offset (0, 0); the scan
        therefore stopped immediately and the feature was labelled with [CLS]
        regardless of where the answer was. The forward scan could likewise stop
        inside the question, whose offsets are relative to the question string.
      * Contexts are no longer stripped: ``answer_start`` is a character offset
        into the original context, so removing leading whitespace would shift it.
      * ``truncation="only_second"`` ensures only the context is truncated,
        never the question.
      * Examples with an empty answer list are labelled with [CLS] instead of
        raising IndexError.

    Args:
        example: Batch with ``"question"``, ``"context"`` and ``"answers"`` columns.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions`` added
        and ``offset_mapping`` / ``overflow_to_sample_mapping`` removed.
    """
    questions = [q.strip() for q in example["question"]]
    contexts = example["context"]
    answers = example["answers"]

    # Tokenize inputs
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

    # Map start and end positions
    offset_mapping = tokenized_examples.pop("offset_mapping")
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # 0 for question tokens, 1 for context tokens, None for special/padding tokens.
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Retrieve corresponding sample and answer
        answer = answers[sample_mapping[i]]
        if len(answer["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # First and last token of the context window in this feature.
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # Assign CLS if the answer is not fully inside this window.
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        # Last context token that starts at or before the answer start.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        tokenized_examples["start_positions"].append(token_start_index - 1)

        # First context token (scanning backwards) that ends at or after the answer end.
        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

# Apply preprocessing to the dataset
# The original columns are removed because one example can produce several
# features, so the output rows no longer line up with the input rows.
tokenized_datasets = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)

# 5. Set training arguments
# NOTE(review): report_to is not set; the recorded output of this cell shows an
# interactive wandb API-key prompt, which blocks unattended "Run All" execution.
# NOTE(review): newer transformers releases name this argument eval_strategy —
# confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
)

# 6. Initialize Trainer
# No compute_metrics function is passed, so evaluation reports loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# 7. Start training
trainer.train()


Some weights of the model checkpoint at modified_transformer_with_moe_adapter were not used when initializing BertForQuestionAnswering: ['bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.0.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.0.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.1.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.1.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.2.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.2.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.3.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.3.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.0.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.0.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.1.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.1.weight', 'bert.encoder.layer.11.intermediate

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


In [None]:
!pip install datasets --upgrade



# Save model and tokenizer locally

In [None]:
# Write the current `model` and `tokenizer` to the local directory
# "squad_trained_model1" in Hugging Face format.
model.save_pretrained("squad_trained_model1")
tokenizer.save_pretrained("squad_trained_model1")


('squad_trained_model1/tokenizer_config.json',
 'squad_trained_model1/special_tokens_map.json',
 'squad_trained_model1/vocab.txt',
 'squad_trained_model1/added_tokens.json',
 'squad_trained_model1/tokenizer.json')

In [None]:
# Colab-only: mount Google Drive and store a copy of the model there.
# `torch` is not imported in this cell; it relies on the import from an earlier cell.
from google.colab import drive
drive.mount('/content/drive')  # Mount Google Drive to '/content/drive'
# NOTE(review): torch.save(model, ...) pickles the whole module object rather
# than a state_dict; loading such a file presumably requires the same class
# definitions to be importable — confirm this is the intended format.
torch.save(model, '/content/drive/My Drive/squad_trained_model1.pt')  # Update the path

Mounted at /content/drive


In [None]:
# Colab-only: mount Google Drive. This repeats the import and mount call of the
# preceding cell.
from google.colab import drive
drive.mount('/content/drive')

# Inference:

*   Memory requirement using the modified_transformer_with_moe_adapter (modified bert-base-uncased) transformer: 2283272704 bytes
*   Memory requirement using the bert-base-uncased transformer: 4558627840 bytes

