<a href="https://colab.research.google.com/github/tayademadhuri/IE643_Project/blob/main/Small_data_Squad_weights_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Install requirements and  Libraries**

In [2]:
!pip install transformers torch datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
import torch.nn as nn
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import default_data_collator

In [4]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [5]:
import evaluate
from datasets import load_dataset

In [6]:
# Load Pre-trained Transformer Model and Tokenizer

model_name = "bert-base-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Use MOE

In [7]:
# Access and Print Original FFN Weights (Wk, Wv)
def print_ffn_weights(layer_idx):
    ffn_layer = model.bert.encoder.layer[layer_idx].intermediate.dense
    print(f"Original FFN Weights for Layer {layer_idx}:")
    print("Wk (FFN weight matrix):", ffn_layer.weight.data)
    print("Wv (FFN value matrix):", ffn_layer.bias.data)

layer_idx = 11  # last layer, example
print_ffn_weights(layer_idx)

# Define MoE Adapter Layer as Per Algorithm
class MoEAdapterLayer(nn.Module):
    def __init__(self, input_dim, expert_dim=256, num_experts=4, top_k=2):
        super(MoEAdapterLayer, self).__init__()
        self.experts_WA = nn.ModuleList([nn.Linear(input_dim, expert_dim) for _ in range(num_experts)])
        self.experts_WB = nn.ModuleList([nn.Linear(expert_dim, input_dim) for _ in range(num_experts)])
        self.router_Wg = nn.Linear(input_dim, num_experts)
        self.top_k = top_k

    def forward(self, h):
        batch_size = h.size(0)
        gating_probs = torch.softmax(self.router_Wg(h), dim=-1)  # Step: Wg(h)
        selected_experts = gating_probs.topk(self.top_k, dim=-1).indices  # Select top K experts per input
        activation = torch.zeros(batch_size, h.size(-1)).to(h.device)  # Initialize the activation

        for batch_idx, expert_indices in enumerate(selected_experts):
            expert_activations = []
            for expert_idx in expert_indices:
                expert_WA = self.experts_WA[expert_idx]
                expert_WB = self.experts_WB[expert_idx]
                ai = torch.relu(expert_WA(h[batch_idx]))  # Sparse activation: ReLU(h * WAi)
                expert_activations.append(expert_WB(ai))  # WB(ai)

            # Gather and sum activations
            activation[batch_idx] = torch.sum(torch.stack(expert_activations), dim=0)
        return activation + h  # Return F(h) + Adapter Output

# Add MoE Adapter to FFN Layer
def add_moe_adapter_to_ffn(model, layer_idx):
    ffn_layer = model.bert.encoder.layer[layer_idx].intermediate.dense
    input_dim = ffn_layer.in_features
    adapter_layer = MoEAdapterLayer(input_dim)
    model.bert.encoder.layer[layer_idx].intermediate.moe_adapter = adapter_layer
    print(f"Adapter with MoE added to FFN layer {layer_idx}.")

# Apply adapter to layer 11 as an example
add_moe_adapter_to_ffn(model, layer_idx)

# Print Modified FFN Weights (WA, WB)
print("FFN weights after adding adapter:")
print(model.bert.encoder.layer[layer_idx].intermediate.moe_adapter.experts_WA[0].weight.data)
print(model.bert.encoder.layer[layer_idx].intermediate.moe_adapter.experts_WB[0].weight.data)

# Save Modified Model
model.save_pretrained("modified_transformer_with_moe_adapter")
tokenizer.save_pretrained("modified_transformer_with_moe_adapter")


Original FFN Weights for Layer 11:
Wk (FFN weight matrix): tensor([[ 0.0587,  0.0547, -0.0215,  ...,  0.0323,  0.0472, -0.0258],
        [-0.0112, -0.0313, -0.0500,  ...,  0.0239,  0.0018,  0.0107],
        [ 0.0021,  0.0415, -0.0370,  ...,  0.0286,  0.0118, -0.0310],
        ...,
        [ 0.0422,  0.0793,  0.0202,  ...,  0.0414, -0.0607,  0.0131],
        [-0.0060,  0.0452,  0.0393,  ..., -0.0117,  0.0522,  0.0407],
        [-0.0437, -0.0512,  0.0376,  ...,  0.0037,  0.0824, -0.0069]])
Wv (FFN value matrix): tensor([-0.0976, -0.0618, -0.0515,  ..., -0.1151, -0.0466, -0.1224])
Adapter with MoE added to FFN layer 11.
FFN weights after adding adapter:
tensor([[ 0.0113,  0.0031,  0.0078,  ..., -0.0343, -0.0231,  0.0068],
        [ 0.0098,  0.0013, -0.0066,  ...,  0.0307, -0.0173, -0.0353],
        [ 0.0128,  0.0216, -0.0233,  ...,  0.0352, -0.0052,  0.0291],
        ...,
        [ 0.0158,  0.0358,  0.0023,  ...,  0.0338,  0.0006,  0.0054],
        [ 0.0074, -0.0075, -0.0061,  ..., -0.027

('modified_transformer_with_moe_adapter/tokenizer_config.json',
 'modified_transformer_with_moe_adapter/special_tokens_map.json',
 'modified_transformer_with_moe_adapter/vocab.txt',
 'modified_transformer_with_moe_adapter/added_tokens.json',
 'modified_transformer_with_moe_adapter/tokenizer.json')

In [8]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForQuestionAnswering

# Load modified model and tokenizer
model_path = "modified_transformer_with_moe_adapter"
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


Some weights of the model checkpoint at modified_transformer_with_moe_adapter were not used when initializing BertForQuestionAnswering: ['bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.0.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.0.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.1.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.1.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.2.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.2.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.3.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.3.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.0.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.0.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.1.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.1.weight', 'bert.encoder.layer.11.intermediate

**Dataset**

In [9]:
import evaluate
from datasets import load_dataset, DatasetDict # Import DatasetDict

# Load the SQuAD dataset
dataset1 = load_dataset("squad")

# Define the desired subset size (10%)
subset_size = 0.1

# Function to create a subset of a given split
def create_subset(split_name):
    num_samples = int(len(dataset1[split_name]) * subset_size)
    subset = dataset1[split_name].shuffle(seed=42).select(range(num_samples))
    return subset

# Create subsets for training and validation
train_subset = create_subset("train")
validation_subset = create_subset("validation")

# Create a new DatasetDict with the subsets
# Use DatasetDict constructor instead of dataset.from_dict
dataset = DatasetDict({
    "train": train_subset,
    "validation": validation_subset
})

# Print some info about the subsets
print(f"Original training set size: {len(dataset1['train'])}")
print(f"Training subset size: {len(train_subset)}")
print(f"Original validation set size: {len(dataset1['validation'])}")
print(f"Validation subset size: {len(validation_subset)}")

# Use the evaluate library to load the SQuAD metric (if needed)
metric = evaluate.load("squad")

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Original training set size: 87599
Training subset size: 8759
Original validation set size: 10570
Validation subset size: 1057


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 8759
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1057
    })
})

In [11]:
for i, example in enumerate(dataset["train"]):
    if len(tokenizer(example["question"], example["context"])["input_ids"]) > 384:
        break
example = dataset["train"][i]
example

{'id': '56fb7d7a8ddada1400cd6479',
 'title': 'Middle_Ages',
 'context': "Under the Capetian dynasty France slowly began to expand its authority over the nobility, growing out of the Île-de-France to exert control over more of the country in the 11th and 12th centuries. They faced a powerful rival in the Dukes of Normandy, who in 1066 under William the Conqueror (duke 1035–1087), conquered England (r. 1066–87) and created a cross-channel empire that lasted, in various forms, throughout the rest of the Middle Ages. Normans also settled in Sicily and southern Italy, when Robert Guiscard (d. 1085) landed there in 1059 and established a duchy that later became the Kingdom of Sicily. Under the Angevin dynasty of Henry II (r. 1154–89) and his son Richard I (r. 1189–99), the kings of England ruled over England and large areas of France,[W] brought to the family by Henry II's marriage to Eleanor of Aquitaine (d. 1204), heiress to much of southern France.[X] Richard's younger brother John (r. 11

In [12]:
len(tokenizer(example["question"], example["context"])["input_ids"])

422

In [13]:
def preprocess_function(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        max_length=384,
        # stride=128, # Remove stride or set to 0 to avoid overlapping tokens
        stride=0,  # Setting stride to 0 avoids overlapping tokens
        return_overflowing_tokens=False, # Set to False to avoid extra tokens
        padding="max_length"
    )
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8759 [00:00<?, ? examples/s]

Map:   0%|          | 0/1057 [00:00<?, ? examples/s]

In [14]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8759
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1057
    })
})

Using: transformer: bert-base-uncased

In [15]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForQuestionAnswering

# Load modified model and tokenizer
model_path = "bert-base-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
import evaluate
from datasets import load_dataset, DatasetDict # Import DatasetDict

# Load the SQuAD dataset
dataset1 = load_dataset("squad")

# Define the desired subset size (10%)
subset_size = 0.1

# Function to create a subset of a given split
def create_subset(split_name):
    num_samples = int(len(dataset1[split_name]) * subset_size)
    subset = dataset1[split_name].shuffle(seed=42).select(range(num_samples))
    return subset

# Create subsets for training and validation
train_subset = create_subset("train")
validation_subset = create_subset("validation")

# Create a new DatasetDict with the subsets
# Use DatasetDict constructor instead of dataset.from_dict
dataset = DatasetDict({
    "train": train_subset,
    "validation": validation_subset
})

# Print some info about the subsets
print(f"Original training set size: {len(dataset1['train'])}")
print(f"Training subset size: {len(train_subset)}")
print(f"Original validation set size: {len(dataset1['validation'])}")
print(f"Validation subset size: {len(validation_subset)}")

# Use the evaluate library to load the SQuAD metric (if needed)
metric = evaluate.load("squad")


Original training set size: 87599
Training subset size: 8759
Original validation set size: 10570
Validation subset size: 1057


In [17]:
metric

EvaluationModule(name: "squad", module_type: "metric", features: {'predictions': {'id': Value(dtype='string', id=None), 'prediction_text': Value(dtype='string', id=None)}, 'references': {'id': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}}, usage: """
Computes SQuAD scores (F1 and EM).
Args:
    predictions: List of question-answers dictionaries with the following key-values:
        - 'id': id of the question-answer pair as given in the references (see below)
        - 'prediction_text': the text of the answer
    references: List of question-answers dictionaries with the following key-values:
        - 'id': id of the question-answer pair (see above),
        - 'answers': a Dict in the SQuAD dataset format
            {
                'text': list of possible texts for the answer, as a list of strings
                'answer_start': list of start positions for 

In [18]:
def preprocess_function(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        max_length=384,
        # stride=128, # Remove stride or set to 0 to avoid overlapping tokens
        stride=0,  # Setting stride to 0 avoids overlapping tokens
        return_overflowing_tokens=False, # Set to False to avoid extra tokens
        padding="max_length"
    )
    return inputs

tokenized_dataset = train_subset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8759 [00:00<?, ? examples/s]

**Fine tunned Squad dataset using Bert base model**

In [26]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset


#2.  Load the SQuAD dataset
dataset1 = load_dataset("squad")

# Define the desired subset size (10%)
subset_size = 0.1

# Function to create a subset of a given split
def create_subset(split_name):
    num_samples = int(len(dataset1[split_name]) * subset_size)
    subset = dataset1[split_name].shuffle(seed=42).select(range(num_samples))
    return subset

# Create subsets for training and validation
train_subset = create_subset("train")
validation_subset = create_subset("validation")

# Create a new DatasetDict with the subsets
# Use DatasetDict constructor instead of dataset.from_dict
dataset = DatasetDict({
    "train": train_subset,
    "validation": validation_subset
})

# 3. Initialize tokenizer and model
#model_name = "modified_transformer_with_moe_adapter"
model_name = "distilbert-base-uncased"  # or any other model like "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# 4. Preprocess data for question answering
def preprocess(example):
    questions = [q.strip() for q in example["question"]]
    contexts = [c.strip() for c in example["context"]]
    answers = example["answers"]

    # Tokenize inputs
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation=True,
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

    # Map start and end positions
    offset_mapping = tokenized_examples.pop("offset_mapping")
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Retrieve corresponding sample and answer
        sample_index = sample_mapping[i]
        answer = answers[sample_index]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Find the start and end token positions
        token_start_index, token_end_index = 0, len(input_ids) - 1
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        token_start_index -= 1
        while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        token_end_index += 1

        # Add bounds check for token_start_index and token_end_index
        token_start_index = max(0, min(token_start_index, len(offsets) - 1))
        token_end_index = max(0, min(token_end_index, len(offsets) - 1))

        # Assign positions or assign CLS if no answer
        if offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char:
            tokenized_examples["start_positions"].append(token_start_index)
            tokenized_examples["end_positions"].append(token_end_index)
        else:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)

    return tokenized_examples

# Apply preprocessing to the dataset
tokenized_datasets = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)

# 5. Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
)

# 6. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# 7. Start training
trainer.train()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8759 [00:00<?, ? examples/s]

Map:   0%|          | 0/1057 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.0004,0.000166
2,0.0002,7.8e-05
3,0.0001,5.8e-05


TrainOutput(global_step=3318, training_loss=0.030461231103439766, metrics={'train_runtime': 1122.6106, 'train_samples_per_second': 23.645, 'train_steps_per_second': 2.956, 'total_flos': 2601041915437056.0, 'train_loss': 0.030461231103439766, 'epoch': 3.0})

In [27]:
# Evaluate on dataset
squad_results = trainer.evaluate()

print(f"Squad Evaluation results: {squad_results}")

Squad Evaluation results: {'eval_loss': 5.845135456183925e-05, 'eval_runtime': 13.7821, 'eval_samples_per_second': 78.145, 'eval_steps_per_second': 9.795, 'epoch': 3.0}


In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
from google.colab import drive
drive.mount('/content/drive')

# Define save path in Google Drive
#model_save_path = '/content/drive/My Drive/finetuning/squad_weights_finetunned_bert'
model_save_path = '/content/drive/My Drive/finetuning/squad/squad_finetunned_bert'

# Save the fine-tuned model and tokenizer to Google Drive
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model and tokenizer saved to /content/drive/My Drive/finetuning/squad/squad_finetunned_bert


Checking model performance

In [45]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load pre-trained model and tokenizer
#model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model_name ="/content/drive/My Drive/finetuning/squad/squad_finetunned_bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Define question and context
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."

# Tokenize inputs
inputs = tokenizer(
    question,
    context,
    truncation=True,
    padding="max_length",
    max_length=384,
    return_tensors="pt"
)

# Get predictions
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Extract start and end logits
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get start and end indices
start_index = torch.argmax(start_logits, dim=1).item()
end_index = torch.argmax(end_logits, dim=1).item()

# Decode the predicted answer
if start_index <= end_index:  # Ensure valid indices
    answer_tokens = inputs["input_ids"][0][start_index:end_index + 1]
    predicted_answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
else:
    predicted_answer = "Unable to find a valid answer."

print(f"Question: {question}")
print(f"Context: {context}")
print(f"Predicted Answer: {predicted_answer}")


Question: What is the capital of France?
Context: France is a country in Europe. The capital of France is Paris.
Predicted Answer: 


#fine tunned model using bert based uncased


In [46]:
import torch
from transformers import BertModel, BertTokenizer

# Initialize BERT model
model = BertModel.from_pretrained('/content/drive/My Drive/finetuning/squad/squad_finetunned_bert')
tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/finetuning/squad/squad_finetunned_bert')

# Example input
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt")

# Move model to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Check memory before forward pass
print(f"Memory allocated before forward pass: {torch.cuda.memory_allocated(device)} bytes")

# Forward pass
outputs = model(**inputs.to(device))

# Check memory after forward pass
print(f"Memory allocated after forward pass: {torch.cuda.memory_allocated(device)} bytes")


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/My Drive/finetuning/squad/squad_finetunned_bert and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermed

Memory allocated before forward pass: 1257196032 bytes
Memory allocated after forward pass: 1262012416 bytes


In [47]:
import psutil
import os

# Get current process
process = psutil.Process(os.getpid())

# Get memory usage in MB
print(f"Memory usage: {process.memory_info().rss / 1024 / 1024} MB")


Memory usage: 2405.84375 MB


**Fine Tunned Squad dataset using modified weights transformer**

In [None]:
# 1. Install dependencies
# !pip install transformers datasets

import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset

#2.  Load the SQuAD dataset
dataset1 = load_dataset("squad")

# Define the desired subset size (10%)
subset_size = 0.1

# Function to create a subset of a given split
def create_subset(split_name):
    num_samples = int(len(dataset1[split_name]) * subset_size)
    subset = dataset1[split_name].shuffle(seed=42).select(range(num_samples))
    return subset

# Create subsets for training and validation
train_subset = create_subset("train")
validation_subset = create_subset("validation")

# Create a new DatasetDict with the subsets
# Use DatasetDict constructor instead of dataset.from_dict
dataset = DatasetDict({
    "train": train_subset,
    "validation": validation_subset
})

# 3. Initialize tokenizer and model
model_name = "modified_transformer_with_moe_adapter"
#model_name = "distilbert-base-uncased"  # or any other model like "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# 4. Preprocess data for question answering
def preprocess(example):
    questions = [q.strip() for q in example["question"]]
    contexts = [c.strip() for c in example["context"]]
    answers = example["answers"]

    # Tokenize inputs
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation=True,
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

    # Map start and end positions
    offset_mapping = tokenized_examples.pop("offset_mapping")
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Retrieve corresponding sample and answer
        sample_index = sample_mapping[i]
        answer = answers[sample_index]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Find the start and end token positions
        token_start_index, token_end_index = 0, len(input_ids) - 1
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        token_start_index -= 1
        while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        token_end_index += 1

        # Add bounds check for token_start_index and token_end_index
        token_start_index = max(0, min(token_start_index, len(offsets) - 1))
        token_end_index = max(0, min(token_end_index, len(offsets) - 1))

        # Assign positions or assign CLS if no answer
        if offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char:
            tokenized_examples["start_positions"].append(token_start_index)
            tokenized_examples["end_positions"].append(token_end_index)
        else:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)

    return tokenized_examples

# Apply preprocessing to the dataset
tokenized_datasets = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)

# 5. Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
)

# 6. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# 7. Start training
trainer.train()



Some weights of the model checkpoint at modified_transformer_with_moe_adapter were not used when initializing BertForQuestionAnswering: ['bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.0.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.0.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.1.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.1.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.2.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.2.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.3.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WA.3.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.0.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.0.weight', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.1.bias', 'bert.encoder.layer.11.intermediate.moe_adapter.experts_WB.1.weight', 'bert.encoder.layer.11.intermediate

Map:   0%|          | 0/8759 [00:00<?, ? examples/s]

Map:   0%|          | 0/1057 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.0003,8.3e-05
2,0.0001,4.7e-05
3,0.0001,3.8e-05


TrainOutput(global_step=3318, training_loss=0.02926873161124583, metrics={'train_runtime': 2065.4424, 'train_samples_per_second': 12.851, 'train_steps_per_second': 1.606, 'total_flos': 5201895833100288.0, 'train_loss': 0.02926873161124583, 'epoch': 3.0})

In [None]:
#wandb key :6d725493110a315b8e9782d2e8f21458277d93d5

In [None]:
# Evaluate on dataset
squad_results = trainer.evaluate()

print(f"Squad Evaluation results: {squad_results}")

Squad Evaluation results: {'eval_loss': 3.8051610317779705e-05, 'eval_runtime': 22.3331, 'eval_samples_per_second': 48.224, 'eval_steps_per_second': 6.045, 'epoch': 3.0}


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

from google.colab import drive
drive.mount('/content/drive')
# Define the save path in Google Drive

model_save_path = '/content/drive/My Drive/finetuning/squad/squad_weights_bert'

# Load the saved model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_save_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def answer_question(question, context):
    # Encode the inputs for the model
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=384)

    # Pass the inputs through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the start and end logits
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Get the most probable start and end positions
    start_idx = torch.argmax(start_logits)
    end_idx = torch.argmax(end_logits) + 1  # +1 to include the end token

    # Decode the answer from the input IDs
    answer_tokens = inputs['input_ids'][0][start_idx:end_idx]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer


In [None]:
question = "What are the symptoms of COVID-19?"
context = "COVID-19 symptoms include fever, cough, and difficulty breathing among others."

answer = answer_question(question, context)
print("Answer:", answer)


Answer: 


Checking Memory Usage During Model Initialization

In [None]:
import torch
from transformers import BertModel, BertTokenizer

# Initialize BERT model
model = BertModel.from_pretrained('/content/drive/My Drive/finetuning/squad/squad_weights_bert')
tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/finetuning/squad/squad_weights_bert')

# Example input
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt")

# Move model to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Check memory before forward pass
print(f"Memory allocated before forward pass: {torch.cuda.memory_allocated(device)} bytes")

# Forward pass
outputs = model(**inputs.to(device))

# Check memory after forward pass
print(f"Memory allocated after forward pass: {torch.cuda.memory_allocated(device)} bytes")


Use psutil (For CPU Memory) This can give the memory usage on the system level. You can monitor the memory usage of your process while loading or fine-tuning a model.

In [49]:
import psutil
import os

# Get current process
process = psutil.Process(os.getpid())

# Get memory usage in MB
print(f"Memory usage: {process.memory_info().rss / 1024 / 1024} MB")

Memory usage: 2080.984375 MB


Checking model performance

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load pre-trained model and tokenizer
#model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model_name ="/content/drive/My Drive/finetuning/squad/squad_weights_bert" # Removed the trailing single quote
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Define question and context
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."

# Tokenize inputs
inputs = tokenizer(
    question,
    context,
    truncation=True,
    padding="max_length",
    max_length=384,
    return_tensors="pt"
)

# Get predictions
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Extract start and end logits
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get start and end indices
start_index = torch.argmax(start_logits, dim=1).item()
end_index = torch.argmax(end_logits, dim=1).item()

# Decode the predicted answer
if start_index <= end_index:  # Ensure valid indices
    answer_tokens = inputs["input_ids"][0][start_index:end_index + 1]
    predicted_answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
else:
    predicted_answer = "Unable to find a valid answer."

print(f"Question: {question}")
print(f"Context: {context}")
print(f"Predicted Answer: {predicted_answer}")


Question: What is the capital of France?
Context: France is a country in Europe. The capital of France is Paris.
Predicted Answer: 


In [1]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load pre-trained model and tokenizer
model_name ="/content/drive/My Drive/finetuning/squad/squad_weights_bert"
#model_name ="bert-large-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Define question and context
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."

# Tokenize inputs
inputs = tokenizer(
    question,
    context,
    truncation=True,
    padding="max_length",
    max_length=384,
    return_tensors="pt"
)

# Get predictions
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Extract start and end logits
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get start and end indices
start_index = torch.argmax(start_logits, dim=1).item()
end_index = torch.argmax(end_logits, dim=1).item()

# --- Debugging ---
print(f"Start index: {start_index}")
print(f"End index: {end_index}")
print(f"Start logits: {start_logits}")
print(f"End logits: {end_logits}")
# --- End Debugging ---

# Decode the predicted answer
# Adjust logic to handle cases where end_index is before start_index
if start_index <= end_index:
    answer_tokens = inputs["input_ids"][0][start_index : end_index + 1]
    predicted_answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
else:
    # If end_index is before start_index, consider the highest start_logit as the answer
    start_index = torch.argmax(start_logits).item()
    predicted_answer = tokenizer.decode(inputs["input_ids"][0][start_index], skip_special_tokens=True)
    print("Warning: End index before start index. Using highest start logit as answer.")

print(f"Question: {question}")
print(f"Context: {context}")
print(f"Predicted Answer: {predicted_answer}")

Start index: 0
End index: 0
Start logits: tensor([[ 8.0861, -7.1891, -7.7306, -7.8118, -7.8425, -7.8792, -7.4537, -6.0438,
         -6.1798, -7.8643, -7.8966, -7.9340, -7.9262, -7.9059, -7.9165, -7.8833,
         -7.9115, -7.9525, -7.9296, -7.9443, -7.8704, -7.9511, -7.8848, -6.1849,
         -7.9678, -7.9705, -7.9677, -7.9687, -7.9728, -7.9731, -7.9704, -7.9666,
         -7.9619, -7.9679, -7.9648, -7.9645, -7.9613, -7.9501, -7.9638, -7.9684,
         -7.9663, -7.9649, -7.9617, -7.9646, -7.9660, -7.9594, -7.9568, -7.9560,
         -7.9552, -7.9600, -7.9590, -7.9559, -7.9611, -7.9669, -7.9709, -7.9723,
         -7.9655, -7.9642, -7.9703, -7.9690, -7.9683, -7.9537, -7.9422, -7.9618,
         -7.9458, -7.9631, -7.9574, -7.9483, -7.9692, -7.9693, -7.9650, -7.9666,
         -7.9627, -7.9618, -7.9597, -7.9518, -7.9505, -7.9570, -7.9591, -7.9627,
         -7.9640, -7.9620, -7.9690, -7.9728, -7.9700, -7.9685, -7.9627, -7.9695,
         -7.9734, -7.9653, -7.9624, -7.9584, -7.9560, -7.9658, -7.9

Testing using our model

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

from google.colab import drive
drive.mount('/content/drive')
# Define the save path in Google Drive
model_save_path = '/content/drive/My Drive/finetuning/squad_weights_finetunned_bert'

# Load the saved model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_save_path)

Mounted at /content/drive


In [None]:
def answer_question(question, context):
    # Encode the inputs for the model using 'pt' for PyTorch tensors
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=384)

    # Pass the inputs through the model
    outputs = model(**inputs)

    # Extract the start and end logits
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Get the most probable start and end positions
    start_idx = torch.argmax(start_logits, dim=-1)  # Use torch.argmax for PyTorch
    end_idx = torch.argmax(end_logits, dim=-1) + 1  # Use torch.argmax and +1 for PyTorch

    # Decode the answer from the input IDs
    answer_tokens = inputs['input_ids'][0][start_idx[0]:end_idx[0]]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer

In [None]:
question = "What are the symptoms of COVID-19?"
context = "COVID-19 symptoms include fever, cough, and difficulty breathing among others."

answer = answer_question(question, context)
print("Answer:", answer)

Answer: 


In [None]:
#wandb key :6d725493110a315b8e9782d2e8f21458277d93d5