In [1]:
pip install accelerate -U



In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/7.9 MB[0m [31m3.4 MB/s[0m eta [36m0:00:03[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/7.9 MB[0m [31m52.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.9/7.9 MB[0m [31m86.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sa

In [3]:
# import the necessary libraries
import numpy as np
import pandas as pd
from google.colab import drive
import torch
import torch.nn as nn
import transformers
from transformers import BertForMultipleChoice, BertTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset

In [4]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Load data from .npy file
data = np.load("/content/gdrive/MyDrive/sentence_puzzle.npy", allow_pickle=True)[()]
len(data)

627

In [6]:
# Split data into training and validation
split_ratio = 0.8

total_samples = len(data)
split_index = int(total_samples * split_ratio)

training_data = data[:split_index]
validation_data = data[split_index:]

# Extract questions, options, and correct_indices for training and validation sets
questions = [entry['question'] for entry in training_data]
options = [entry['choice_list'] for entry in training_data]
correct_indices = [entry['label'] for entry in training_data]

validation_questions = [entry['question'] for entry in validation_data]
validation_options = [entry['choice_list'] for entry in validation_data]
validation_correct_indices = [entry['label'] for entry in validation_data]

In [7]:
len(training_data)

501

In [8]:
len(options)

501

In [9]:
import numpy as np
import torch

seed = 42  # You can choose any seed value
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f62476da3b0>

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
from transformers import AutoTokenizer
import numpy as np

# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize questions and options for training set
tokenized_training_data = []
for i in range(len(questions)):
    choices = options[i]
    input_ids = []
    attention_mask = []

    for choice in choices:
        # Combine the question and choice
        text = questions[i] + " " + choice

        # Tokenize and encode the text
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

        input_ids.append(inputs['input_ids'])
        attention_mask.append(inputs['attention_mask'])

    tokenized_training_data.append({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'correct_index': correct_indices[i]
    })

# Tokenize questions and options for validation set
tokenized_validation_data = []
for i in range(len(validation_questions)):
    choices = validation_options[i]
    input_ids = []
    attention_mask = []

    for choice in choices:
        # Combine the question and choice
        text = validation_questions[i] + " " + choice

        # Tokenize and encode the text
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

        input_ids.append(inputs['input_ids'])
        attention_mask.append(inputs['attention_mask'])

    tokenized_validation_data.append({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'correct_index': validation_correct_indices[i]
    })

# Save tokenized data as numpy arrays
np.save('tokenized_training_data.npy', tokenized_training_data)
np.save('tokenized_validation_data.npy', tokenized_validation_data)

In [12]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        # Extract questions, options, and correct indices
        input_ids = np.array([feature["input_ids"] for feature in features])
        attention_mask = np.array([feature["attention_mask"] for feature in features])
        labels = np.array([feature["correct_index"] for feature in features])

        batch_size = len(input_ids)
        num_choices = input_ids.shape[1]

        # Flatten features
        flattened_features = {
            "input_ids": input_ids.reshape(batch_size * num_choices, -1),
            "attention_mask": attention_mask.reshape(batch_size * num_choices, -1),
        }

        # Pad and reshape
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Reshape the batch
        batch = {
            k: v.view(batch_size, num_choices, -1) for k, v in batch.items()
        }
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [13]:
from transformers import AutoTokenizer

# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize questions and options for training set
tokenized_training_data = []
max_sequence_length = 128  # Define your desired sequence length

for i in range(len(questions)):
    choices = options[i]
    input_ids = []
    attention_mask = []

    for choice in choices:
        # Combine the question and choice
        text = questions[i] + " " + choice

        # Tokenize and encode the text with padding and truncation
        inputs = tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',  # Pad to the specified sequence length
            truncation=True,
            max_length=max_sequence_length,
        )

        input_ids.append(inputs['input_ids'])
        attention_mask.append(inputs['attention_mask'])

    tokenized_training_data.append({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'correct_index': correct_indices[i]
    })

# Tokenize questions and options for validation set
tokenized_validation_data = []
for i in range(len(validation_questions)):
    choices = validation_options[i]
    input_ids = []
    attention_mask = []

    for choice in choices:
        # Combine the question and choice
        text = validation_questions[i] + " " + choice

        # Tokenize and encode the text with padding and truncation
        inputs = tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',  # Pad to the specified sequence length
            truncation=True,
            max_length=max_sequence_length,
        )

        input_ids.append(inputs['input_ids'])
        attention_mask.append(inputs['attention_mask'])

    tokenized_validation_data.append({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'correct_index': validation_correct_indices[i]
    })

# Save tokenized data as numpy arrays
np.save('tokenized_training_data.npy', tokenized_training_data)
np.save('tokenized_validation_data.npy', tokenized_validation_data)

In [14]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.data.data_collator import DataCollatorWithPadding

# Load your tokenized training and validation data
tokenized_training_data = np.load('tokenized_training_data.npy', allow_pickle=True)
tokenized_validation_data = np.load('tokenized_validation_data.npy', allow_pickle=True)

# Define a custom dataset class for multiple-choice questions
class MultipleChoiceDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            'input_ids': self.data[idx]['input_ids'],
            'attention_mask': self.data[idx]['attention_mask'],
            'labels': self.data[idx]['correct_index']
        }

# Create DataLoader for training and validation
batch_size = 16  # Adjust as needed
train_dataset = MultipleChoiceDataset(tokenized_training_data)
train_data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

validation_dataset = MultipleChoiceDataset(tokenized_validation_data)
validation_data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the model for multiple-choice tasks
model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")

# Define training arguments
training_args = TrainingArguments(
    output_dir="bert_fine",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,  # Set to True if you want to push the model to the Hugging Face Model Hub
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=train_data_collator,  # Use the data collator for training
    train_dataset=train_dataset,  # Use the dataset for training
    eval_dataset=validation_dataset,  # Use the dataset for validation
    tokenizer=tokenizer,
)

# Start training
trainer.train()

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.072366
2,No log,0.948788
3,No log,0.852217


TrainOutput(global_step=96, training_loss=0.5160951614379883, metrics={'train_runtime': 149.6202, 'train_samples_per_second': 10.045, 'train_steps_per_second': 0.642, 'total_flos': 395452365566976.0, 'train_loss': 0.5160951614379883, 'epoch': 3.0})

In [15]:
pip install huggingface_hub



In [16]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
trainer.push_to_hub()

events.out.tfevents.1699261586.b47965bbeb4a.810.0:   0%|          | 0.00/5.33k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

'https://huggingface.co/yashikam19/bert_fine/tree/main/'

In [19]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice

In [20]:
model = AutoModelForMultipleChoice.from_pretrained("bert_fine")

In [21]:
tokenizer = AutoTokenizer.from_pretrained("bert_fine")

In [22]:
def calculate_accuracy(model, dataloader):
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: value.to(model.device) for key, value in batch.items()}
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)
            correct += torch.sum(predicted_labels == labels).item()
            total += len(labels)
    return correct / total

In [None]:
from torch.utils.data import DataLoader

# Define your batch size
batch_size = 16

# Create a DataLoader for the validation dataset
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=batch_size,
    collate_fn=validation_data_collator,  # Use your validation_data_collator
)

# Calculate validation accuracy
validation_accuracy = calculate_accuracy(model, validation_dataloader)
print(f"Validation Accuracy: {validation_accuracy:.2%}")

In [23]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=a45a5b1054707e4a39c2c12b300a88e94d72f949e3bad97787c1832c37b9d634
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_tr

In [24]:
questions_total = [entry['question'] for entry in data]

In [25]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load a pre-trained model, for example, 'bert-base-nli-stsb-mean-tokens'
model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

# Encode the training sentences to obtain sentence embeddings
training_sentence_embeddings = model.encode(questions_total, convert_to_tensor=True)

# Calculate the cosine similarity between sentence pairs
cosine_similarity_matrix = util.pytorch_cos_sim(training_sentence_embeddings, training_sentence_embeddings)

# Convert the cosine similarity matrix to a numpy array
cosine_similarity_matrix_np = cosine_similarity_matrix.cpu().numpy()

# Now, cosine_similarity_matrix_np contains the cosine similarity between all pairs of training sentences.

Downloading (…)9ee55/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)a65059ee55/README.md:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

Downloading (…)55/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)5059ee55/config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)9ee55/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

Downloading (…)a65059ee55/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)059ee55/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [26]:
cosine_similarity_matrix_np

array([[ 0.9999999 ,  0.9258981 ,  0.34585118, ..., -0.08517139,
        -0.06214759,  0.06444744],
       [ 0.9258981 ,  1.0000001 ,  0.34618378, ..., -0.09307888,
        -0.07968254,  0.00843755],
       [ 0.34585118,  0.34618378,  0.99999976, ..., -0.10011345,
        -0.10154794,  0.02487706],
       ...,
       [-0.08517139, -0.09307888, -0.10011345, ...,  0.9999999 ,
         0.8265262 ,  0.41374984],
       [-0.06214759, -0.07968254, -0.10154794, ...,  0.8265262 ,
         1.0000001 ,  0.3279205 ],
       [ 0.06444744,  0.00843755,  0.02487706, ...,  0.41374984,
         0.3279205 ,  0.9999999 ]], dtype=float32)

In [36]:
import numpy as np

# Assuming you have calculated the cosine_similarity_matrix_np as shown in the previous answer
# Define a similarity threshold
similarity_threshold = 0.1  # Adjust this threshold as needed

# Create a mask for similarity values above the threshold
related_mask = abs(cosine_similarity_matrix_np) > similarity_threshold

# Count the number of related pairs
num_related_pairs = np.sum(related_mask)

# Calculate the percentage of related pairs
total_pairs = cosine_similarity_matrix_np.size
percentage_related = (num_related_pairs / total_pairs) * 100

print(f"Percentage of related pairs above {similarity_threshold}: {percentage_related:.2f}%")

Percentage of related pairs above 0.1: 65.01%
