In [1]:
# Install necessary libraries
!pip install transformers datasets

# Import libraries
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel, Trainer, TrainingArguments
from datasets import DatasetDict, concatenate_datasets
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import random
import os


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
from datasets import load_dataset

ds = load_dataset("PiC/phrase_similarity")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.47k [00:00<?, ?B/s]

phrase_similarity.py:   0%|          | 0.00/4.75k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

PS-hard/validation/0000.parquet:   0%|          | 0.00/202k [00:00<?, ?B/s]

PS-hard/test/0000.parquet:   0%|          | 0.00/403k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7004 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [3]:
train_label0 = ds['train'].filter(lambda x: x['label'] == 0)
train_label1 = ds['train'].filter(lambda x: x['label'] == 1)

train_label0 = train_label0.shuffle(seed=42)
train_label1 = train_label1.shuffle(seed=42)

train_label0 = train_label0.select(range(500))
train_label1 = train_label1.select(range(500))

balanced_train = concatenate_datasets([train_label0, train_label1])

balanced_ds = DatasetDict({
    'train': balanced_train,
    'validation': ds['validation'],
    'test': ds['test']
})

print("Balanced Training Set Labels Distribution:")
label_counts = torch.bincount(torch.tensor(balanced_ds['train']['label']))
labels = [balanced_ds['train'].features['label'].int2str(i) for i in range(len(label_counts))]
for label, count in zip(labels, label_counts):
    print(f"Label {label}: {count}")


Filter:   0%|          | 0/7004 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7004 [00:00<?, ? examples/s]

Balanced Training Set Labels Distribution:
Label negative: 500
Label positive: 500


In [4]:
# Initialize the DistilBERT tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    # Tokenize sentence1 with phrase1
    tokenized_s1 = tokenizer(
        examples['phrase1'],
        examples['sentence1'],
        truncation=True,
        padding='longest',  # Dynamic padding
        max_length=64
    )

    # Tokenize sentence2 with phrase2
    tokenized_s2 = tokenizer(
        examples['phrase2'],
        examples['sentence2'],
        truncation=True,
        padding='longest',  # Dynamic padding
        max_length=64
    )

    # Add the tokenized inputs to the examples
    examples['input_ids_s1'] = tokenized_s1['input_ids']
    examples['attention_mask_s1'] = tokenized_s1['attention_mask']
    examples['input_ids_s2'] = tokenized_s2['input_ids']
    examples['attention_mask_s2'] = tokenized_s2['attention_mask']

    return examples

# Apply the tokenization to all splits in the balanced dataset
tokenized_ds = balanced_ds.map(tokenize_function, batched=True)

# Remove original text columns as they are no longer needed
tokenized_ds = tokenized_ds.remove_columns(['phrase1', 'phrase2', 'sentence1', 'sentence2', 'idx'])

# Rename 'label' to 'labels' as expected by Trainer
tokenized_ds = tokenized_ds.rename_column("label", "labels")

# Set the format to PyTorch tensors
tokenized_ds.set_format(type='torch', columns=['input_ids_s1', 'attention_mask_s1',
                                              'input_ids_s2', 'attention_mask_s2', 'labels'])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [5]:
def create_siamese_dataset(dataset, num_pairs=1000):
    dataset = dataset.shuffle(seed=42)

    # Select num_pairs/2 similar and num_pairs/2 dissimilar pairs
    similar_pairs = dataset.filter(lambda x: x['labels'] == 1).select(range(num_pairs//2))
    dissimilar_pairs = dataset.filter(lambda x: x['labels'] == 0).select(range(num_pairs//2))

    # Combine similar and dissimilar pairs
    paired_samples = []
    for sample in similar_pairs:
        paired_samples.append({
            'input_ids_1': sample['input_ids_s1'],
            'attention_mask_1': sample['attention_mask_s1'],
            'input_ids_2': sample['input_ids_s2'],
            'attention_mask_2': sample['attention_mask_s2'],
            'labels': 1
        })
    for sample in dissimilar_pairs:
        paired_samples.append({
            'input_ids_1': sample['input_ids_s1'],
            'attention_mask_1': sample['attention_mask_s1'],
            'input_ids_2': sample['input_ids_s2'],
            'attention_mask_2': sample['attention_mask_s2'],
            'labels': 0
        })

    # Shuffle the paired samples
    random.shuffle(paired_samples)

    # Create a new DatasetDict
    siamese_dataset = DatasetDict({
        'train': paired_samples[:800],
        'validation': paired_samples[800:900],
        'test': paired_samples[900:]
    })

    return siamese_dataset

siamese_ds = create_siamese_dataset(tokenized_ds['train'], num_pairs=1000)

print("Example Siamese Pair Sample:")
print(siamese_ds['train'][0])


Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Example Siamese Pair Sample:
{'input_ids_1': tensor([  101,  2364,  9141,   102,  2011,  2008,  2051,  1010,  2053,  2784,
        23849,  2001,  3378,  2007,  1996,  2364,  9141,  1997,  5965,  1010,
         2975,  1996,  2415,  3929,  6086,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0]), 'attention_mask_1': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]), 'input_ids_2': tensor([  101,  2350,  9963,   102,  2011,  2008,  2051,  1010,  2053,  2784,
        23849,  2001,  3378,  2007,  1996,  2350,  9963,  1997,  5965,  1010,
         2975,  1996,  2415,  3929,  6086,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0, 

In [6]:
class SiameseModel(nn.Module):
    def __init__(self, model_name='distilbert-base-uncased', embedding_dim=256):
        super(SiameseModel, self).__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, embedding_dim)
        self.relu = nn.ReLU()
        self.cosine_similarity = nn.CosineSimilarity(dim=1)
        self.sigmoid = nn.Sigmoid()

    def encode(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        embedding = self.fc(cls_output)
        embedding = self.relu(embedding)
        return embedding

    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2, labels=None):
        embedding1 = self.encode(input_ids_1, attention_mask_1)
        embedding2 = self.encode(input_ids_2, attention_mask_2)
        cosine_sim = self.cosine_similarity(embedding1, embedding2)
        logits = self.sigmoid(cosine_sim)

        outputs = {'logits': logits}

        if labels is not None:
            loss_fn = nn.BCELoss()
            loss = loss_fn(logits, labels.float())
            outputs['loss'] = loss

        return outputs


In [7]:
def compute_metrics_classification(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
    }

def compute_metrics_siamese(pred):
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
    }


In [13]:
from transformers import DataCollatorWithPadding

# Custom data collator to handle Siamese inputs
class SiameseDataCollatorWithPadding(DataCollatorWithPadding):
    def __call__(self, features):
        # Separate the inputs for the two branches of the Siamese network
        features_1 = [{'input_ids': f['input_ids_1'], 'attention_mask': f['attention_mask_1']} for f in features]
        features_2 = [{'input_ids': f['input_ids_2'], 'attention_mask': f['attention_mask_2']} for f in features]

        # Pad the inputs for each branch separately - I am still uncertain about this part of the code
        batch_1 = super().__call__(features_1)
        batch_2 = super().__call__(features_2)

        # Combine the padded inputs and labels into a single batch
        batch = {
            'input_ids_1': batch_1['input_ids'],
            'attention_mask_1': batch_1['attention_mask'],
            'input_ids_2': batch_2['input_ids'],
            'attention_mask_2': batch_2['attention_mask'],
            'labels': torch.tensor([f['labels'] for f in features]),
        }

        return batch

# Initialize the custom data collator
siamese_data_collator = SiameseDataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# Define training arguments
training_args_siamese = TrainingArguments(
    output_dir='./results_siamese',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs_siamese',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=True,  # Enable mixed precision
    report_to=[],  # Disable Weights & Biases
)




In [15]:
# Initialize SiameseModel
siamese_model = SiameseModel(model_name='distilbert-base-uncased', embedding_dim=256)

# Initialize Trainer
trainer_siamese = Trainer(
    model=siamese_model,
    args=training_args_siamese,
    train_dataset=siamese_ds['train'],
    eval_dataset=siamese_ds['validation'],
    compute_metrics=compute_metrics_siamese,
    data_collator=siamese_data_collator
)


In [16]:
# Train SiameseModel
trainer_siamese.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.739,0.708395,0.49,0.322282
2,0.6593,0.700605,0.49,0.322282
3,0.6127,0.696852,0.49,0.322282


TrainOutput(global_step=150, training_loss=0.670342140197754, metrics={'train_runtime': 1531.3967, 'train_samples_per_second': 1.567, 'train_steps_per_second': 0.098, 'total_flos': 0.0, 'train_loss': 0.670342140197754, 'epoch': 3.0})

In [26]:
# Inspect the first sample in the test set after tokenization
print("Test Set Sample:", tokenized_ds['train'][0])


Test Set Sample: {'labels': tensor(0), 'input_ids_s1': tensor([ 101, 2434, 2600,  102, 2493, 3443, 2434, 2600, 1010, 5099, 1011, 6154,
        1010, 3769, 1010, 3968, 2213, 1010, 8036, 1010, 1998, 3458, 1999, 1996,
        2110, 1011, 1997, 1011, 1996, 1011, 2396, 4943, 1044, 1012, 1055, 1012,
         102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]), 'attention_mask_s1': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]), 'input_ids_s2': tensor([ 101, 2034, 2962,  102, 2493, 3443, 2034, 2962, 1010, 5099, 1011, 6154,
        1010, 3769, 1010, 3968, 2213, 1010, 8036, 1010, 1998, 3458, 1999, 1996,
        2110, 1011, 1997, 1011, 1996, 1011, 2396, 4943, 1044, 1012, 1055, 1012,
         102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0]), 'attention_mask

In [29]:
# Evaluate SiameseModel on Test Set
siamese_test_results = trainer_siamese.evaluate(siamese_ds['train'])
print(f"Siamese Model Test Results: {siamese_test_results}")


Siamese Model Test Results: {'eval_loss': 0.660474419593811, 'eval_accuracy': 0.49875, 'eval_f1': 0.33194537114261885, 'eval_runtime': 137.6991, 'eval_samples_per_second': 5.81, 'eval_steps_per_second': 0.363, 'epoch': 3.0}


In [30]:
siamese_ds_test = create_siamese_dataset(tokenized_ds['test'], num_pairs=1000)

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [31]:
siamese_test_results = trainer_siamese.evaluate(siamese_ds_test['test'])
print(f"Siamese Model Test Results: {siamese_test_results}")

Siamese Model Test Results: {'eval_loss': 0.6995209455490112, 'eval_accuracy': 0.52, 'eval_f1': 0.35578947368421054, 'eval_runtime': 18.9927, 'eval_samples_per_second': 5.265, 'eval_steps_per_second': 0.369, 'epoch': 3.0}
