In [1]:
# Import libraries
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
from datasets import load_dataset

ds = load_dataset("PiC/phrase_similarity")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.47k [00:00<?, ?B/s]

phrase_similarity.py:   0%|          | 0.00/4.75k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

PS-hard/validation/0000.parquet:   0%|          | 0.00/202k [00:00<?, ?B/s]

PS-hard/test/0000.parquet:   0%|          | 0.00/403k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7004 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [3]:
train_label0 = ds['train'].filter(lambda x: x['label'] == 0)
train_label1 = ds['train'].filter(lambda x: x['label'] == 1)

# Shuffle the datasets to ensure randomness
train_label0 = train_label0.shuffle(seed=42)
train_label1 = train_label1.shuffle(seed=42)

# Select 500 samples from each label
train_label0 = train_label0.select(range(500))
train_label1 = train_label1.select(range(500))
from datasets import DatasetDict, concatenate_datasets
# Concatenate the two subsets to form a balanced training set
balanced_train = concatenate_datasets([train_label0, train_label1])

# Create a new DatasetDict with the balanced training set
ds = DatasetDict({
    'train': balanced_train,
    'validation': ds['validation'],
    'test': ds['test']
})

Filter:   0%|          | 0/7004 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7004 [00:00<?, ? examples/s]

In [4]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from datasets import load_dataset

# Load tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    tokenized_phrases = tokenizer(
        examples['phrase1'], examples['phrase2'], truncation=True, padding='max_length', max_length=128
    )
    tokenized_s1 = tokenizer(
        examples['phrase1'], examples['sentence1'], truncation=True, padding='max_length', max_length=128
    )
    tokenized_s2 = tokenizer(
        examples['phrase2'], examples['sentence2'], truncation=True, padding='max_length', max_length=128
    )

    examples['input_ids'] = tokenized_phrases['input_ids']
    examples['attention_mask'] = tokenized_phrases['attention_mask']
    examples['input_ids_s1'] = tokenized_s1['input_ids']
    examples['attention_mask_s1'] = tokenized_s1['attention_mask']
    examples['input_ids_s2'] = tokenized_s2['input_ids']
    examples['attention_mask_s2'] = tokenized_s2['attention_mask']

    return examples

# Tokenize dataset
tokenized_ds = ds.map(tokenize_function, batched=True)
tokenized_ds = tokenized_ds.remove_columns(['phrase1', 'phrase2', 'sentence1', 'sentence2', 'idx'])
tokenized_ds = tokenized_ds.rename_column("label", "labels")

tokenized_ds.set_format(type='torch', columns=['input_ids', 'attention_mask',
                                               'input_ids_s1', 'attention_mask_s1',
                                               'input_ids_s2', 'attention_mask_s2', 'labels'])

# DataLoader
from torch.utils.data import DataLoader
train_loader = DataLoader(tokenized_ds['train'], batch_size=16, shuffle=True)
val_loader = DataLoader(tokenized_ds['validation'], batch_size=16)

# Define the model
class PhraseSimilarityModel(nn.Module):
    def __init__(self, model_name='bert-base-uncased', hidden_size=768, num_classes=2):
        super(PhraseSimilarityModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.phrase_linear = nn.Linear(hidden_size, hidden_size)
        self.context_linear = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size * 3, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask,
                input_ids_s1, attention_mask_s1,
                input_ids_s2, attention_mask_s2, labels=None):
        # Move data to GPU
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
        input_ids_s1 = input_ids_s1.to('cuda')
        attention_mask_s1 = attention_mask_s1.to('cuda')
        input_ids_s2 = input_ids_s2.to('cuda')
        attention_mask_s2 = attention_mask_s2.to('cuda')
        if labels is not None:
            labels = labels.to('cuda')

        # Encode phrases (direct comparison)
        phrase_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        phrase_pooled = phrase_outputs.pooler_output

        # Encode phrase1 with sentence1
        context_s1_outputs = self.bert(input_ids=input_ids_s1, attention_mask=attention_mask_s1)
        context_s1_pooled = context_s1_outputs.pooler_output

        # Encode phrase2 with sentence2
        context_s2_outputs = self.bert(input_ids=input_ids_s2, attention_mask=attention_mask_s2)
        context_s2_pooled = context_s2_outputs.pooler_output

        # Weighted attention to prioritize phrases
        weighted_phrases = self.phrase_linear(phrase_pooled)
        weighted_context_s1 = self.context_linear(context_s1_pooled)
        weighted_context_s2 = self.context_linear(context_s2_pooled)

        # Combine embeddings
        combined = torch.cat((weighted_phrases, weighted_context_s1, weighted_context_s2), dim=1)

        # Classification
        logits = self.classifier(combined)

        outputs = {'logits': logits}
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            outputs['loss'] = loss

        return outputs

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model and move to GPU
model = PhraseSimilarityModel(model_name).to(device)

# Optimizer and Scheduler
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
    tokenizer=tokenizer,
    data_collator=None
)

# Train the model
trainer.train()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.6501,0.63942
2,0.468,0.702389
3,0.1038,0.879


TrainOutput(global_step=189, training_loss=0.433122160573485, metrics={'train_runtime': 301.7727, 'train_samples_per_second': 9.941, 'train_steps_per_second': 0.626, 'total_flos': 0.0, 'train_loss': 0.433122160573485, 'epoch': 3.0})

In [5]:
from sklearn.metrics import accuracy_score, f1_score
import torch

def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            # Move data to GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            input_ids_s1 = batch['input_ids_s1'].to(device)
            attention_mask_s1 = batch['attention_mask_s1'].to(device)
            input_ids_s2 = batch['input_ids_s2'].to(device)
            attention_mask_s2 = batch['attention_mask_s2'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                input_ids_s1=input_ids_s1,
                attention_mask_s1=attention_mask_s1,
                input_ids_s2=input_ids_s2,
                attention_mask_s2=attention_mask_s2
            )
            logits = outputs['logits']

            # Predictions
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')  # Weighted F1-score

    return {
        "accuracy": accuracy,
        "f1_score": f1
    }

# Test DataLoader
test_loader = DataLoader(tokenized_ds['test'], batch_size=16, shuffle=False)

# Evaluate the model
evaluation_metrics = evaluate_model(model, test_loader, device)
print("Test Evaluation Metrics:")
print(f"Accuracy: {evaluation_metrics['accuracy']:.4f}")
print(f"F1 Score: {evaluation_metrics['f1_score']:.4f}")


Test Evaluation Metrics:
Accuracy: 0.6430
F1 Score: 0.6404
