In [21]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
from datasets import DatasetDict
from sklearn.metrics import accuracy_score, f1_score
import numpy as np


In [22]:
from datasets import load_dataset

ds = load_dataset("PiC/phrase_similarity")

In [23]:
ds

DatasetDict({
    train: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 7004
    })
    validation: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2000
    })
})

In [24]:
train_label0 = ds['train'].filter(lambda x: x['label'] == 0)
train_label1 = ds['train'].filter(lambda x: x['label'] == 1)
train_label0 = train_label0.shuffle(seed=42)
train_label1 = train_label1.shuffle(seed=42)

train_label0 = train_label0.select(range(500))
train_label1 = train_label1.select(range(500))

from datasets import DatasetDict, concatenate_datasets
balanced_train = concatenate_datasets([train_label0, train_label1])

ds = DatasetDict({
    'train': balanced_train,
    'validation': ds['validation'],
    'test': ds['test']
})

In [25]:
# Initialize the BERT tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [26]:
def tokenize_function(examples):
    # Tokenize sentence1 with phrase1
    tokenized_s1 = tokenizer(
        examples['phrase1'],
        examples['sentence1'],
        truncation=True,
        padding='max_length',
        max_length=128
    )

    # Tokenize sentence2 with phrase2
    tokenized_s2 = tokenizer(
        examples['phrase2'],
        examples['sentence2'],
        truncation=True,
        padding='max_length',
        max_length=128
    )


    examples['input_ids_s1'] = tokenized_s1['input_ids']
    examples['attention_mask_s1'] = tokenized_s1['attention_mask']
    examples['input_ids_s2'] = tokenized_s2['input_ids']
    examples['attention_mask_s2'] = tokenized_s2['attention_mask']

    return examples


In [27]:
# Apply the tokenization to all splits
tokenized_ds = ds.map(tokenize_function, batched=True)

# Select relevant columns
tokenized_ds = tokenized_ds.remove_columns(['phrase1', 'phrase2', 'sentence1', 'sentence2', 'idx'])

# Rename 'label' to 'labels' as expected by Trainer - I did not know this until this!
tokenized_ds = tokenized_ds.rename_column("label", "labels")

# Set the format to PyTorch tensors
tokenized_ds.set_format(type='torch', columns=['input_ids_s1', 'attention_mask_s1',
                                              'input_ids_s2', 'attention_mask_s2', 'labels'])


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [28]:
class BaseBERTModel(nn.Module):
    def __init__(self, model_name='bert-base-uncased'):
        super(BaseBERTModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)

    def forward(self, input_ids_s1, attention_mask_s1, input_ids_s2, attention_mask_s2):
        # Encode sentence1
        outputs_s1 = self.bert(input_ids=input_ids_s1, attention_mask=attention_mask_s1)
        last_hidden_s1 = outputs_s1.last_hidden_state  # (batch, seq_len, hidden_dim)

        # Encode sentence2
        outputs_s2 = self.bert(input_ids=input_ids_s2, attention_mask=attention_mask_s2)
        last_hidden_s2 = outputs_s2.last_hidden_state  # (batch, seq_len, hidden_dim)

        return last_hidden_s1, last_hidden_s2


Pooling model

In [29]:
class PoolingModel(nn.Module):
    def __init__(self, model_name='bert-base-uncased', hidden_size=768, num_classes=2):
        super(PoolingModel, self).__init__()
        self.base_model = BaseBERTModel(model_name)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.classifier = nn.Linear(hidden_size * 2, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids_s1, attention_mask_s1, input_ids_s2, attention_mask_s2, labels=None):
        last_hidden_s1, last_hidden_s2 = self.base_model(input_ids_s1, attention_mask_s1,
                                                         input_ids_s2, attention_mask_s2)
        # Apply max pooling
        pooled_s1 = self.pool(last_hidden_s1.permute(0, 2, 1)).squeeze(-1)  # (batch, hidden_dim)
        pooled_s2 = self.pool(last_hidden_s2.permute(0, 2, 1)).squeeze(-1)  # (batch, hidden_dim)

        # Concatenate pooled embeddings
        combined = torch.cat((pooled_s1, pooled_s2), dim=1)  # (batch, hidden_dim*2)

        # Classification
        logits = self.classifier(combined)  # (batch, num_classes)

        outputs = {'logits': logits}

        if labels is not None:
            loss = self.loss_fn(logits, labels)
            outputs['loss'] = loss

        return outputs


Averaging Model

In [44]:
import torch
import torch.nn as nn

class AveragingModel(nn.Module):
    def __init__(self, model_name='bert-base-uncased', hidden_size=768, num_classes=2):
        super(AveragingModel, self).__init__()
        self.base_model = BaseBERTModel(model_name)
        self.classifier = nn.Linear(hidden_size * 2, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids_s1, attention_mask_s1, input_ids_s2, attention_mask_s2, labels=None):
        # Get last hidden states for both sequences
        last_hidden_s1, last_hidden_s2 = self.base_model(input_ids_s1, attention_mask_s1,
                                                         input_ids_s2, attention_mask_s2)
        # Compute mean pooling for s1
        mask_s1 = attention_mask_s1.unsqueeze(-1).expand(last_hidden_s1.size()).float()
        mean_s1 = torch.sum(last_hidden_s1 * mask_s1, dim=1) / torch.clamp(mask_s1.sum(dim=1), min=1e-9)

        # Compute mean pooling for s2
        mask_s2 = attention_mask_s2.unsqueeze(-1).expand(last_hidden_s2.size()).float()
        mean_s2 = torch.sum(last_hidden_s2 * mask_s2, dim=1) / torch.clamp(mask_s2.sum(dim=1), min=1e-9)

        # Concatenate mean embeddings
        combined = torch.cat((mean_s1, mean_s2), dim=1)  # (batch_size, hidden_dim * 2)

        # Classification
        logits = self.classifier(combined)  # (batch_size, num_classes)

        # Prepare outputs
        outputs = {'logits': logits}

        # Add loss if labels are provided
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            outputs['loss'] = loss

        return outputs


Concatenation

In [46]:
import torch
import torch.nn as nn

class ConcatenationModel(nn.Module):
    def __init__(self, model_name='bert-base-uncased', hidden_size=768, num_classes=2):
        super(ConcatenationModel, self).__init__()
        self.base_model = BaseBERTModel(model_name)  # Replace with the actual base model initialization
        self.classifier = nn.Linear(hidden_size * 2, num_classes)  # Classification layer
        self.loss_fn = nn.CrossEntropyLoss()  # Loss function

    def forward(self, input_ids_s1, attention_mask_s1, input_ids_s2, attention_mask_s2, labels=None):
        # Forward pass through the base model to get hidden states
        last_hidden_s1, last_hidden_s2 = self.base_model(input_ids_s1, attention_mask_s1,
                                                         input_ids_s2, attention_mask_s2)

        # Extract [CLS] token embeddings
        cls_s1 = last_hidden_s1[:, 0, :]  # (batch_size, hidden_size)
        cls_s2 = last_hidden_s2[:, 0, :]  # (batch_size, hidden_size)

        # Concatenate [CLS] embeddings
        combined = torch.cat((cls_s1, cls_s2), dim=1)  # (batch_size, hidden_size * 2)

        # Classification layer
        logits = self.classifier(combined)  # (batch_size, num_classes)

        # Prepare outputs
        outputs = {'logits': logits}

        # Compute loss if labels are provided
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            outputs['loss'] = loss

        return outputs


In [32]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
    }


In [33]:
def train_evaluate_model(model_class, strategy_name):
    print(f"\n=== Training Model with {strategy_name} Strategy ===")

  
    model = model_class()

    training_args = TrainingArguments(
        output_dir=f'./results_{strategy_name}',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f'./logs_{strategy_name}',
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds['train'],
        eval_dataset=tokenized_ds['validation'],
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate(tokenized_ds['validation'])
    print(f"Validation Results for {strategy_name} Strategy: {eval_results}")

    return trainer, eval_results


In [34]:
# Train Pooling Model
pooling_trainer, pooling_results = train_evaluate_model(PoolingModel, "Pooling")



=== Training Model with Pooling Strategy ===


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.704106,0.503,0.420659
2,0.670700,0.715812,0.541,0.539397
3,0.670700,0.730308,0.556,0.548367


Validation Results for Pooling Strategy: {'eval_loss': 0.7303081154823303, 'eval_accuracy': 0.556, 'eval_f1': 0.5483674092157461, 'eval_runtime': 3.2444, 'eval_samples_per_second': 308.226, 'eval_steps_per_second': 4.932, 'epoch': 3.0}


In [45]:
# Train Averaging Model
averaging_trainer, averaging_results = train_evaluate_model(AveragingModel, "Averaging")



=== Training Model with Averaging Strategy ===




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.687873,0.535,0.443763
2,0.620000,0.717776,0.61,0.596941
3,0.620000,0.981693,0.616,0.603725


Validation Results for Averaging Strategy: {'eval_loss': 0.9816929697990417, 'eval_accuracy': 0.616, 'eval_f1': 0.603724985139687, 'eval_runtime': 3.2505, 'eval_samples_per_second': 307.644, 'eval_steps_per_second': 4.922, 'epoch': 3.0}


In [47]:
# Train Concatenation Model
concatenation_trainer, concatenation_results = train_evaluate_model(ConcatenationModel, "Concatenation")



=== Training Model with Concatenation Strategy ===




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.699451,0.499,0.336402
2,0.672900,0.699509,0.584,0.581832
3,0.672900,0.85559,0.609,0.605735


Validation Results for Concatenation Strategy: {'eval_loss': 0.8555902242660522, 'eval_accuracy': 0.609, 'eval_f1': 0.6057350922993308, 'eval_runtime': 3.3395, 'eval_samples_per_second': 299.444, 'eval_steps_per_second': 4.791, 'epoch': 3.0}


In [48]:
def evaluate_on_test(trainer, strategy_name):
    print(f"\n=== Evaluating Model with {strategy_name} Strategy on Test Set ===")
    test_results = trainer.evaluate(tokenized_ds['test'])
    print(f"Test Results for {strategy_name} Strategy: {test_results}")
    return test_results

# Evaluate Pooling Model
evaluate_on_test(pooling_trainer, "Pooling")

# Evaluate Averaging Model
evaluate_on_test(averaging_trainer, "Averaging")

# Evaluate Concatenation Model
evaluate_on_test(concatenation_trainer, "Concatenation")



=== Evaluating Model with Pooling Strategy on Test Set ===


Test Results for Pooling Strategy: {'eval_loss': 0.7261282205581665, 'eval_accuracy': 0.554, 'eval_f1': 0.5466885394528348, 'eval_runtime': 6.7961, 'eval_samples_per_second': 294.285, 'eval_steps_per_second': 4.709, 'epoch': 3.0}

=== Evaluating Model with Averaging Strategy on Test Set ===


Test Results for Averaging Strategy: {'eval_loss': 1.0312038660049438, 'eval_accuracy': 0.6035, 'eval_f1': 0.5888924227896682, 'eval_runtime': 6.8001, 'eval_samples_per_second': 294.115, 'eval_steps_per_second': 4.706, 'epoch': 3.0}

=== Evaluating Model with Concatenation Strategy on Test Set ===


Test Results for Concatenation Strategy: {'eval_loss': 0.8709274530410767, 'eval_accuracy': 0.603, 'eval_f1': 0.5978651399723064, 'eval_runtime': 6.7933, 'eval_samples_per_second': 294.407, 'eval_steps_per_second': 4.711, 'epoch': 3.0}


{'eval_loss': 0.8709274530410767,
 'eval_accuracy': 0.603,
 'eval_f1': 0.5978651399723064,
 'eval_runtime': 6.7933,
 'eval_samples_per_second': 294.407,
 'eval_steps_per_second': 4.711,
 'epoch': 3.0}

---------------------------------------------------------------------------------