# 🔗 BERT Sentence Similarity
Fine-tune a BERT model to predict similarity scores between sentence pairs using the STS-B dataset.

## 📦 Install Dependencies

In [None]:
!pip install -q transformers datasets scikit-learn

## 📚 Load STS-B Dataset

In [None]:
from datasets import load_dataset
def _scale_label(example):
    """Return the example's label divided by 5.0 (normalizes to [0,1])."""
    return {"label": float(example["label"]) / 5.0}


# Download GLUE/STS-B, rescale the similarity labels, and drop the "idx"
# column, which is not used anywhere else in this notebook.
dataset = (
    load_dataset("glue", "stsb")
    .map(_scale_label)
    .remove_columns(["idx"])
)

## 🔤 Tokenize Sentence Pairs

In [None]:
from transformers import AutoTokenizer
# Tokenizer checkpoint; same "bert-base-uncased" identifier that the
# fine-tuning cell passes when it loads the model.
_TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(_TOKENIZER_CHECKPOINT)

def tokenize(example):
    """Encode sentence pairs with the module-level ``tokenizer``.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.
            The notebook calls this through ``dataset.map(..., batched=True)``,
            so each entry is a batch of sentences rather than a single string.

    Returns:
        Whatever the tokenizer returns for the pair(s), truncated and padded.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    # NOTE(review): no explicit max_length is given, so "max_length" padding
    # presumably pads to the tokenizer's own maximum -- confirm this is wanted.
    return tokenizer(
        first_sentences,
        second_sentences,
        padding="max_length",
        truncation=True,
    )

# Tokenize every split; batched=True hands tokenize() lists of sentences.
tokenized = dataset.map(tokenize, batched=True)

# Expose only the model inputs and the target as torch tensors.
# "token_type_ids" must be listed too: set_format hides every column that is
# not named here, and without the segment ids BERT would be trained with no
# marker separating sentence1 from sentence2, while compute_similarity()
# forwards the tokenizer's segment ids at inference time.
tokenized.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "label"],
)

## 🧠 Fine-tune BERT

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# BERT encoder with a sequence-classification head that has a single output
# (num_labels=1); the float labels prepared earlier are its targets.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=1,
)

import inspect

# Newer transformers releases name the evaluation-schedule option
# "eval_strategy"; "evaluation_strategy" is the deprecated spelling. The
# install cell does not pin a version, so pick whichever keyword the installed
# TrainingArguments actually accepts instead of hard-coding one of them.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Hyperparameters for fine-tuning: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="./outputs",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    **{_eval_strategy_key: "epoch"},
)

# Wire the model, the training arguments and the tokenized splits together.
# No compute_metrics callback is passed here.
trainer = Trainer(
    eval_dataset=tokenized["validation"],
    train_dataset=tokenized["train"],
    args=training_args,
    model=model,
)

# Start fine-tuning with the settings held in training_args.
trainer.train()

## 📈 Evaluate on Validation Set

In [None]:
import numpy as np
from scipy.stats import pearsonr, spearmanr

# Run the fine-tuned model over the validation split.
preds = trainer.predict(tokenized["validation"])
labels = preds.label_ids
# squeeze() removes size-1 axes; predictions are presumably (N, 1) because
# the model was created with num_labels=1 -- TODO confirm.
pred_scores = preds.predictions.squeeze()

# Each scipy function returns the correlation coefficient first, hence [0].
for _metric_name, _metric_fn in (("Pearson", pearsonr), ("Spearman", spearmanr)):
    print(f"{_metric_name}:", _metric_fn(pred_scores, labels)[0])

## 🔍 Inference Example

In [None]:
def compute_similarity(sent1, sent2):
    """Score how similar two sentences are with the fine-tuned model.

    Args:
        sent1: First sentence (a single string).
        sent2: Second sentence (a single string).

    Returns:
        The model's single logit as a Python float. Training labels were
        divided by 5.0, so the score is on that normalized scale; it is a raw
        model output and is not clamped to any range.

    Note:
        Uses the module-level ``tokenizer`` and ``model`` and switches
        ``model`` to evaluation mode as a side effect.
    """
    # Local import: the notebook never imports torch at cell level, so the
    # bare ``torch.no_grad()`` below would otherwise raise NameError.
    import torch

    # Disable dropout so repeated calls return the same score.
    model.eval()

    tokens = tokenizer(sent1, sent2, return_tensors="pt", truncation=True, padding=True)
    # Trainer may have moved the model to an accelerator; inputs must live on
    # the same device as the weights or the forward pass fails.
    tokens = tokens.to(model.device)

    with torch.no_grad():
        output = model(**tokens)
    return output.logits.item()

# Example call on one sentence pair; as the last expression in the cell, its
# return value is what the notebook shows as the cell output.
compute_similarity("A man is walking a dog.", "A person is walking an animal.")