# 🧷 BERT Multi-label Text Classification
Fine-tune BERT to assign each document one or more categories, using an independent sigmoid output per label and binary cross-entropy loss.

## 📦 Install Dependencies

In [None]:
!pip install -q transformers datasets scikit-learn torch

## 📚 Sample Dataset
We simulate a tiny multi-label dataset here: each text is paired with a list of one or more category names. Replace this with your actual dataset.

In [None]:
import pandas as pd
# Toy corpus: each row pairs one document with the list of categories it belongs to.
df = pd.DataFrame(
    [
        ("This contract involves legal and financial terms.", ["legal", "finance"]),
        ("Patient records and insurance details are discussed.", ["healthcare", "insurance"]),
        ("Project timeline and scope management.", ["project_management"]),
        ("Quarterly financial report with budget updates.", ["finance"]),
    ],
    columns=["text", "labels"],
)
df  # bare expression so the notebook renders the frame

## 🏷️ Encode Multi-labels

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# Learn the label vocabulary from the label lists, then encode those same
# lists as a binary indicator matrix.
mlb = MultiLabelBinarizer().fit(df["labels"])
label_matrix = mlb.transform(df["labels"])
label_matrix, mlb.classes_  # bare expression so the notebook shows both

## 🔤 Tokenize Text

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer belonging to the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize every document in a single call, with padding and truncation enabled.
encodings = tokenizer(df["text"].tolist(), padding=True, truncation=True)
import torch
class MultiLabelDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer output with multi-label targets.

    Args:
        encodings: Mapping from feature name to an indexable holding one
            entry per example.
        labels: Indexable collection holding one label vector per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every feature in ``encodings`` is converted to a tensor, and the
        label vector is stored under ``"labels"`` as a float tensor.
        """
        item = {}
        for name, values in self.encodings.items():
            item[name] = torch.tensor(values[idx])
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Bundle the tokenized texts with the binarized label matrix for training.
dataset = MultiLabelDataset(encodings=encodings, labels=label_matrix)

## 🧠 Fine-tune BERT

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load pretrained BERT with a classification head sized to the label
# vocabulary. problem_type="multi_label_classification" configures the model
# for independent per-label targets (float label vectors).
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification",
)

# No evaluation dataset is passed to the Trainer, so evaluation is left at its
# default of "no". The explicit `evaluation_strategy` keyword is intentionally
# omitted: newer transformers releases renamed it to `eval_strategy` and
# reject the old spelling, while omitting it behaves the same on every version.
args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
)

trainer.train()

## 🔍 Inference

In [None]:
def predict(text, threshold=0.5):
    """Return the labels whose predicted probability meets ``threshold``.

    Args:
        text: A single document string to classify.
        threshold: Minimum sigmoid probability for a label to be included.
            Defaults to 0.5.

    Returns:
        list: The entries of ``mlb.classes_`` whose probability is
        ``>= threshold``, in class order.
    """
    # Training leaves the model in train mode; switch dropout off so that
    # repeated calls on the same text give the same answer.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # The model may have been moved to an accelerator during training; the
    # inputs must live on the same device as its weights.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    # Select the single example by index instead of squeeze(): squeeze() would
    # also drop the label axis when there is exactly one label, leaving a
    # scalar that cannot be iterated.
    probs = torch.sigmoid(logits[0]).tolist()
    return [label for label, p in zip(mlb.classes_, probs) if p >= threshold]

# Try the classifier on an unseen sentence; the notebook displays the returned labels.
predict(text="The document includes both legal obligations and financial clauses.")