In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    EarlyStoppingCallback
)
import evaluate

# --------------------- DATA PREPARATION ---------------------
df = pd.read_csv("labelled_Hindi_Articles_2200.csv")
df.columns = df.columns.str.strip()
df = df.dropna(subset=["Heading", "Article"]).rename(columns={"Heading": "question", "Article": "context"})
df["label"] = 1

neg_df = df.copy()
neg_df["context"] = np.random.permutation(neg_df["context"].values)
neg_df["label"] = 0

combined_df = pd.concat([df, neg_df]).sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train (80%), validation (10%), test (10%)
train_df, temp_df = train_test_split(combined_df, test_size=0.2, stratify=combined_df["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# --------------------- TOKENIZATION ---------------------
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_function(examples):
    return tokenizer(
        examples["question"],
        examples["context"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

columns = ["input_ids", "attention_mask", "token_type_ids", "label"]
train_dataset.set_format(type="torch", columns=columns)
val_dataset.set_format(type="torch", columns=columns)
test_dataset.set_format(type="torch", columns=columns)

# --------------------- MODEL SETUP ---------------------
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2
)

# --------------------- TRAINING SETUP ---------------------
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./mbert-finetuned-hindi",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,  # You can increase this if not overfitting
    weight_decay=0.01,
    learning_rate=3e-5,  # Lower learning rate for better stability
    lr_scheduler_type="linear",  # Learning rate scheduler
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    fp16=True  # Mixed precision for faster training (if supported by your GPU)
)

# --------------------- CALLBACK FOR ACCURACY PRINTING ---------------------
class TrainValEvalCallback(TrainerCallback):
    def __init__(self, train_dataset, val_dataset):
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.trainer = None

    def set_trainer(self, trainer):
        self.trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        if self.trainer is not None:
            train_metrics = self.trainer.evaluate(eval_dataset=self.train_dataset, metric_key_prefix="train")
            val_metrics = self.trainer.evaluate(eval_dataset=self.val_dataset, metric_key_prefix="eval")
            print(f"\nEpoch {int(state.epoch)}:")
            print(f"  Training Accuracy:   {train_metrics['train_accuracy']:.4f}")
            print(f"  Validation Accuracy: {val_metrics['eval_accuracy']:.4f}")
        return control

train_val_callback = TrainValEvalCallback(train_dataset, val_dataset)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[train_val_callback, EarlyStoppingCallback(early_stopping_patience=2)]
)
train_val_callback.set_trainer(trainer)

# --------------------- TRAINING ---------------------
print("\nStarting training...")
trainer.train()

# --------------------- FINAL EVALUATION ---------------------
print("\nFinal Validation Metrics:")
val_metrics = trainer.evaluate(val_dataset)
print(f"Validation Accuracy: {val_metrics['eval_accuracy']:.4f}")

print("\nFinal Test Metrics:")
test_metrics = trainer.evaluate(test_dataset)
print(f"Test Accuracy: {test_metrics['eval_accuracy']:.4f}")

# --------------------- ARTICLE RETRIEVAL FUNCTION ---------------------
def find_relevant_article(question: str, top_k=3):
    """Retrieve top-k relevant articles for a question"""
    original_contexts = df["context"].unique().tolist()
    scores = []
    for context in original_contexts:
        inputs = tokenizer(
            question,
            context,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(model.device)
        with torch.no_grad():
            logits = model(**inputs).logits
        match_prob = torch.softmax(logits, dim=1)[0][1].item()
        scores.append((context, match_prob))
    return sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]

# Example usage:
# results = find_relevant_article("Your question here")
# for idx, (context, score) in enumerate(results, 1):
#     print(f"Match {idx} (Score: {score:.4f}):\n{context}\n")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 3518/3518 [00:03<00:00, 1147.28 examples/s]
Map: 100%|██████████| 440/440 [00:00<00:00, 1444.81 examples/s]
Map: 100%|██████████| 440/440 [00:00<00:00, 1481.02 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1236,0.155963,0.968182
2,0.1352,0.161152,0.965909
3,0.0006,0.124461,0.977273
4,0.0047,0.161923,0.972727


early stopping required metric_for_best_model, but did not find eval_accuracy so early stopping is disabled



Epoch 1:
  Training Accuracy:   0.9849
  Validation Accuracy: 0.9682


early stopping required metric_for_best_model, but did not find eval_accuracy so early stopping is disabled



Epoch 2:
  Training Accuracy:   0.9847
  Validation Accuracy: 0.9659


early stopping required metric_for_best_model, but did not find eval_accuracy so early stopping is disabled
Checkpoint destination directory ./mbert-finetuned-hindi/checkpoint-1320 already exists and is non-empty. Saving will proceed but saved results may be invalid.



Epoch 3:
  Training Accuracy:   0.9986
  Validation Accuracy: 0.9773


early stopping required metric_for_best_model, but did not find eval_accuracy so early stopping is disabled



Epoch 4:
  Training Accuracy:   0.9991
  Validation Accuracy: 0.9727

Final Validation Metrics:


Validation Accuracy: 0.9727

Final Test Metrics:
Test Accuracy: 0.9750


In [2]:
search_results = find_relevant_article("राजस्थान आबकारी अधिनियम, 1950 की धारा 45 के तहत गिरफ्तारी, जप्ती और निरोध की श?")
for i, (article, score) in enumerate(search_results, 1):
    print(f"Top {i} Match (Confidence: {score*100:.1f}%):")
    print(article[:500] + "...\n")


Top 1 Match (Confidence: 100.0%):
राजस्थान आबकारी अधिनियम, 1950 (Rajasthan Excise Act, 1950) के तहत धारा 45 एक महत्वपूर्ण प्रावधान है, जो अधिकारियों और सरकार द्वारा अधिकृत व्यक्तियों को अपराधियों को गिरफ्तार करने, सामान जब्त करने और संदिग्ध व्यक्तियों या वस्तुओं को निरोध (Detention) में रखने का अधिकार देती है। यह प्रावधान आबकारी राजस्व (Excise Revenue) की सुरक्षा सुनिश्चित करता है और अवैध गतिविधियों को रोकने में सहायक है। इस लेख में, हम धारा 45 के सभी प्रावधानों का सरल और विस्तृत तरीके से विश्लेषण करेंगे, साथ ही इसे अधिनियम की अ...

Top 2 Match (Confidence: 100.0%):
भारतीय नागरिक सुरक्षा संहिता 2023, जो 1 जुलाई, 2024 को लागू हुई, ने दंड प्रक्रिया संहिता की जगह ले ली है। यह लेख नई संहिता की धारा 45 से 50 में उल्लिखित गिरफ्तारी से संबंधित प्रक्रियाओं की व्याख्या करता है। गिरफ्तार किए जाने वाले व्यक्तियों का पीछा करना (धारा 45) धारा 45 के तहत, एक पुलिस अधिकारी के पास भारत के किसी भी स्थान पर, बिना वारंट के भी, किसी भी व्यक्ति का पीछा करने का अधिकार है, जिसे वे गिरफ्तार करने के लिए अधिकृत 