In [3]:
!pip install -q transformers datasets peft accelerate bitsandbytes sentencepiece

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
from sklearn.model_selection import train_test_split

In [4]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from peft import LoraConfig, get_peft_model, TaskType
import json

In [5]:
print(f"GPU Available: {torch.cuda.is_available()}")
print(f"GPU Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

GPU Available: True
GPU Name: Tesla T4


In [6]:
from google.colab import files
import os

uploaded = files.upload()

Saving IndicLegalQA Dataset_10K_Revised.json to IndicLegalQA Dataset_10K_Revised (1).json


In [50]:
filename = list(uploaded.keys())[0]
file_path = f'/content/{filename}'

In [8]:
df = pd.read_json(file_path)

In [9]:
print(f"Number of rows: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst few rows:")
print(df.head())

Number of rows: 10000
Columns: ['case_name', 'judgement_date', 'question', 'answer']

First few rows:
                                      case_name   judgement_date  \
0  Union of India vs. Maj. Gen. Manomoy Ganguly  1st August 2018   
1  Union of India vs. Maj. Gen. Manomoy Ganguly  1st August 2018   
2  Union of India vs. Maj. Gen. Manomoy Ganguly  1st August 2018   
3  Union of India vs. Maj. Gen. Manomoy Ganguly  1st August 2018   
4  Union of India vs. Maj. Gen. Manomoy Ganguly  1st August 2018   

                                            question  \
0  Who is the respondent in the case Union of Ind...   
1  What was the main issue in the case Union of I...   
2  What decision did the Armed Forces Tribunal (A...   
3  What was the reason given by the appellants fo...   
4  How did the Supreme Court of India rule on the...   

                                              answer  
0       The respondent is Maj. Gen. Manomoy Ganguly.  
1  The main issue was Maj. Gen. Manomoy Ga

In [51]:
unique_cases = df['case_name'].nunique()
print(f"Unique cases: {unique_cases}")
print(f"Total Q&A pairs: {len(df)}")
print(f"Avg questions per case: {len(df) / unique_cases:.1f}")

Unique cases: 1253
Total Q&A pairs: 10000
Avg questions per case: 8.0


In [52]:
print("Top 5 cases by number of questions:")
top_cases = df['case_name'].value_counts().head(5)
for case, count in top_cases.items():
    print(f"   {case[:60]}... → {count} questions")

Top 5 cases by number of questions:
   M/s. Daiichi Sankyo Company Limited vs Oscar Investments Lim... → 21 questions
   Satpal vs. State of Haryana... → 21 questions
   State of U.P. & Anr. vs Baleshwar Singh & Ors.... → 14 questions
   IQ City Foundation & Anr. vs. Union of India & Ors.... → 14 questions
   Future Coupons Private Limited & Ors. vs. Amazon.com NV Inve... → 13 questions


In [53]:
def prepare_retrieval_data(df):
    data = []
    skipped = 0

    for idx, row in df.iterrows():
        try:
            case_name = str(row['case_name']).strip()
            judgement_date = str(row['judgement_date']).strip()
            question = str(row['question']).strip()
            answer = str(row['answer']).strip()

            if not question or not answer or not case_name:
                skipped += 1
                continue

            # Format: "Case: [name] | Date: [date] | Question: [question]"
            input_text = f"Case: {case_name} | Date: {judgement_date} | Question: {question}"

            data.append({
                'input_text': input_text,
                'answer': answer,
                'case_name': case_name,
                'judgement_date': judgement_date,
                'question': question
            })

        except Exception as e:
            skipped += 1
            if skipped <= 5:
                print(f"⚠️ Skipped row {idx}: {str(e)[:50]}")
            continue

    print(f"\nPrepared {len(data)} training samples")
    if skipped > 0:
        print(f"Skipped {skipped} samples (empty fields)")

    return data

prepared_data = prepare_retrieval_data(df)


Prepared 10000 training samples


In [15]:
sample = prepared_data[0]
print(f"Input Text:\n  {sample['input_text']}")
print(f"\nAnswer:\n  {sample['answer']}")

Input Text:
  Case: Union of India vs. Maj. Gen. Manomoy Ganguly | Date: 1st August 2018 | Question: Who is the respondent in the case Union of India vs. Maj. Gen. Manomoy Ganguly?

Answer:
  The respondent is Maj. Gen. Manomoy Ganguly.


In [17]:
train_data, val_data = train_test_split(prepared_data, test_size=0.2, random_state=42)

dataset = DatasetDict({
    'train': Dataset.from_list(train_data),
    'validation': Dataset.from_list(val_data)
})

In [54]:
print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")

Train samples: 8000
Validation samples: 2000


In [19]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [20]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [21]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1  # Regression task
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_lin", "v_lin"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, lora_config)
print(f"✅ LoRA configured")
model.print_trainable_parameters()

✅ LoRA configured
trainable params: 738,817 || all params: 67,693,058 || trainable%: 1.0914


In [23]:
max_length = 384

def tokenize_function(examples):

    texts = []
    for inp, ans in zip(examples["input_text"], examples["answer"]):
        combined = f"{inp} [SEP] {ans}"
        texts.append(combined)

    # Tokenize
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors=None
    )

    return tokenized

In [24]:
print("Tokenizing training set...")
tokenized_train = dataset["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing train"
)

Tokenizing training set...


Tokenizing train:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [25]:
print("Tokenizing validation set...")
tokenized_val = dataset["validation"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["validation"].column_names,
    desc="Tokenizing validation"
)

Tokenizing validation set...


Tokenizing validation:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [26]:
tokenized_dataset = DatasetDict({
    'train': tokenized_train,
    'validation': tokenized_val
})

In [55]:
def add_labels(examples):
    """Add label for regression: 1.0 for positive pairs"""
    examples['labels'] = [1.0] * len(examples['input_ids'])
    return examples

tokenized_dataset = tokenized_dataset.map(add_labels, batched=True)

print(f"Tokenization complete!")
print(f"Train samples: {len(tokenized_dataset['train'])}")
print(f"Val samples: {len(tokenized_dataset['validation'])}")
print(f"Token length: {max_length}")

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenization complete!
Train samples: 8000
Val samples: 2000
Token length: 384


In [33]:
training_args = TrainingArguments(
    output_dir="./distilbert-indiclegal-lora",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    save_total_limit=2,
    push_to_hub=False,
    fp16=True,
    gradient_accumulation_steps=2,
    report_to="none",  # Disable wandb
)

In [34]:
print("✅ Training configuration:")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Learning rate: {training_args.learning_rate}")
print(f"   Mixed precision (FP16): {training_args.fp16}")
print(f"   Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"   Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

✅ Training configuration:
   Epochs: 3
   Batch size: 16
   Learning rate: 0.0003
   Mixed precision (FP16): True
   Gradient accumulation: 2
   Effective batch size: 32


In [56]:
steps_per_epoch = len(tokenized_dataset['train']) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
total_steps = steps_per_epoch * training_args.num_train_epochs
approx_time = (total_steps * 0.5) / 60  # ~0.5 sec per step

print(f"Estimated training time: {approx_time:.1f} minutes")


Estimated training time: 6.2 minutes


In [37]:
from transformers import DataCollatorWithPadding


In [38]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

  trainer = Trainer(


In [39]:
train_result = trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0035,0.000426
2,0.0017,0.00113
3,0.0013,7e-05


In [41]:
eval_results = trainer.evaluate()

In [42]:
print("Evaluation Results:")
for key, value in eval_results.items():
    if isinstance(value, float):
        print(f"   {key}: {value:.4f}")
    else:
        print(f"   {key}: {value}")

Evaluation Results:
   eval_loss: 0.0001
   eval_runtime: 10.2601
   eval_samples_per_second: 194.9300
   eval_steps_per_second: 12.1830
   epoch: 3.0000


In [44]:
save_dir = "./distilbert-indiclegal-final"

# Save model
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"✅ Model saved to: {save_dir}")

✅ Model saved to: ./distilbert-indiclegal-final


In [57]:
print("\n⬇️ Downloading model...")
from google.colab import files
files.download('distilbert-indiclegal-final.zip')


⬇️ Downloading model...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [48]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from peft import PeftModel

# 1. Load base model
base_model_name = "distilbert-base-uncased"
base_model = AutoModelForQuestionAnswering.from_pretrained(base_model_name)

# 2. Load adapter (LoRA)
adapter_path = "./distilbert-indiclegal-final"
model = PeftModel.from_pretrained(base_model, adapter_path)

# 3. Merge adapter weights into base model
model = model.merge_and_unload()

# 4. Save final merged model
save_path = "./merged_indiclegal_model"
model.save_pretrained(save_path)

# 5. Also save tokenizer
tokenizer = AutoTokenizer.from_pretrained(adapter_path)
tokenizer.save_pretrained(save_path)

print("✅ Merged model saved at", save_path)


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Merged model saved at ./merged_indiclegal_model


In [49]:
from google.colab import files
!zip -r merged_indiclegal_model.zip ./merged_indiclegal_model
files.download('merged_indiclegal_model.zip')


  adding: merged_indiclegal_model/ (stored 0%)
  adding: merged_indiclegal_model/vocab.txt (deflated 53%)
  adding: merged_indiclegal_model/special_tokens_map.json (deflated 80%)
  adding: merged_indiclegal_model/model.safetensors (deflated 8%)
  adding: merged_indiclegal_model/tokenizer.json (deflated 71%)
  adding: merged_indiclegal_model/tokenizer_config.json (deflated 73%)
  adding: merged_indiclegal_model/config.json (deflated 43%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>