In [None]:
# ======================================
# 🧠 Indian Law Contract Simplifier — Fine-tuning Notebook
# ======================================

!pip install transformers datasets accelerate bitsandbytes -q

import inspect

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq

In [None]:
# ==========================================================
# 1️⃣  Load your pre-processed dataset
# (Make sure you've uploaded constitution_instruction.json to Colab)
# ==========================================================

# Fixed seed so the 90/10 train/test partition is identical on every
# "Restart & Run All"; without it each run shuffles differently and the
# evaluation losses of separate runs are not comparable.
SPLIT_SEED = 42

# Loading a single JSON file yields one split named "train".
raw_dataset = load_dataset("json", data_files="constitution_instruction.json")

# Carve a held-out test split (10%) out of the loaded records.
dataset = raw_dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)

print(dataset)

In [None]:
# ==========================================================
# 2️⃣  Choose model (Flan-T5 Large recommended)
# ==========================================================

# Checkpoint to fine-tune. "flan-t5-base" is a lighter alternative when GPU
# usage needs to be lower.
model_name = "google/flan-t5-large"

# The model and its tokenizer are both resolved from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:

# ==========================================================
# 3️⃣  Preprocess data
# ==========================================================

# Token budgets: prompts are truncated to `max_input` tokens, targets to
# `max_output` tokens.
max_input = 512
max_output = 256


def _build_prompt(instruction, context):
    """Join an instruction with its optional context into one prompt string."""
    # Records without extra context carry a null or blank "input". Formatting
    # those directly would put the literal text "None" (or a dangling
    # separator) into the prompt, so fall back to the bare instruction.
    if context is None or not str(context).strip():
        return instruction
    return f"{instruction} {context}"


def preprocess(examples):
    """Tokenize a batch of instruction records for seq2seq training.

    Args:
        examples: Batch mapping with equally long lists under the keys
            "instruction", "input" and "output". Entries of "input" may be
            None or blank.

    Returns:
        The tokenizer output for the prompts, with the token ids of the
        tokenized "output" texts stored under the additional key "labels".
    """
    prompts = [
        _build_prompt(instruction, context)
        for instruction, context in zip(examples["instruction"], examples["input"])
    ]
    model_inputs = tokenizer(prompts, max_length=max_input, truncation=True)
    labels = tokenizer(examples["output"], max_length=max_output, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run `preprocess` over the dataset, handing it one batch of examples per call.
tokenized = dataset.map(function=preprocess, batched=True)

In [None]:

# ==========================================================
# 4️⃣  Set up training arguments
# ==========================================================

# T5-family checkpoints are known to overflow under fp16 mixed precision
# (loss turns NaN or collapses to 0). Use bf16 where the GPU supports it and
# fall back to full precision otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The notebook installs an unpinned version, so pick whichever
# keyword the installed TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# NOTE(review): the T5 documentation suggests a higher learning rate
# (around 1e-4 to 3e-4) with AdamW; 2e-5 is kept as originally configured.
args = TrainingArguments(
    output_dir="./contract_simplifier_model",
    save_strategy="epoch",          # checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,             # keep only the two most recent checkpoints
    bf16=use_bf16,
    logging_dir="./logs",
    logging_strategy="steps",
    push_to_hub=False,
    **{eval_strategy_kwarg: "epoch"},  # evaluate once per epoch, matching saves
)


In [None]:

# ==========================================================
# 5️⃣  Trainer setup
# ==========================================================

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. The notebook installs an unpinned
# version, so use whichever keyword the installed Trainer exposes.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
    **{tokenizer_kwarg: tokenizer},
)

trainer.train()

In [None]:

# ==========================================================
# 6️⃣  Save model
# ==========================================================
# Single source of truth for the export directory, so the model, the
# tokenizer and the confirmation message cannot drift apart.
save_dir = "./simplifier_flan_t5"

# Write the model and the tokenizer to the same directory so the folder can
# be loaded on its own later.
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

# The leading character was stored as mojibake; restored to the check mark.
print(f"✅ Fine-tuning complete! Model saved to {save_dir}")


In [None]:
# ==========================================================
# 7️⃣  Inference Test using the trained model
# ==========================================================
from transformers import pipeline

# Load the saved model directory into a text-to-text pipeline.
simplifier = pipeline("text2text-generation", model="./simplifier_flan_t5")

text = "India, that is Bharat, shall be a Union of States."

# Give generation an explicit length budget. Without one, the library default
# can be as short as 20 tokens and cuts the answer off mid-sentence. 256
# mirrors the target length used when tokenizing the training labels.
result = simplifier(
    f"Simplify this clause in simple English: {text}",
    max_new_tokens=256,
)

# The marker before "Simplified Output" was stored as mojibake; restored.
print("\n🔹Simplified Output:\n", result[0]["generated_text"])