In [None]:
# Copy the Kaggle API token (kaggle.json, expected in the current working
# directory) into ~/.kaggle, creating that directory if it does not exist.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset archive with the Kaggle CLI.
!kaggle datasets download -d uwelcomem/grammar-correction-data

Dataset URL: https://www.kaggle.com/datasets/uwelcomem/grammar-correction-data
License(s): unknown
Downloading grammar-correction-data.zip to /content
  0% 0.00/5.49M [00:00<?, ?B/s]
100% 5.49M/5.49M [00:00<00:00, 878MB/s]


In [None]:

import zipfile
# Unpack the downloaded archive into /content. The context manager closes the
# archive even when extraction raises, so the file handle is never leaked.
with zipfile.ZipFile('/content/grammar-correction-data.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')


In [None]:
import pandas as pd
# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='/content/dataset/train_ds.csv')
df.head(n=5)

Unnamed: 0,source,target
0,double story bed curtain around,[start] A double story bed with curtains all a...
1,man ride motorcycle next man in limo,[start] A man riding a motorcycle next to a ma...
2,extravagant bedroom focus on chandelier,[start] An extravagant bedroom with focus on t...
3,dog herding sheep people watching,[start] a dog is herding some sheep and some p...
4,cat lay on floor paw in shoe,[start] A cat is laying on the floor with its ...


In [None]:
# Build the instruction-tuning pairs in one chained expression:
#   input  = task prefix + source sentence
#   output = target with the [start]/[end] markers removed and whitespace trimmed
# Only these two columns are kept.
df = df.assign(
    input="correct grammar: " + df["source"],
    output=df["target"].str.replace(r"\[start\]|\[end\]", "", regex=True).str.strip(),
)[["input", "output"]]

# Preview the prepared frame.
df.head()

Unnamed: 0,input,output
0,correct grammar: double story bed curtain around,A double story bed with curtains all around.
1,correct grammar: man ride motorcycle next man ...,A man riding a motorcycle next to a man in a l...
2,correct grammar: extravagant bedroom focus on ...,An extravagant bedroom with focus on the chand...
3,correct grammar: dog herding sheep people watc...,a dog is herding some sheep and some people ar...
4,correct grammar: cat lay on floor paw in shoe,A cat is laying on the floor with its paws in ...


In [None]:
# Show the column labels of the prepared frame.
print(df.columns)


Index(['input', 'output'], dtype='object')


In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Hold out 10% of the rows for validation; the fixed random_state makes the
# split repeatable.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Wrap each pandas split as a Hugging Face Dataset.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)


In [None]:
# Drop the "__index_level_0__" column if it is present. Guarding on
# `column_names` keeps this cell re-runnable: an unconditional remove_columns
# raises once the column has already been removed.
if "__index_level_0__" in train_dataset.column_names:
    train_dataset = train_dataset.remove_columns(["__index_level_0__"])
if "__index_level_0__" in val_dataset.column_names:
    val_dataset = val_dataset.remove_columns(["__index_level_0__"])


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import get_peft_model, LoraConfig, TaskType

# Checkpoint name shared by the tokenizer and the seq2seq base model.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# LoRA adapter settings, targeting the modules named "q" and "v".
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model so that training goes through the PEFT adapter.
model = get_peft_model(model=base_model, peft_config=lora_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def tokenize_function(example):
    """Tokenize an example (or a batch of examples) for seq2seq training.

    Args:
        example: Mapping with ``"input"`` and ``"output"`` entries. Each entry
            is either a single string or a list of strings (the batched form
            produced by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for ``"input"`` with an added ``"labels"`` entry
        holding the token ids of ``"output"``. Both sides are padded/truncated
        to 128 tokens. Padding positions in the labels are set to ``-100`` so
        that the loss ignores them instead of training on pad tokens.
    """
    model_inputs = tokenizer(
        example["input"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    labels = tokenizer(
        example["output"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    pad_id = tokenizer.pad_token_id
    label_ids = labels["input_ids"]

    # Labels are already padded to max_length here, so the collator has nothing
    # left to pad; the pad ids must be masked in this function.
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        masked = [
            [-100 if tok == pad_id else tok for tok in seq]
            for seq in label_ids
        ]
    else:
        # Single example: a flat id sequence.
        masked = [-100 if tok == pad_id else tok for tok in label_ids]

    model_inputs["labels"] = masked
    return model_inputs


# Tokenize both splits in batches; the original columns are removed so that
# only the tokenizer outputs remain.
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, val_dataset)
)


Map:   0%|          | 0/88766 [00:00<?, ? examples/s]

Map:   0%|          | 0/9863 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForSeq2Seq
# Seq2seq batch collator, built from the tokenizer and the PEFT-wrapped model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
)


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training configuration for the LoRA fine-tuning run: evaluation and
# checkpointing once per epoch, no external experiment reporting.
training_args = Seq2SeqTrainingArguments(
    output_dir="./grammar_correction_lora",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    remove_unused_columns=False,
    # PeftModel hides the base model's input arguments, so Trainer cannot work
    # out the label column on its own; name it explicitly.
    label_names=["labels"],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
)


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:

# Run fine-tuning with the trainer configured above. As the last expression in
# the cell, the returned TrainOutput is displayed as the cell result.
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0736,0.054677
2,0.0648,0.051459


TrainOutput(global_step=22192, training_loss=0.20502656366388705, metrics={'train_runtime': 9370.315, 'train_samples_per_second': 18.946, 'train_steps_per_second': 2.368, 'total_flos': 3.051221452849152e+16, 'train_loss': 0.20502656366388705, 'epoch': 2.0})

In [None]:
import shutil

from google.colab import files

output_dir = "flan_t5_lora_model"

# Write the LoRA model and the tokenizer into the same directory.
model.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

# Pack that directory into "<output_dir>.zip" next to it ...
shutil.make_archive(base_name=output_dir, format='zip', root_dir=output_dir)

# ... and hand the archive to the browser as a download.
files.download(f"{output_dir}.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

model_dir = "flan_t5_lora_model"

# The tokenizer is read from the saved directory; the base model is loaded by
# its checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Attach the saved LoRA adapter to the base model and switch to eval mode.
model = PeftModel.from_pretrained(model=base_model, model_id=model_dir)
model.eval()
import torch

def correct_grammar(text, max_length=64):
    """Return the model's grammar-corrected rewrite of ``text``.

    Args:
        text: Sentence to correct. The ``"correct grammar: "`` task prefix is
            added here, so callers pass the bare sentence.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Defaults to 64, the value that used to be hard-coded; raise it when
            corrected sentences risk being cut short.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        skipped.
    """
    input_text = "correct grammar: " + text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Smoke test: run one scrambled sentence through the model and show both the
# input and the correction.
test_sentence = "A glass for to sitting next wine a bottle."
corrected_sentence = correct_grammar(text=test_sentence)

print(
    " ".join(("🟩 Input :", test_sentence)),
    " ".join(("🟨 Corrected:", corrected_sentence)),
    sep="\n",
)


🟩 Input : A glass for to sitting next wine a bottle.
🟨 Corrected: A glass sitting next to a wine bottle.


In [None]:
!pip install rouge_score
from datasets import load_metric
import numpy as np

bleu = load_metric("bleu")
rouge = load_metric("rouge")

# Sample evaluation set (small)
inputs = val_dataset["input"][:100]
targets = val_dataset["output"][:100]

predictions = []

# Generate predictions
for text in inputs:
    input_ids = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        output = model.generate(**input_ids, max_length=64)
    pred = tokenizer.decode(output[0], skip_special_tokens=True)
    predictions.append(pred)

# Prepare references for BLEU
references = [[t.split()] for t in targets]
predictions_bleu = [p.split() for p in predictions]

# Compute BLEU
bleu_score = bleu.compute(predictions=predictions_bleu, references=references)
print("🟨 BLEU Score:", bleu_score["bleu"])

# Compute ROUGE
rouge_score = rouge.compute(predictions=predictions, references=targets)
print("🟥 ROUGE-L:", rouge_score["rougeL"].mid.fmeasure)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=7957f05d22273bfc3ce503ad7460715f2146ccea3fc3799234b308b2794a110e
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
🟨 BLEU Score: 0.6972650120115986
🟥 ROUGE-L: 0.90135989778681


In [None]:
!pip install sacrebleu --quiet
!pip install evaluate rouge_score --quiet

from datasets import load_dataset
import evaluate
import numpy as np
import torch

# Load metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
chrf = evaluate.load("chrf")

# Small evaluation set
inputs = val_dataset["input"][:100]
targets = val_dataset["output"][:100]

predictions = []

# Generate predictions
for text in inputs:
    input_ids = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        output = model.generate(**input_ids, max_length=64)
    pred = tokenizer.decode(output[0], skip_special_tokens=True)
    predictions.append(pred)

# BLEU needs tokenized input
references = [[t] for t in targets]
predictions_bleu = predictions
bleu_score = bleu.compute(predictions=predictions_bleu, references=references)
print("🟨 BLEU Score:", bleu_score["bleu"])

# ROUGE
rouge_score = rouge.compute(predictions=predictions, references=targets)
print("🟥 ROUGE-L:", rouge_score["rougeL"].mid.fmeasure)

# METEOR
meteor_score = meteor.compute(predictions=predictions, references=targets)
print("🟦 METEOR:", meteor_score["meteor"])

# chrF++
chrf_score = chrf.compute(predictions=predictions, references=targets)
print("🟪 chrF++:", chrf_score["score"])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🟪 chrF++: 86.13441499213485


In [None]:
# Quiet install of sacrebleu. sacrebleu is also installed at the top of the
# previous cell.
!pip install sacrebleu --quiet