In [15]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.


# Importing Libraries

In [1]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

## Model Loading

In [4]:
# Step 1: Load the T5 Model and Tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = "t5-small" 
tokenizer = T5Tokenizer.from_pretrained(model, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model).to(device)

# Loading the data

In [7]:
# Step 2: Load the Dataset from CSV
csv_file = "Cleaned_Lang8.csv" 
data = pd.read_csv(csv_file)

In [9]:
data = data.rename(columns = {'0': "input", '1': "target"})
data

Unnamed: 0,input,target
0,the president was standing in the front row an...,the president was standing in the front row an...
1,"in us, i heard that arrested people will not l...","in the us, i heard that people arrested will n..."
2,"i always practice pronounce, but getting more ...","i always practise pronunciation, but it is get..."
3,"if i said that sentence, people will be heard ...","if i said that sentence, people would hear the..."
4,i just learnt alphabet.,i have just learnt the alphabet.
...,...,...
199995,"she does not want to, but her father did not f...","she did not want to, but her father did not bu..."
199996,"anyway, we feel grief, but on the other hand, ...","anyway, we feel grief, but on the other hand, ..."
199997,other countries also lose your grandfather did?,did other countries also lose your grandfather?
199998,"it is my first time to be here, i mean lang-8.",it is my first time here. i mean at lang-8.


In [11]:
data = data.iloc[:1000, :]

In [61]:
if "input" not in data.columns or "target" not in data.columns:
    raise ValueError("The CSV file must contain 'input' and 'target' columns.")

In [63]:
# Step 3: Preprocess the Data
input_texts = ["grammar correction: " + text for text in data["input"].tolist()]
target_texts = data["target"].tolist()

In [65]:
input_texts

['grammar correction: the president was standing in the front row and the every female enployees were surrounding him.',
 'grammar correction: in us, i heard that arrested people will not lose their jobs.',
 'grammar correction: i always practice pronounce, but getting more difficult.',
 'grammar correction: if i said that sentence, people will be heard same pronounciation.',
 'grammar correction: i just learnt alphabet.',
 "grammar correction: i'm always not patient enough to take a self - study for a long time.",
 'grammar correction: but it is also said that it is very difficult to learn well both of them.',
 'grammar correction: then i asked my brother for an advice, he said that “ i like japanese.',
 "grammar correction: i was acquainted with him in a chinese's room chat when i was at 9th grade about 4 years ago.",
 'grammar correction: he is very humorious and handsome > _ <.',
 'grammar correction: that is the reason why i did not choose english department despite i like it best

In [67]:
# Tokenize the input and target texts
input_encodings = tokenizer(
    input_texts, padding=True, truncation=True, max_length=128, return_tensors="pt"
).to(device)
target_encodings = tokenizer(
    target_texts, padding=True, truncation=True, max_length=128, return_tensors="pt"
).to(device)

In [69]:
# Prepare labels with padding
labels = target_encodings["input_ids"]
labels[labels == tokenizer.pad_token_id] = -100

In [71]:
# Step 4: Create a PyTorch Dataset and DataLoader
class GrammarCorrectionDataset(Dataset):
    def __init__(self, input_encodings, labels):
        self.input_encodings = input_encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_encodings["input_ids"][idx],
            "attention_mask": self.input_encodings["attention_mask"][idx],
            "labels": self.labels[idx],
        }

dataset = GrammarCorrectionDataset(input_encodings, labels)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)

In [73]:
# Step 5: Fine-Tune the Model
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
epochs = 3
for epoch in range(epochs):
    epoch_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_loader):.4f}")

Epoch 1/3, Loss: 0.9998
Epoch 2/3, Loss: 0.7807
Epoch 3/3, Loss: 0.6884


In [74]:
# Step 6: Save the Fine-Tuned Model
model.save_pretrained("t5-grammar-correction")
tokenizer.save_pretrained("t5-grammar-correction")

print("Model fine-tuning completed and saved!")


Model fine-tuning completed and saved!


In [75]:
# Step 7: Grammar Correction Inference
def correct_grammar(sentence, model, tokenizer):
    input_text = "grammar correction: " + sentence
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

In [76]:
# Example Usage
model.eval()
test_sentences = [
    "recently, i'm immerced in running for my weight control.",
    "the every female enployees were surrounding him.",
]
for sentence in test_sentences:
    print(f"Original: {sentence}")
    print(f"Corrected: {correct_grammar(sentence, model, tokenizer)}\n")


Original: recently, i'm immerced in running for my weight control.
Corrected: recently, i'm immersed in running for my weight control.

Original: the every female enployees were surrounding him.
Corrected: the every female enployee was surrounding him.



In [81]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Fine-Tuned Model and Tokenizer
model_path = "t5-grammar-correction"  # Directory where the fine-tuned model is saved
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# Function for Grammar Correction
def correct_grammar(sentence, model, tokenizer):
    input_text = "grammar correction: " + sentence
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Main Loop for Manual Input
print("Grammar Correction using T5 Model")
print("Type 'exit' to stop the program.\n")

while True:
    sentence = input("Enter a sentence to correct: ").strip()
    if sentence.lower() == "exit":
        print("Exiting the program. Goodbye!")
        break
    corrected_sentence = correct_grammar(sentence, model, tokenizer)
    print(f"Corrected Sentence: {corrected_sentence}\n")


Grammar Correction using T5 Model
Type 'exit' to stop the program.



Enter a sentence to correct:  I are surprised in this genration


Corrected Sentence: I am surprised at this genration.



Enter a sentence to correct:  He was tried to ran in the harshee weather


Corrected Sentence: he was tried to run in the harsh weather.



Enter a sentence to correct:  exit


Exiting the program. Goodbye!


In [85]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)
Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl (30 kB)
Installing collected packages: xxhash, multiprocess, datasets
Successfully installed datasets-3.2.0 multiprocess-0.70.16 xxhash-3.5.0
Note: you may need to restart the kernel to use updated packages.


In [89]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [17]:
pip install rouge_score absl-py

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=984a62ebbf3cdc023702a3076d3daa8e0c8d9c5cec6b14f7501820b3dbcbbb2c
  Stored in directory: /Users/usmanali/Library/Caches/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: absl-py, rouge_score
Successfully installed absl-py-2.1.0 rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [27]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import evaluate
import pandas as pd

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Fine-Tuned Model and Tokenizer
model_path = "t5-grammar-correction"  # Replace with your model directory
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# Load Dataset (CSV file with columns: 'input_text', 'corrected_text')
#data_path = "test_dataset.csv"  # Path to your dataset
df = data

# Initialize Metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
accuracy = 0
total_samples = len(df)

# Ensure predictions and references are strings
predictions = []
references = []

for _, row in df.iterrows():
    input_text = "grammar correction: " + row['input']
    expected_output = row['target']

    # Tokenize and generate the model output
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Collect predictions and references as strings
    predictions.append(predicted_text.strip())
    references.append(expected_output.strip())  # Ensure this is also a plain string

# Evaluate with ROUGE
rouge_result = rouge.compute(predictions=predictions, references=references)

# Display Results
print("ROUGE Evaluation Results:")
for key, value in rouge_result.items():
    print(f"{key}: {value:.4f}")

ROUGE Evaluation Results:
rouge1: 0.8427
rouge2: 0.6689
rougeL: 0.8285
rougeLsum: 0.8285
