In [None]:
pip install 'accelerate>=0.26.0'

SyntaxError: invalid syntax (4182730927.py, line 1)

In [10]:
pip install transformers

Collecting transformers
  Using cached transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Using cached transformers-4.46.3-py3-none-any.whl (10.0 MB)
Downloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl (284 kB)
Downloading tokenizers-0.20.3-cp312-cp312-macosx_11_0_arm64.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: regex, tokenizers, transformers
Successfully installed regex-2024.11.6 tokenizers-0.20.3 transformers-4.46.3
Note: you may need to restart the kernel to use updated packages.


In [11]:

import torch
from transformers import T5ForConditionalGeneration, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import Dataset
import gc
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Free up memory
gc.collect()
torch.cuda.empty_cache()

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Load the datasets
def read_sentences(src_path, tgt_path, limit=None):
    with open(src_path, "r", encoding="utf-8") as src_file, open(tgt_path, "r", encoding="utf-8") as tgt_file:
        incorrect_sentences = [line.strip() for line in tqdm(src_file.readlines(), desc="Reading Incorrect Sentences")]
        correct_sentences = [line.strip() for line in tqdm(tgt_file.readlines(), desc="Reading Correct Sentences")]
    if limit:
        incorrect_sentences = incorrect_sentences[:limit]
        correct_sentences = correct_sentences[:limit]
    return incorrect_sentences, correct_sentences

# Paths for training, validation, and test datasets
train_src_path = "wikiExtractsData/data/train_merge.src"
train_tgt_path = "wikiExtractsData/data/train_merge.tgt"
valid_src_path = "wikiExtractsData/data/valid.src"
valid_tgt_path = "wikiExtractsData/data/valid.tgt"
test_src_path = "Wiki-edits/hiwiki.extracted.clean.src"
test_tgt_path = "Wiki-edits/hiwiki.extracted.clean.trg"


# Load Training, Validation, and Test Data
train_incorrect, train_correct = read_sentences(train_src_path, train_tgt_path, limit=1000)
valid_incorrect, valid_correct = read_sentences(valid_src_path, valid_tgt_path)
test_incorrect, test_correct = read_sentences(test_src_path, test_tgt_path)

# Define batch size for tokenization
batch_size = 500

# Tokenize sentences in smaller batches to avoid memory overload
def tokenize_sentences(input_sentences, target_sentences):
    input_ids_list, labels_list = [], []

    for i in tqdm(range(0, len(input_sentences), batch_size), desc="Batch Tokenizing Sentences"):
        batch_inputs = input_sentences[i:i+batch_size]
        batch_targets = target_sentences[i:i+batch_size]
        
        tokenized_inputs = tokenizer(batch_inputs, padding="max_length", truncation=True, max_length=64, return_tensors="pt")
        tokenized_targets = tokenizer(batch_targets, padding="max_length", truncation=True, max_length=64, return_tensors="pt")
        
        input_ids_list.append(tokenized_inputs["input_ids"])
        labels_list.append(tokenized_targets["input_ids"])

    tokenized_inputs = {"input_ids": torch.cat(input_ids_list, dim=0)}
    labels = torch.cat(labels_list, dim=0)
    del input_ids_list, labels_list
    gc.collect()
    return tokenized_inputs, labels

# Tokenize training, validation, and test sentences
train_tokenized_inputs, train_labels = tokenize_sentences(train_incorrect, train_correct)
valid_tokenized_inputs, valid_labels = tokenize_sentences(valid_incorrect, valid_correct)
test_tokenized_inputs, test_labels = tokenize_sentences(test_incorrect, test_correct)

# Define Dataset with Pre-tokenized Inputs
class PreTokenizedSentenceDataset(Dataset):
    def __init__(self, tokenized_inputs, labels):
        self.tokenized_inputs = tokenized_inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.tokenized_inputs.items()}
        item["labels"] = self.labels[idx]
        return item

# Create datasets for training, validation, and testing
train_dataset = PreTokenizedSentenceDataset(train_tokenized_inputs, train_labels)
valid_dataset = PreTokenizedSentenceDataset(valid_tokenized_inputs, valid_labels)
test_dataset = PreTokenizedSentenceDataset(test_tokenized_inputs, test_labels)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,  # Reduced batch size to save memory
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    greater_is_better=True,
    fp16=False,  # Disable mixed precision for stability
    gradient_accumulation_steps=8  # Increase gradient accumulation to make up for smaller batch size
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

# Train the Model
trainer.train()

# Evaluate on the validation dataset
print("Evaluating on validation dataset:")
validation_results = trainer.evaluate()
print(validation_results)

# Evaluate on the test dataset
print("Evaluating on test dataset:")
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(test_results)

# Save the Model
model.save_pretrained("./sentence_correction_model")
tokenizer.save_pretrained("./sentence_correction_model")


Using device: cpu


Reading Incorrect Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 4341356.17it/s]
Reading Correct Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 5684856.07it/s]
Reading Incorrect Sentences: 100%|██████████| 521552/521552 [00:00<00:00, 6519406.22it/s]
Reading Correct Sentences: 100%|██████████| 521552/521552 [00:00<00:00, 6120418.22it/s]
Reading Incorrect Sentences: 100%|██████████| 13187/13187 [00:00<00:00, 6980977.77it/s]
Reading Correct Sentences: 100%|██████████| 13187/13187 [00:00<00:00, 7166401.51it/s]
Batch Tokenizing Sentences: 100%|██████████| 2/2 [00:00<00:00, 20.64it/s]
Batch Tokenizing Sentences: 100%|██████████| 1044/1044 [00:33<00:00, 31.23it/s]
Batch Tokenizing Sentences: 100%|██████████| 27/27 [00:01<00:00, 24.84it/s]
 13%|█▎        | 50/375 [00:31<03:09,  1.71it/s]

{'loss': 0.9143, 'grad_norm': 26.491437911987305, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}


 27%|██▋       | 100/375 [01:00<02:43,  1.68it/s]

{'loss': 0.168, 'grad_norm': 7.863282680511475, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}


 33%|███▎      | 125/375 [01:15<02:27,  1.69it/s]

KeyboardInterrupt: 

In [None]:
# Make Predictions with the Trained Model
model = T5ForConditionalGeneration.from_pretrained("./sentence_correction_model").to(device)
tokenizer = AutoTokenizer.from_pretrained("./sentence_correction_model")

# Predict for a new incorrect sentence
incorrect_sentence = "उसकी प्रतिभा की गहराई किसी अनजाने समुद्र जैसा है"

# Tokenize the incorrect sentence
inputs = tokenizer(
    incorrect_sentence,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=64
).to(device)

# Generate corrected sentence with adjusted parameters
with torch.no_grad():
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        max_length=128,  # Increased max_length to ensure complete generation
        num_beams=7,     # Increased num_beams for a more exhaustive search
        early_stopping=True
    )

# Decode the output to get the corrected sentence
corrected_sentence = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"Corrected Sentence: {corrected_sentence}")



Corrected Sentence:      .


: 