
# **Install libraries**

In [None]:
!pip install accelerate
!pip install rouge_score
!pip install sentencepiece
!pip install accelerate==0.20.1
!pip install transformers[torch]==4.16.0
!pip install datasets tqdm pandas
!pip install wandb

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

In [6]:
# Check we have a GPU and check the memory size of the GPU
!nvidia-smi

Sun Jan  7 19:22:15 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# **Import packages**

In [1]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Set a seed**

In [2]:
import random
import numpy as np
import torch
import datasets

In [3]:
def set_seed(seed):
    """Seed Python's `random`, NumPy's global RNG and PyTorch with the same value.

    Args:
        seed: Integer seed handed unchanged to each library's seeding function.
    """
    # Same order as before: stdlib first, then NumPy, then PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

set_seed(42)



```
# This is formatted as code
```

# ***C4-200M dataset***

In [4]:
pd.set_option('display.max_colwidth', None)

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/c4_200m_550k.csv')
df.shape

(550000, 2)

In [6]:
df.head()

Unnamed: 0,input,output
0,The steps below describe how to remove data for one or more specifies areas and how to put on the data from a snapshot to the index,The steps below describe how to remove data for one ore more specific areas and how to put back the data from a snapshot to the index.
1,When I wake up it\'s usually comes out dreamsI\'m thinking so my thoughts are very weird.,When I wake up it\'s usually dreams I\'m thinking about so my thoughts are very weird.
2,One of the cardinal factors to be considered trying to decide on which kind of shipping to customer settle is the! market difference.,One of the cardinal factors to consider when trying to decide on which kind of shipping to settle for is the market difference.
3,Answers » Regions » Is in Nagorno-Karabakt region that part in Armenia?,Answers » Regions » Is Nagorno-Karabakh region part of Armenia?
4,Flaneuring in fun at maple creek SK!,Flaneuring Fun in Maple Creek SK!


In [7]:
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
  )

from torch.utils.data import Dataset, DataLoader

In [8]:
# Load the tokenizer and the model from the same checkpoint name so they match.
# `tokenizer` and `model` are module-level names that later cells read directly.
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [9]:
def calc_token_len(example):
    """Return how many token ids the module-level `tokenizer` produces for `example`.

    Args:
        example: Text passed as-is to the tokenizer.

    Returns:
        Length of the `input_ids` sequence of the tokenizer output.
    """
    encoded = tokenizer(example)
    token_ids = encoded.input_ids
    return len(token_ids)

In [10]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.10, shuffle=True)
train_df.shape, test_df.shape

((495000, 2), (55000, 2))

In [11]:
# Measure the tokenized length of every evaluation input to help choose a maximum sequence length.
# Only the lengths are kept here; nothing is fed to the model in this cell.
test_df['input_token_len'] = test_df['input'].apply(calc_token_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (563 > 512). Running this sequence through the model will result in indexing errors


In [12]:
test_df.head()

Unnamed: 0,input,output,input_token_len
486496,You are My Fantasy. And My Reality. So good. ...Sated.,"You are My Fantasy. And, My Reality. So good. ...Sated.",18
58208,I have always found it be good with.,I have always found it to be good.,10
265247,"The average rock you would pick up has an SG of about 2.75 because all the earth’s bulk crust is made up of quartz, calcite & feldspar.","The average rock you would pick up has an SG of about 2.75. Because most of the earth’s crust is made up of quartz, calcite & feldspar.",40
499297,"Bronzes, Mirrors and paintings, fine some art of Sydney.","Bronzes, Mirrors and Paintings, some of the finest in Sydney.",15
124684,Is this how america became under Donald Trump?,Is this what America has become under Donald Trump?,12


In [13]:
test_df['input_token_len'].describe()

count    55000.000000
mean        33.536127
std         25.961464
min          2.000000
25%         17.000000
50%         27.000000
75%         42.000000
max        918.000000
Name: input_token_len, dtype: float64

### We will use a maximum token length of 64, which covers the vast majority of examples (the 75th percentile of input lengths is 42 tokens)

In [14]:
from datasets import Dataset
# Convert both pandas splits into `datasets.Dataset` objects.
# NOTE(review): `Dataset` means `datasets.Dataset` in this cell only; a later cell
# re-imports `Dataset` from `torch.utils.data`, which rebinds the name.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [15]:
test_dataset

Dataset({
    features: ['input', 'output', 'input_token_len', '__index_level_0__'],
    num_rows: 55000
})

### Load the Dataset

In [16]:
from torch.utils.data import Dataset, DataLoader
class GrammarDataset(Dataset):
    """Dataset wrapper that tokenizes (input, output) sentence pairs on access.

    Every element of `dataset` must be a mapping with an 'input' entry (the
    sentence to correct) and an 'output' entry (the corrected sentence).

    Args:
        dataset: Indexable collection of examples supporting `len()` and
            `dataset[index]`.
        tokenizer: Callable used for both sentences. It is called with the text
            plus `padding`, `truncation`, `max_length` and
            `return_attention_mask` keyword arguments and must return a mapping
            containing 'input_ids' and 'attention_mask'.
        print_text: When True, print the length of each returned field every
            time an item is fetched (debugging aid).
        max_len: Maximum number of tokens kept per sentence; longer sentences
            are truncated.
    """

    def __init__(self, dataset, tokenizer, print_text=False, max_len=64):
        self.dataset = dataset
        # Padding is left to the batch collator by default.
        self.pad_to_max_length = False
        self.tokenizer = tokenizer
        self.print_text = print_text
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize one sentence with this dataset's tokenizer and length limit."""
        padding = 'max_length' if self.pad_to_max_length else False
        # Use the tokenizer given to the constructor (not a module-level one),
        # and request truncation explicitly instead of relying on the
        # tokenizer's implicit fallback when only `max_length` is given.
        return self.tokenizer(text,
                              padding=padding,
                              truncation=True,
                              max_length=self.max_len,
                              return_attention_mask=True)

    def tokenize_data(self, example):
        """Tokenize one example into model inputs and labels.

        Args:
            example: Mapping with 'input' and 'output' text entries.

        Returns:
            Dict with 'input_ids' and 'attention_mask' from the tokenized input
            sentence and 'labels' holding the token ids of the target sentence.
        """
        input_, target_ = example['input'], example['output']

        tokenized_inputs = self._encode(input_)
        tokenized_targets = self._encode(target_)

        return {
            "input_ids": tokenized_inputs['input_ids'],
            "attention_mask": tokenized_inputs['attention_mask'],
            "labels": tokenized_targets['input_ids'],
        }

    def __getitem__(self, index):
        inputs = self.tokenize_data(self.dataset[index])

        if self.print_text:
            for k in inputs.keys():
                print(k, len(inputs[k]))

        return inputs

In [17]:
# Smoke-test the wrapper on a single evaluation example.
# The third argument (print_text=True) also prints the length of each returned field.
dataset = GrammarDataset(test_dataset, tokenizer, True)
print(dataset[121])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


input_ids 20
attention_mask 20
labels 24
{'input_ids': [71, 973, 24, 14079, 24067, 38, 96, 77, 221, 3728, 121, 19, 59, 2930, 7509, 640, 569, 2287, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [71, 973, 24, 14079, 3, 89, 12578, 887, 21, 96, 77, 221, 3728, 121, 3270, 19, 59, 2930, 7509, 640, 569, 2287, 5, 1]}


### Define Evaluator

In [18]:
from datasets import load_metric
# ROUGE scorer, kept as a module-level name for the metric function used during evaluation.
# NOTE(review): this call prints a warning about a future `datasets` release (see the cell
# output); `load_metric` may not be available in newer versions — confirm before upgrading.
rouge_metric = load_metric("rouge")

  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


### Train Model

In [19]:
# Batch collator: padding='longest' pads each batch to its longest sequence,
# and return_tensors='pt' returns PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest', return_tensors='pt')

In [20]:
# Training configuration for the Seq2SeqTrainer.
# Effective train batch size on one device = batch_size (16) x gradient_accumulation_steps (6) = 96.
batch_size = 16
args = Seq2SeqTrainingArguments(output_dir="/content/drive/MyDrive/c4_200m/weights",
                        # evaluate on a step schedule (see eval_steps below)
                        evaluation_strategy="steps",
                        per_device_train_batch_size=batch_size,
                        per_device_eval_batch_size=batch_size,
                        learning_rate=2e-5,
                        num_train_epochs=1,
                        weight_decay=0.01,
                        # keep at most two checkpoints in output_dir
                        save_total_limit=2,
                        # evaluation metrics are computed on generated sequences
                        predict_with_generate=True,
                        fp16 = True,
                        gradient_accumulation_steps = 6,
                        # evaluation and checkpointing share the same 500-step cadence
                        # NOTE(review): presumably load_best_model_at_end relies on these two matching — confirm
                        eval_steps = 500,
                        save_steps = 500,
                        load_best_model_at_end=True,
                        # NOTE(review): absolute path at the filesystem root, unlike output_dir on Drive — confirm intended
                        logging_dir="/logs",
                        report_to="wandb")

In [21]:
import nltk
nltk.download('punkt')
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (as percentages) and the mean generated length.

    Args:
        eval_pred: Pair `(predictions, labels)` of token-id arrays. Label
            positions equal to -100 are swapped for the pad token id before
            decoding.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure times 100, plus
        'gen_len' (mean count of non-pad tokens per prediction); every value is
        rounded to 4 decimals.
    """
    generated_ids, label_ids = eval_pred
    pad_id = tokenizer.pad_token_id

    def _one_sentence_per_line(text):
        # Rouge expects a newline after each sentence
        return "\n".join(nltk.sent_tokenize(text.strip()))

    pred_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    # -100 cannot be decoded, so map those label positions to the pad token first.
    label_ids = np.where(label_ids != -100, label_ids, pad_id)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    pred_texts = [_one_sentence_per_line(text) for text in pred_texts]
    label_texts = [_one_sentence_per_line(text) for text in label_texts]

    rouge_scores = rouge_metric.compute(predictions=pred_texts, references=label_texts, use_stemmer=True)

    metrics = {}
    for rouge_name, score in rouge_scores.items():
        metrics[rouge_name] = score.mid.fmeasure * 100

    # Mean number of non-pad tokens across the generated sequences.
    non_pad_counts = [np.count_nonzero(row != pad_id) for row in generated_ids]
    metrics["gen_len"] = np.mean(non_pad_counts)

    return {name: round(value, 4) for name, value in metrics.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
# Build the Seq2SeqTrainer from the model, training arguments, collator and metric function.
# Both splits are wrapped in GrammarDataset, so examples are tokenized when they are fetched.
trainer = Seq2SeqTrainer(model=model,
                args=args,
                train_dataset= GrammarDataset(train_dataset, tokenizer),
                eval_dataset=GrammarDataset(test_dataset, tokenizer),
                tokenizer=tokenizer,
                data_collator=data_collator,
                compute_metrics=compute_metrics)

Using amp half precision backend


In [23]:
trainer.train()

***** Running training *****
  Num examples = 495000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 6
  Total optimization steps = 5156
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,0.7635,0.627395,71.3992,60.9855,70.6459,70.6708,17.3042


***** Running Evaluation *****
  Num examples = 55000
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/c4_200m/weights/checkpoint-500
Configuration saved in /content/drive/MyDrive/c4_200m/weights/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/c4_200m/weights/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/c4_200m/weights/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/c4_200m/weights/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 55000
  Batch size = 16


KeyboardInterrupt: 

In [24]:
trainer.save_model('t5_gec_model')

Saving model checkpoint to t5_gec_model
Configuration saved in t5_gec_model/config.json
Model weights saved in t5_gec_model/pytorch_model.bin
tokenizer config file saved in t5_gec_model/tokenizer_config.json
Special tokens file saved in t5_gec_model/special_tokens_map.json


In [33]:
!zip -r 't5_gec_model.zip' 't5_gec_model'

  adding: t5_gec_model/ (stored 0%)
  adding: t5_gec_model/pytorch_model.bin (deflated 8%)
  adding: t5_gec_model/tokenizer_config.json (deflated 76%)
  adding: t5_gec_model/special_tokens_map.json (deflated 83%)
  adding: t5_gec_model/spiece.model (deflated 48%)
  adding: t5_gec_model/config.json (deflated 62%)
  adding: t5_gec_model/training_args.bin (deflated 51%)


In [None]:
!mv t5_gec_model.zip /content/drive/MyDrive/c4_200m

I have uploaded this model to the Hugging Face Model Hub, so we can run inference with it directly.

## Testing

In [25]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Load the published checkpoint for inference and move the model to the GPU when one is available.
# NOTE: this rebinds the module-level `model_name`, `tokenizer` and `model` names used by the
# training cells above; after this cell they refer to the downloaded checkpoint.
model_name = 'deep-learning-analytics/GrammarCorrector'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def correct_grammar(input_text,num_return_sequences):
  """Generate grammar-corrected candidates for one sentence using beam search.

  Uses the module-level `tokenizer`, `model` and `torch_device`.

  Args:
    input_text: Sentence to correct; it is truncated/padded to 64 tokens.
    num_return_sequences: Number of candidate corrections to return.

  Returns:
    List of decoded candidate strings with special tokens removed.
  """
  # generate() cannot return more sequences than it has beams, so widen the
  # beam beyond the default of 4 only when more candidates are requested.
  num_beams = max(4, num_return_sequences)
  batch = tokenizer([input_text],truncation=True,padding='max_length',max_length=64, return_tensors="pt").to(torch_device)
  # No `temperature` here: sampling is not enabled, so it had no effect on the
  # beam-search output and only suggested randomness that was not there.
  translated = model.generate(**batch,max_length=64,num_beams=num_beams, num_return_sequences=num_return_sequences)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpti7mz_r2


Downloading:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

storing https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/dc750a9a9486f2dcf2146ef11943316eb8da4d4dd91ab0528efdb7d8a0c83e52.4c6871a59a0298c737e0823ca4aa7e1fa130bd032f33c276d8bbab13b157ef26
creating metadata file for /root/.cache/huggingface/transformers/dc750a9a9486f2dcf2146ef11943316eb8da4d4dd91ab0528efdb7d8a0c83e52.4c6871a59a0298c737e0823ca4aa7e1fa130bd032f33c276d8bbab13b157ef26
https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/spiece.model not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpa03vr781


Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

storing https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/spiece.model in cache at /root/.cache/huggingface/transformers/abc7e3d8e075233ce511ed51fe046c478bd84675e124e3444a12bf37cf37a0ea.d6f0605ae3d57070be74b4c12206072ab332922acff822e6b5458691dbda7551
creating metadata file for /root/.cache/huggingface/transformers/abc7e3d8e075233ce511ed51fe046c478bd84675e124e3444a12bf37cf37a0ea.d6f0605ae3d57070be74b4c12206072ab332922acff822e6b5458691dbda7551
https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp2cbmucv9


Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

storing https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/special_tokens_map.json in cache at /root/.cache/huggingface/transformers/fd4d20134b5b76af052d9e6abd5751e2886f57ff167900bf3654839bf1587f56.c94798918c92ded6aeef2d2f0e666d2cc4145eca1aa6e1336fde07f2e13e2f46
creating metadata file for /root/.cache/huggingface/transformers/fd4d20134b5b76af052d9e6abd5751e2886f57ff167900bf3654839bf1587f56.c94798918c92ded6aeef2d2f0e666d2cc4145eca1aa6e1336fde07f2e13e2f46
https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpf81vol90


Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

storing https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/5e630397d39ba3ebf132d859d7d276ed68debe88ca9240ae2291f3e142509707.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529
creating metadata file for /root/.cache/huggingface/transformers/5e630397d39ba3ebf132d859d7d276ed68debe88ca9240ae2291f3e142509707.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529
loading file https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/spiece.model from cache at /root/.cache/huggingface/transformers/abc7e3d8e075233ce511ed51fe046c478bd84675e124e3444a12bf37cf37a0ea.d6f0605ae3d57070be74b4c12206072ab332922acff822e6b5458691dbda7551
loading file https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/special_tokens_map.json

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

storing https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/69acb7accf375d084bf55951b3d669751cda6736a5ca11dccf78cff4b11bf704.2cd6c2ba104d17bd648d124109c7f8ea74a8a75fdfb8f4e22b36a6413a83c663
creating metadata file for /root/.cache/huggingface/transformers/69acb7accf375d084bf55951b3d669751cda6736a5ca11dccf78cff4b11bf704.2cd6c2ba104d17bd648d124109c7f8ea74a8a75fdfb8f4e22b36a6413a83c663
loading configuration file https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/69acb7accf375d084bf55951b3d669751cda6736a5ca11dccf78cff4b11bf704.2cd6c2ba104d17bd648d124109c7f8ea74a8a75fdfb8f4e22b36a6413a83c663
Model config T5Config {
  "_name_or_path": "deep-learning-analytics/GrammarCorrector",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dr

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

storing https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/0ac2c3f4f789323d5a6ebec02da2a362c18594e6b92756e7ccfae83fbf87488a.72138e9413fef7b7239e513f64f412c9c9fa9309d0321becfbf645afbef17b76
creating metadata file for /root/.cache/huggingface/transformers/0ac2c3f4f789323d5a6ebec02da2a362c18594e6b92756e7ccfae83fbf87488a.72138e9413fef7b7239e513f64f412c9c9fa9309d0321becfbf645afbef17b76
loading weights file https://huggingface.co/deep-learning-analytics/GrammarCorrector/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/0ac2c3f4f789323d5a6ebec02da2a362c18594e6b92756e7ccfae83fbf87488a.72138e9413fef7b7239e513f64f412c9c9fa9309d0321becfbf645afbef17b76
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at deep-learning-analytics/GrammarCorrector.
If y

In [26]:
text = 'He are moving here.'
print(correct_grammar(text, num_return_sequences=2))

['He is moving here.', "He's moving here."]


In [32]:
text = 'he an great doc'
print(correct_grammar(text, num_return_sequences=1))

['he is a great doc.']
