In [None]:
!pip install transformers[torch]
!pip install SentencePiece


Collecting transformers[torch]
  Using cached transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
Collecting filelock (from transformers[torch])
  Using cached filelock-3.13.4-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers[torch])
  Using cached huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting numpy>=1.17 (from transformers[torch])
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting regex!=2019.12.17 (from transformers[torch])
  Downloading regex-2024.4.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m925.8 kB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers[torch])
  Using cached tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from tra

In [None]:
# Importing required libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.cuda.amp import GradScaler, autocast
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the Hugging Face access token from the environment instead of hard-coding it:
# a token embedded in a notebook is leaked to everyone who can read the file.
# NOTE(review): any token that was previously committed in this cell should be revoked.
# When HF_TOKEN is unset this evaluates to None; transformers is then expected to fall
# back to locally cached login credentials — TODO confirm for the installed version.
import os

hf_token = os.environ.get("HF_TOKEN")

In [None]:
# Gradient scaler for mixed-precision training. In this notebook it is referenced only by
# the commented-out autocast variant of train(); the active train() does not use it.
scaler = GradScaler()


In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [None]:
# Custom dataset that reads the dataframe and tokenizes each row so a DataLoader can feed
# the model during fine-tuning and prediction.
class CustomDataset(Dataset):
    """Map-style dataset yielding tokenized (source text, target summary) pairs.

    Args:
        dataframe: frame exposing a ``ctext`` column (model input) and a ``text``
            column (target summary).
        tokenizer: object providing ``batch_encode_plus``; it is called with
            ``return_tensors='pt'`` and must return ``input_ids`` and ``attention_mask``.
        source_len: length every encoded source is padded/truncated to.
        summ_len: length every encoded target is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def _encode(self, raw, max_length):
        """Collapse whitespace in ``raw`` and encode it to exactly ``max_length`` tokens."""
        cleaned = ' '.join(str(raw).split())
        # padding/truncation are requested explicitly: `pad_to_max_length` is deprecated and
        # relying on implicit truncation makes the tokenizer emit a warning.
        return self.tokenizer.batch_encode_plus(
            [cleaned],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the encoded tensors for the row at *position* ``index``."""
        # Positional lookup: a DataLoader hands out positions 0..len-1, which only coincide
        # with index labels when the frame has a default RangeIndex.
        source = self._encode(self.ctext.iloc[index], self.source_len)
        target = self._encode(self.text.iloc[index], self.summ_len)

        # squeeze(0) removes only the batch axis, so a length-1 sequence keeps its shape.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            # kept for backward compatibility: same content as 'target_ids'
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
# Training function, called once per epoch from the main loop.
# The model is put into train mode and every batch of the training loader is passed through it.

def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one training pass over ``loader`` and update ``model`` in place.

    Args:
        epoch: epoch number, used only in the progress message.
        tokenizer: object whose ``pad_token_id`` marks padding in the target ids.
        model: sequence-to-sequence model called with ``input_ids``, ``attention_mask``
            and ``labels``; the first element of its output is used as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``source_ids``, ``source_mask`` and ``target_ids``.
        optimizer: optimizer stepped once per batch.

    Returns:
        None.
    """
    model.train()
    for step, data in enumerate(loader, 0):
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)
        y = data['target_ids'].to(device, dtype=torch.long)

        # Labels are the complete target sequence; padded positions are set to -100 so the
        # loss ignores them.
        lm_labels = y.clone()
        lm_labels[y == tokenizer.pad_token_id] = -100

        # No decoder_input_ids are passed: T5ForConditionalGeneration derives them by
        # shifting `labels` one step to the right behind the decoder start token (see the
        # Hugging Face T5 documentation). Slicing the targets by hand (y[:, :-1] as input,
        # y[:, 1:] as labels) never trains the prediction of the first target token, so
        # generation tends to drop the leading word.
        outputs = model(input_ids=ids, attention_mask=mask, labels=lm_labels)
        loss = outputs[0]

        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [None]:
# def train(epoch, tokenizer, model, device, loader, optimizer):
#     model.train()
#     for _, data in enumerate(loader, 0):
#         y = data['target_ids'].to(device, dtype=torch.long)
#         y_ids = y[:, :-1].contiguous()
#         lm_labels = y[:, 1:].clone().detach()
#         lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
#         ids = data['source_ids'].to(device, dtype=torch.long)
#         mask = data['source_mask'].to(device, dtype=torch.long)

#         # Automatic Mixed Precision context
#         with autocast():
#             outputs = model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=lm_labels)
#             loss = outputs.loss

#         if _ % 500 == 0:
#             print(f'Epoch: {epoch}, Loss:  {loss.item()}')

#         optimizer.zero_grad()
#         # Use scaler to scale the loss for backward pass
#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()


In [None]:
def validate(epoch, tokenizer, model, device, loader, max_length=150, num_beams=2):
    """Generate text for every batch in ``loader`` and decode predictions and targets.

    Args:
        epoch: accepted for symmetry with ``train``; not used inside the function.
        tokenizer: object providing ``decode`` for turning token ids into text.
        model: model providing ``eval`` and ``generate``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``source_ids``, ``source_mask`` and ``target_ids``.
        max_length: maximum length passed to ``model.generate``. Defaults to 150, the value
            that used to be hard-coded; callers whose targets are longer can raise it so
            generated text is not cut shorter than the references.
        num_beams: beam width passed to ``model.generate`` (default 2, as before).

    Returns:
        Tuple ``(predictions, actuals)`` of two lists of decoded strings, in loader order.
    """
    model.eval()
    predictions = []
    actuals = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for step, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype=torch.long)
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=max_length,
                num_beams=num_beams,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )
            preds = [
                tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for g in generated_ids
            ]
            target = [
                tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for t in y
            ]
            # Progress message every 100 batches.
            if step % 100 == 0:
                print(f'Completed {step}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
# Settings shared by the data preparation, training and evaluation cells
SEED = 42                # random seed for reproducibility
LEARNING_RATE = 1e-4     # learning rate handed to the optimizer
TRAIN_EPOCHS = 10        # number of fine-tuning epochs
VAL_EPOCHS = 1           # number of passes over the validation set
TRAIN_BATCH_SIZE = 4     # batch size of the training dataloader
VALID_BATCH_SIZE = 4     # batch size of the validation dataloader
MAX_LEN = 256            # token length of the encoded source text
SUMMARY_LEN = 256        # token length of the encoded target summary

In [None]:
# Reproducibility: seed NumPy and PyTorch and request deterministic cuDNN kernels
np.random.seed(SEED)       # numpy random seed
torch.manual_seed(SEED)    # pytorch random seed
torch.backends.cudnn.deterministic = True

# Tokenizer for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-large")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Load the raw training data from a CSV in the working directory and preview the first rows
df = pd.read_csv('training_ds.csv')
df.head()

Unnamed: 0,ID,Item Number,Item Text,Summary
0,0,Item 1,Item 1. Business\nCompany Background\nThe Comp...,"**Summary:**\n\nApple Inc. designs, manufactur..."
1,1,Item 1A,"Item 1A. Risk Factors\nThe Company’s business,...",The text outlines various risk factors that co...
2,2,Item 1B,Item 1B. Unresolved Staff Comments\nNone.,The text indicates there are no unresolved sta...
3,3,Item 2,Item 2. Properties\nThe Company’s headquarters...,"As of September 25, 2021, the company's headqu..."
4,4,Item 3,Item 3. Legal Proceedings\nThe Company is subj...,The text describes legal proceedings involving...


In [None]:
# Keep only the two columns the dataset class reads ('text' = summary, 'ctext' = source)
# and prepend the "summarize: " task prefix to every source text.
df = (
    df.rename(columns={'Item Text': 'ctext', 'Summary': 'text'})
      .loc[:, ['text', 'ctext']]
      .assign(ctext=lambda frame: 'summarize: ' + frame['ctext'])
)
print(df.head())

                                                text  \
0  **Summary:**\n\nApple Inc. designs, manufactur...   
1  The text outlines various risk factors that co...   
2  The text indicates there are no unresolved sta...   
3  As of September 25, 2021, the company's headqu...   
4  The text describes legal proceedings involving...   

                                               ctext  
0  summarize: Item 1. Business\nCompany Backgroun...  
1  summarize: Item 1A. Risk Factors\nThe Company’...  
2  summarize: Item 1B. Unresolved Staff Comments\...  
3  summarize: Item 2. Properties\nThe Company’s h...  
4  summarize: Item 3. Legal Proceedings\nThe Comp...  


In [None]:
# Random 80/20 split: sample the training rows, leave the remaining rows for validation,
# then give both frames a fresh 0..n-1 index.
train_size = 0.8
train_dataset = df.sample(frac=train_size, random_state=SEED)
val_dataset = df.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Report the shape of the full frame and of each split
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

FULL Dataset: (2248, 2)
TRAIN Dataset: (1798, 2)
TEST Dataset: (450, 2)


In [None]:
# Wrap both splits in CustomDataset so they can be handed to a DataLoader
training_set, val_set = (
    CustomDataset(split, tokenizer, MAX_LEN, SUMMARY_LEN)
    for split in (train_dataset, val_dataset)
)

In [None]:
# Dataloaders for the training and validation stages: training batches are shuffled,
# validation batches keep the dataset order.
training_loader = DataLoader(
    dataset=training_set,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_set,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Load the pretrained t5-large checkpoint with its conditional-generation head and move it
# to the selected device.
model = T5ForConditionalGeneration.from_pretrained("t5-large").to(device)

# Adam optimizer over all model parameters, used to update the weights during fine-tuning.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


In [None]:
# Training loop: one call to train() per epoch, TRAIN_EPOCHS times in total
# (original author's note: taking around 22 mins)
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initiating Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  9.496197700500488
Epoch: 1, Loss:  2.243647813796997
Epoch: 2, Loss:  1.08051598072052
Epoch: 3, Loss:  0.7972398996353149
Epoch: 4, Loss:  1.1123881340026855
Epoch: 5, Loss:  0.42818599939346313
Epoch: 6, Loss:  0.7567224502563477
Epoch: 7, Loss:  0.6454167366027832
Epoch: 8, Loss:  0.3691837787628174
Epoch: 9, Loss:  0.8094171285629272


In [None]:
# Generate summaries for the validation set and collect them next to the reference texts.
# predictions, actuals and final_df are overwritten on every iteration, so only the result
# of the last pass is kept.
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})

Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0
Completed 100


In [None]:
# Plain-text preview of the first generated summaries next to their reference texts
print(final_df.head())

                                      Generated Text  \
0  Company's business, reputation, results of ope...   
1  Company is involved in legal proceedings and c...   
2  information regarding executive compensation f...   
3  information regarding security ownership of ce...   
4  information regarding Principal Accountant Fee...   

                                         Actual Text  
0  The text outlines various risk factors that co...  
1  The text describes legal proceedings involving...  
2  The required information on executive compensa...  
3  The text states that the details regarding the...  
4  This section of the document provides informat...  


In [None]:
#to evaluate the generated text using metrics like "bleu" and "rouge"
!pip install evaluate
import evaluate
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [None]:
# BLEU: every prediction is scored against a single-element list holding its reference text
references = [[reference_text] for reference_text in actuals]
metric = evaluate.load("bleu")
results = metric.compute(
    predictions=predictions,
    references=references,
    tokenizer=word_tokenize,
)
results

In [None]:
!pip install rouge_score
metric = evaluate.load('rouge')
results = metric.compute(predictions=predictions, references=references, tokenizer=word_tokenize)
results

In [None]:
# Persist the fine-tuned model and its tokenizer to two separate local directories
model.save_pretrained('t5_large_epochs_10')
tokenizer.save_pretrained('t5_large_tokenizer_epochs_10')


('t5_large_tokenizer_epochs_10/tokenizer_config.json',
 't5_large_tokenizer_epochs_10/special_tokens_map.json',
 't5_large_tokenizer_epochs_10/spiece.model',
 't5_large_tokenizer_epochs_10/added_tokens.json')

In [None]:
# Name of the Hugging Face Hub repository; combined with the account name in hub_model_id below
repo_name = 'T5-large-10K-summarization'

In [None]:
# Trainer configuration; in this notebook it is only used to publish the model to the Hub
args = TrainingArguments(
    output_dir='./results',                   # directory for model checkpoints
    push_to_hub=True,                         # enable pushing to the Hugging Face Hub
    hub_model_id=f'yatharth97/{repo_name}',   # target repository on the Hub
    hub_token=hf_token,                       # Hugging Face API token
)

In [None]:
# Wrap the fine-tuned model and tokenizer in a Trainer; no datasets are passed because the
# notebook only calls trainer.push_to_hub afterwards.
trainer = Trainer(model=model, tokenizer=tokenizer, args=args)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Upload the model and tokenizer files to the Hub repository configured in `args`.
# NOTE(review): the commit message below is a placeholder; the recorded output shows it was
# used verbatim for the commit.
trainer.push_to_hub(
    commit_message="Commit message describing the changes made",
    blocking=True
)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]
[A

[A[A

training_args.bin: 100%|██████████| 4.98k/4.98k [00:00<00:00, 11.5kB/s]
spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.37MB/s]
model.safetensors: 100%|██████████| 2.95G/2.95G [01:42<00:00, 28.9MB/s]

Upload 3 LFS files: 100%|██████████| 3/3 [01:42<00:00, 34.23s/it] 


CommitInfo(commit_url='https://huggingface.co/yatharth97/T5-large-10K-summarization/commit/6561bb4a726fc8d79c9ed87e8da1b4cfffdfbc97', commit_message='Commit message describing the changes made', commit_description='', oid='6561bb4a726fc8d79c9ed87e8da1b4cfffdfbc97', pr_url=None, pr_revision=None, pr_num=None)