In [1]:
!pip install transformers 
!pip install SentencePiece 


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 31.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 87.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 91.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SentencePiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB

In [2]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 29.3 MB/s 
Collecting dill<0.3.6
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 5.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 86.6 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 71.4 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 82.8 MB/s 
Collecting multiprocess
  Downloading multiproce

In [3]:
# Importing required libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [4]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'


In [5]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.highlights
        self.ctext = self.data.article

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [6]:
# Creating the training function. This will be called in the main process. It is run depending on the epoch value.
# The model is put into train mode and then we enumerate over the training loader and passed to the defined network 

def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        
        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [7]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [8]:
# Defining some key variables that will be used later on in the training  
TRAIN_BATCH_SIZE = 28    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 28    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 1        # number of epochs to train (default: 10)
VAL_EPOCHS = 1 
LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
SEED = 42               # random seed (default: 42)
MAX_LEN = 512
SUMMARY_LEN = 150

In [9]:
# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-base",model_max_length=MAX_LEN)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

In [10]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail",'3.0.0')

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [12]:
train_data = pd.DataFrame(dataset['train'])
#test_data = pd.DataFrame(dataset['test'])
val_data = pd.DataFrame(dataset['validation'])
train_data.head()

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a


In [13]:
train_data = train_data.drop(['id'], axis=1)
train_data = train_data.reset_index(drop=True)
#test_data = test_data.drop(['id'], axis=1)
#test_data = test_data.reset_index(drop=True)
val_data = val_data.drop(['id'], axis=1)
val_data = val_data.reset_index(drop=True)

In [14]:
train_data.head()

Unnamed: 0,article,highlights
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."


In [15]:
train_data.article = 'summarize: ' + train_data.article # add prefix "summarize: " to input indicating the task
print(train_data.head())

                                             article  \
0  summarize: LONDON, England (Reuters) -- Harry ...   
1  summarize: Editor's note: In our Behind the Sc...   
2  summarize: MINNEAPOLIS, Minnesota (CNN) -- Dri...   
3  summarize: WASHINGTON (CNN) -- Doctors removed...   
4  summarize: (CNN)  -- The National Football Lea...   

                                          highlights  
0  Harry Potter star Daniel Radcliffe gets £20M f...  
1  Mentally ill inmates in Miami are housed on th...  
2  NEW: "I thought I was going to die," driver sa...  
3  Five small polyps found during procedure; "non...  
4  NEW: NFL chief, Atlanta Falcons owner critical...  


In [16]:
#test_data.article = 'summarize: ' + test_data.article # add prefix "summarize: " to input indicating the task
val_data.article = 'summarize: ' + val_data.article # add prefix "summarize: " to input indicating the task

In [17]:
#print(test_data.head())
print(val_data.head())

                                             article  \
0  summarize: (CNN)Share, and your gift will be m...   
1  summarize: (CNN)On the 6th of April 1996, San ...   
2  summarize: (CNN)French striker Bafetimbi Gomis...   
3  summarize: (CNN)It was an act of frustration p...   
4  summarize: (CNN)A Pennsylvania community is pu...   

                                          highlights  
0  Zully Broussard decided to give a kidney to a ...  
1  The 20th MLS season begins this weekend .\nLea...  
2  Bafetimbi Gomis collapses within 10 minutes of...  
3  Rory McIlroy throws club into water at WGC Cad...  
4  Cayman Naib, 13, hasn't been heard from since ...  


In [18]:
train_size = 0.3
train_data_r=train_data.sample(frac=train_size,random_state = SEED)
train_data_reduced = train_data_r.reset_index(drop=True)

print("TRAIN Dataset: {}".format(train_data_reduced.shape))
#print("TEST Dataset: {}".format(test_data.shape))
print("Validation Dataset: {}".format(val_data.shape))

TRAIN Dataset: (86134, 2)
Validation Dataset: (13368, 2)


In [19]:
train_data_reduced.head()

Unnamed: 0,article,highlights
0,summarize: Nasa has warned of an impending ast...,2004 BL86 will pass about three times the dist...
1,"summarize: BAGHDAD, Iraq (CNN) -- Iraq's most ...","Iraqi Islamic Party calls Quran incident ""blat..."
2,summarize: By . David Kent . Andy Carroll has ...,Carroll takes to Instagram to post selfie ahea...
3,summarize: Los Angeles (CNN) -- Los Angeles ha...,Pop stars from all over Europe are setting the...
4,summarize: London (CNN) -- Few shows can claim...,NEW: Young athletes light the Olympic cauldron...


In [20]:
# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(train_data_reduced, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(val_data, tokenizer, MAX_LEN, SUMMARY_LEN)

In [21]:
# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
     }

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

In [22]:
# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)


Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [23]:
# Training loop
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initiating Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  6.8505048751831055
Epoch: 0, Loss:  1.6959205865859985
Epoch: 0, Loss:  1.7519334554672241
Epoch: 0, Loss:  1.6832270622253418
Epoch: 0, Loss:  1.768080711364746
Epoch: 0, Loss:  1.5546128749847412
Epoch: 0, Loss:  1.7944238185882568


In [25]:
# Validation loop and saving the resulting file with predictions and acutals in a dataframe.
# Saving the dataframe as predictions.csv
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv('./predictions.csv')
    print('Output Files generated for review')

Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0
Completed 100
Completed 200
Completed 300
Completed 400
Output Files generated for review


In [26]:
final_df

Unnamed: 0,Generated Text,Actual Text
0,"""Thanks for all the support and prayers,"" Zull...",Zully Broussard decided to give a kidney to a ...
1,the MLS is set to mark the beginning of its 20...,The 20th MLS season begins this weekend. Leagu...
2,"Bafetimbi Gomis says he is ""feeling well"" afte...",Bafetimbi Gomis collapses within 10 minutes of...
3,Rory McIlroy pulls his second shot on the eigh...,Rory McIlroy throws club into water at WGC Cad...
4,"Cayman Naib, 13, was last seen wearing a gray ...","Cayman Naib, 13, hasn't been heard from since ..."
...,...,...
13363,"the Wild West Town in Valley Center, San Diego...","The town in Valley Center, San Diego, has been..."
13364,photographer Graham Hewer captured the encount...,Photographer Graham Hewer captured the jaw-dro...
13365,two pigs were greeted on boat by herd of swimm...,The pigs swim through the crystal clear sea an...
13366,NBC decided not to continue with negotiations ...,Kate's sister is back in London following 'cri...


In [27]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [28]:
gen = final_df['Generated Text']
act = final_df['Actual Text']

In [29]:
#Model performance
from rouge import Rouge
rouge = Rouge()
rouge.get_scores(gen, act, avg=True, ignore_empty=True)

{'rouge-1': {'r': 0.3731740030441034,
  'p': 0.4093629337598407,
  'f': 0.38110656691358613},
 'rouge-2': {'r': 0.16535769825821314,
  'p': 0.18413828502196794,
  'f': 0.16889424237729783},
 'rouge-l': {'r': 0.3518101024082582,
  'p': 0.3860640357474485,
  'f': 0.35935929507107783}}

In [24]:
model.save_pretrained("/content/model output")
