# Import libraries

In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split

In [3]:
!pip install -qq transformers

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

In [5]:
pip install datasets



In [6]:
import datasets

In [7]:
% pip install sentencepiece



In [8]:
from torch.cuda.amp import autocast, GradScaler

In [9]:
torch.cuda.amp.autocast(enabled=True)

<torch.cuda.amp.autocast_mode.autocast at 0x7f92951c1610>

In [10]:
torch.backends.cudnn.deterministic = True

In [11]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [12]:
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-4a64efb2-43bb-e554-88e6-7c5f97e28764)


# Loading the data

In [13]:
# csv.field_size_limit()
# csv.field_size_limit(100000000)
# train=pd.read_csv("train_data.csv",engine='python', error_bad_lines=False)
# test=pd.read_csv("test_data.csv")
# val=pd.read_csv("val_data.csv")
# train=train.drop(columns=train.columns[0], axis=1)
# test=test.drop(columns=test.columns[0], axis=1)
# val=val.drop(columns=val.columns[0], axis=1)

In [14]:
train_df = datasets.load_dataset("xsum", split="train")
val_df = datasets.load_dataset("xsum", split="validation")
test_df = datasets.load_dataset("xsum", split="test")

Using custom data configuration default
Reusing dataset xsum (/root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)
Using custom data configuration default
Reusing dataset xsum (/root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)
Using custom data configuration default
Reusing dataset xsum (/root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)


In [15]:
train_df = train_df.remove_columns("id")
test_df=test_df.remove_columns("id")
val_df=val_df.remove_columns("id")

In [16]:
train_df[:1]

{'document': ['Recent reports have linked some France-based players with returns to Wales.\n"I\'ve always felt - and this is with my rugby hat on now; this is not region or WRU - I\'d rather spend that money on keeping players in Wales," said Davies.\nThe WRU provides £2m to the fund and £1.3m comes from the regions.\nFormer Wales and British and Irish Lions fly-half Davies became WRU chairman on Tuesday 21 October, succeeding deposed David Pickering following governing body elections.\nHe is now serving a notice period to leave his role as Newport Gwent Dragons chief executive after being voted on to the WRU board in September.\nDavies was among the leading figures among Dragons, Ospreys, Scarlets and Cardiff Blues officials who were embroiled in a protracted dispute with the WRU that ended in a £60m deal in August this year.\nIn the wake of that deal being done, Davies said the £3.3m should be spent on ensuring current Wales-based stars remain there.\nIn recent weeks, Racing Metro fl

In [17]:
train=pd.DataFrame(train_df)
val=pd.DataFrame(val_df)
test=pd.DataFrame(test_df)

In [18]:
train[:6]

Unnamed: 0,document,summary
0,Recent reports have linked some France-based p...,New Welsh Rugby Union chairman Gareth Davies b...
1,Army explosives experts were called out to dea...,A suspicious package left outside an Alliance ...
2,It has lost its previous triple-A rating from ...,The UK's international reputation for a strong...
3,The warning begins at 22:00 GMT on Saturday an...,The Met Office has issued a yellow weather war...
4,Tony Fisher's body was found by his son in Con...,Two more men have been charged with the murder...
5,"Lam, 28, joined the club in 2014 and, despite ...",Bristol flanker Jack Lam has signed a new two-...


In [19]:
len(train["summary"][1030])

150

In [20]:
train["summary"][1030]

'Double police killer Dale Cregan has been cleared of a final charge of attempted murder at the conclusion of a 18-week trial involving nine other men.'

# Pre-processing 

In [18]:
train["document"]=train["document"].str.strip()
train["summary"]= train["summary"].str.strip()
test["document"]=test["document"].str.strip()
test["summary"]= test["summary"].str.strip()
val["document"]=val["document"].str.strip()
val["summary"]= val["summary"].str.strip()

In [19]:
train=train.loc[train["summary"].str.len() >= 5]
test=test.loc[test["summary"].str.len() >= 5]
val=val.loc[val["summary"].str.len() >= 5]
train=train.loc[train["document"].str.len() >= 100]
test=test.loc[test["document"].str.len() >=40]
val=val.loc[val["document"].str.len() >= 40]

In [20]:
task_prefix= "summarize: "

In [21]:
train['document'] = task_prefix + train['document'] 
test['document'] = task_prefix + test['document']
val['document'] = task_prefix + val['document'] 

In [None]:
# replace /n and all / after model runs

In [None]:
# train["summary"][2083]

# T5

In [21]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [22]:
def tokenize_corpus(df, tokenizer, max_len):
    input_ids = []
    attention_masks = []
    max_len = max_len
    for doc in df:
        encoded_dict = tokenizer.encode_plus(
                            doc,  
                            add_special_tokens=True,  
                            max_length=max_len, 
                            truncation=True, 
                            pad_to_max_length=True, 
                            return_attention_mask=True,  
                            return_tensors='pt' 
                       )

        input_ids.append(encoded_dict['input_ids'])

        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

In [23]:
train_doc_input_ids, train_doc_attention_masks = tokenize_corpus(train['document'].values, tokenizer, 512)
test_doc_input_ids, test_doc_attention_masks = tokenize_corpus(test['document'].values, tokenizer, 512)
val_doc_input_ids, val_doc_attention_masks = tokenize_corpus(val['document'].values, tokenizer, 512)



In [24]:
train_summary_input_ids, train_summary_attention_masks = tokenize_corpus(train['summary'].values, tokenizer, 100)
test_summary_input_ids, test_summary_attention_masks = tokenize_corpus(test['summary'].values, tokenizer, 100)
val_summary_input_ids, val_summary_attention_masks = tokenize_corpus(val['summary'].values, tokenizer, 100)



In [25]:
train_df=TensorDataset(train_doc_input_ids, train_doc_attention_masks,train_summary_input_ids, train_summary_attention_masks)
test_df=TensorDataset(test_doc_input_ids, test_doc_attention_masks,test_summary_input_ids, test_summary_attention_masks)
val_df=TensorDataset(val_doc_input_ids, val_doc_attention_masks,val_summary_input_ids, val_summary_attention_masks)

In [26]:
train_dataloader = DataLoader(dataset=train_df,shuffle=True,batch_size=16)
valid_dataloader = DataLoader(dataset=val_df,shuffle=False,batch_size=16)
test_dataloader = DataLoader(dataset=test_df,shuffle=False,batch_size=16)

In [27]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [28]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [None]:
next(model.parameters()).is_cuda

True

In [29]:
optimizer = AdamW(model.parameters(),lr = 3e-4)

In [30]:
epochs = 2
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
def train(model, dataloader, optimizer):
    print("-----------------------------")
    print(' Epoch {:}'.format(epoch + 1))

    train_loss = 0

    model.train()

    for step, batch in enumerate(dataloader):
        optimizer.zero_grad()
        with autocast():
            outputs = model(input_ids=batch[0].cuda(), attention_mask=batch[1].cuda(), labels=batch[2].cuda(), decoder_attention_mask=batch[3].cuda())
            loss, prediction_scores = outputs[:2]
            train_loss += loss.item()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

    avg_loss = train_loss / len(dataloader)

    results.append({'Train Loss': avg_loss})

    print("-------------------")
    print("epoch   train loss")
    print(f"{epoch+1:5d}   {avg_loss:.5f}")

    return results


In [None]:
def validating(model, dataloader):
    print("----------------------")
    model.eval()
    val_loss = 0
    for batch in dataloader:
        with torch.no_grad():
            outputs = model(input_ids=batch[0].cuda(), attention_mask=batch[1].cuda(), labels=batch[2].cuda(), decoder_attention_mask=batch[3].cuda())
            loss, prediction_scores = outputs[:2]
            val_loss += loss.item()
    global avg_val_loss
    avg_val_loss = val_loss / len(dataloader)

    val_results.append({'validation loss': avg_val_loss})
    print("----------------------")
    print("epoch   val loss")
    print(f"{epoch+1:5d}   {avg_val_loss:.5f}")

    return val_results

In [31]:
def testing(model, dataloader):
    print("--------------------")
    model.eval()
    test_loss = 0
    preds1 = []
    given = []
    for step, batch in enumerate(dataloader):
        with torch.no_grad():
            outputs = model(input_ids=batch[0].cuda(), attention_mask=batch[1].cuda(), labels=batch[2].cuda(), decoder_attention_mask=batch[3].cuda())
            loss, prediction_scores = outputs[:2]
            test_loss += loss.item()

            batch_input_ids = batch[0].cuda()
            batch_input_mask = batch[1].cuda()
            batch_summary_ids = batch[2].cuda()

            generated_ids = model.generate(
                    input_ids=batch_input_ids,
                    attention_mask=batch_input_mask,
                    do_sample=True,
                    top_k=40,
                    top_p=0.9, 
                    max_length=200, 
                    num_beams=4, 
                    early_stopping=True, 
                    no_repeat_ngram_size=3,
                    num_return_sequences=1,
                    temperature=0.7
                    )

            preds2 = [tokenizer.decode(g_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g_id in generated_ids]
            target = [tokenizer.decode(b_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)for b_id in batch_summary_ids]
            preds1.extend(preds2)
            given.extend(target)

    avg_test_loss = test_loss / len(dataloader)
    test_results.append({'Test Loss': avg_test_loss})
    global df2
    df2_data = pd.DataFrame({'predicted summary': preds1, 'summary': given})
    df2 = df2.append(df2_data)

    return test_results

In [32]:
scaler = GradScaler()

In [33]:
results = []
val_results = []
best_valid_loss = float('inf')

In [None]:
for epoch in range(epochs):
    train(model, train_dataloader, optimizer)
    validating(model, valid_dataloader)
    if val_results[epoch]['validation loss'] < best_valid_loss:
        best_valid_loss = val_results[epoch]['validation loss']
        torch.save(model.state_dict(), 'tanvi-hw3.pt') 

-----------------------------
 Epoch 1
-------------------
epoch   train loss
    1   0.79752
----------------------
----------------------
epoch   val loss
    1   0.69653
-----------------------------
 Epoch 2
-------------------
epoch   train loss
    2   0.73631
----------------------
----------------------
epoch   val loss
    2   0.67863


In [34]:
df2 = pd.DataFrame({'predicted summary': [], 'summary': []})
test_results = []
model.load_state_dict(torch.load('tanvi-hw3.pt'))
testing(model, test_dataloader)

--------------------


  next_indices = next_tokens // vocab_size


[{'Test Loss': 0.70986153780153}]

In [57]:
df2["predicted summary"][866]

'A man has been charged with murder after the bodies of two men were found in Dundee.'

In [58]:
df2["summary"][866]

'A 37-year-old man has been charged in connection with the deaths of a man and woman at a Dundee flat.'

In [44]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [45]:
from rouge import Rouge 

In [46]:
rouge = Rouge()

In [48]:
rouge.get_scores( df2["predicted summary"].to_list(), df2["summary"].to_list(),avg=True)

{'rouge-1': {'f': 0.2826572409038569,
  'p': 0.32043419604278683,
  'r': 0.26188185145327736},
 'rouge-2': {'f': 0.08640420556106883,
  'p': 0.09795060114085956,
  'r': 0.08043462810621867},
 'rouge-l': {'f': 0.23019530243034508,
  'p': 0.2609580362292668,
  'r': 0.2133451160549097}}