# Fine-Tuning T5 for Summarization Task
This notebook contains the code for fine-tuning the T5 model on the LIAR-PLUS dataset. The final model is part of the Justification Generator module in the Fake News Detection Framework.

The second part of the notebook contains the evaluation of the two Veracity Explanation approaches: approach 1 (Text Summarization) and approach 2 (Text Generation).

In [None]:
import os
import torch
# Expose only the GPU with index "7" to this process (hard-coded for the training machine).
# NOTE(review): presumably this must be set before the first CUDA call to take effect;
# here it runs after `import torch` but before the CUDA query below — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
# First CUDA call of the notebook; as the last expression of the cell its value is displayed.
torch.cuda.current_device()

In [None]:
!pip3 install rouge-score

In [None]:
# Use this code when on GPU
# NOTE(review): an earlier cell already sets CUDA_VISIBLE_DEVICES and calls
# torch.cuda.current_device(); presumably these variables have to be set before
# that first CUDA call to have any effect — confirm the intended cell order.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=7

In [1]:
import numpy as np
import pandas as pd

In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import sklearn as sk
from rouge_score import rouge_scorer

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
#BartTokenizer, BartForConditionalGeneration

In [None]:
torch.cuda.empty_cache()

In [None]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions
class CustomDataset(Dataset):
    """Torch dataset that tokenizes (source, target) text pairs taken from a dataframe.

    The ``ctext`` column is encoded as the model input and the ``text`` column as
    the target sequence.

    Args:
        dataframe: frame exposing ``text`` and ``ctext`` columns.
        tokenizer: object providing ``batch_encode_plus`` that returns
            ``input_ids`` and ``attention_mask`` tensors.
        source_len: ``max_length`` used when encoding ``ctext``.
        summ_len: ``max_length`` used when encoding ``text``.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        """Return the number of rows (text pairs) in the dataset."""
        return len(self.text)

    def _encode(self, raw, max_length):
        """Collapse whitespace runs in ``raw`` and tokenize it, padded to ``max_length``."""
        cleaned = ' '.join(str(raw).split())
        return self.tokenizer.batch_encode_plus([cleaned], max_length=max_length, pad_to_max_length=True, return_tensors='pt')

    def __getitem__(self, index):
        """Return the tokenized pair stored at row position ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (a second reference to the target ids), all as
            ``torch.long`` tensors.
        """
        # Positional lookup: the DataLoader hands out positions 0..len-1, which only
        # coincide with index labels when the frame index has been reset.
        source = self._encode(self.ctext.iloc[index], self.source_len)
        target = self._encode(self.text.iloc[index], self.summ_len)

        # The tokenizer returns a batch of one; squeeze() drops that batch dimension.
        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
# Training function: puts the model in train mode, then iterates over the training loader
# and passes each batch to the defined network.

def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one pass over ``loader``, updating ``model``, and return the mean batch loss.

    Args:
        epoch: zero-based epoch number, used only in the progress message.
        tokenizer: provides ``pad_token_id`` for masking padded label positions.
        model: network called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``lm_labels``; its first output is the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``target_ids``, ``source_ids`` and
            ``source_mask`` tensors; must support ``len()``.
        optimizer: optimizer stepped once per batch.

    Returns:
        The sum of the per-batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: if ``loader`` is empty.

    NOTE(review): the labels are passed under the keyword ``lm_labels``;
    presumably this matches the installed transformers release — confirm when upgrading.
    """
    model.train()

    running_loss = 0.0
    for step, batch in enumerate(loader):
        target = batch['target_ids'].to(device, dtype=torch.long)
        # Shift by one position: the decoder sees tokens [0, n-1) and is
        # scored against tokens [1, n).
        decoder_input_ids = target[:, :-1].contiguous()
        lm_labels = target[:, 1:].clone().detach()
        # Padded label positions are set to -100
        # (presumably the index the model's loss ignores — confirm).
        lm_labels[target[:, 1:] == tokenizer.pad_token_id] = -100

        ids = batch['source_ids'].to(device, dtype=torch.long)
        mask = batch['source_mask'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, decoder_input_ids=decoder_input_ids, lm_labels=lm_labels)
        loss = outputs[0]
        running_loss += loss.item()

        # Progress message every 500 batches.
        if step % 500 == 0:
            print(f'Epoch: {epoch+1}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss = running_loss / len(loader)
    print(avg_loss)

    return avg_loss

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    """Generate text for every batch in ``loader`` and decode it next to the references.

    Args:
        epoch: accepted for symmetry with ``train``; not used.
        tokenizer: provides ``decode`` for turning token ids into text.
        model: network providing ``eval`` and ``generate``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``target_ids``, ``source_ids`` and
            ``source_mask`` tensors.

    Returns:
        ``(predictions, actuals)``: two lists of decoded strings in loader order.
    """
    model.eval()

    def _to_text(sequences):
        # Decode every id sequence of a batch into a cleaned-up string.
        return [tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True) for seq in sequences]

    predictions, actuals = [], []
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            attention = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            batch_predictions = _to_text(generated_ids)
            batch_references = _to_text(reference_ids)

            # Progress message every 100 batches.
            if batch_idx % 100 == 0:
                print(f'Completed {batch_idx}')

            predictions += batch_predictions
            actuals += batch_references
    return predictions, actuals

In [None]:
# Hyper-parameters and length limits used by the cells below.
TRAIN_BATCH_SIZE = 2    # batch size of the training DataLoader
VALID_BATCH_SIZE = 2    # batch size of the validation and test DataLoaders
TRAIN_EPOCHS = 30        # number of passes over the training loader
VAL_EPOCHS =  4          # loop count of the validation cell
LEARNING_RATE = 1e-4    # learning rate handed to the Adam optimizer
SEED = 42               # seed for torch, numpy and the train/validation split
MAX_LEN = 512           # max_length used when tokenizing the source text (ctext)
SUMMARY_LEN = 150       # max_length used when tokenizing the target text (text)

In [None]:
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

In [None]:
# Tokenizer for encoding the text ("t5-base" vocabulary, matching the model loaded below)
tokenizer = T5Tokenizer.from_pretrained("t5-base")

In [2]:
# Import the LIAR dataset (train + validation file, tab-separated).
df = pd.read_csv('Datasets/ds_liar_train_val.tsv', sep='\t', encoding='latin-1')
df.columns = ['label', 'text', 'ctext']
# The label column is not needed here; only the text pair is kept.
df = df.drop(columns='label')
# Every model input is prefixed with 'summarize: '.
df['ctext'] = 'summarize: ' + df['ctext']
df.head()

Unnamed: 0,text,ctext
0,when did the decline of coal start? it started...,"summarize: surovell said the decline of coal ""..."
1,"hillary clinton agrees with john mccain ""by vo...",summarize: obama said he would have voted agai...
2,health care reform legislation is likely to ma...,summarize: the release may have a point that m...
3,the economic turnaround started at the end of ...,"summarize: crist said that the economic ""turna..."
4,the chicago bears have had more starting quart...,summarize: but vos specifically used the word ...


In [None]:
# Train/validation split: a seeded random 80% of the rows is used for training,
# the remaining rows for validation. Both frames get a fresh 0..n-1 index.
train_size = 0.8
train_dataset = df.sample(frac=train_size, random_state=SEED)
val_dataset = df.loc[~df.index.isin(train_dataset.index)].reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

In [None]:
# Report the (rows, columns) shape of the full frame and of both splits.
print(f"FULL Dataset: {df.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"VAL Dataset: {val_dataset.shape}")

In [None]:
# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

In [None]:
# Keyword arguments for the DataLoaders: training batches are shuffled,
# validation batches keep the frame order.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

val_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
# Creation of Dataloaders for training and validation. These are used below in the training and validation stages of the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

In [None]:
# Defining the model. We are using the t5-base model with a language-model head for generating the summary.
# The model is then moved to the GPU when one is available; otherwise it stays on the CPU
# so the notebook still runs (slowly) on machines without CUDA.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [None]:
# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
print('Initiating Fine-Tuning for the model on LIAR-PLUS dataset')

In [None]:
# Fine-tune for TRAIN_EPOCHS passes; avg_loss keeps the mean loss of the last pass.
for epoch in range(TRAIN_EPOCHS):
    print(f'\n Epoch {epoch + 1} / {TRAIN_EPOCHS}')
    avg_loss = train(epoch, tokenizer, model, device, training_loader, optimizer)

In [None]:
prefix = 'TrainedModels/T5Summary_ds_weights'

In [None]:
# Save the model weights; the file name encodes the epoch count and learning rate.
checkpoint_path = f"{prefix}_{TRAIN_EPOCHS}_lr-{str(LEARNING_RATE).replace('-','')}.pt"
# Create the target folder first so a missing directory cannot discard the finished training run.
os.makedirs(os.path.dirname(checkpoint_path) or '.', exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

In [None]:
print("Training Loss: " + str(avg_loss))

In [None]:
# Validation loop and saving the resulting file with predictions and actuals in a dataframe.
# Saving the dataframe as predictions.csv
# NOTE(review): every pass writes to the same 'predictions.csv' and `validate` does not
# use its `epoch` argument, so the VAL_EPOCHS passes look redundant — confirm whether
# more than one pass is needed.
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv('predictions.csv')
    print('Output Files generated for review')

### Summarize Articles Text using the Fine-Tuned T5 Model

In [None]:
# Load previously saved fine-tuned weights into `model` before generating summaries.
# NOTE(review): the save cell above writes a file whose name starts with
# 'TrainedModels/T5Summary_ds_weights', while this path starts with
# 'TrainedModels/T5NewsSummary_ds_weights' — confirm which checkpoint is meant here.
path = "TrainedModels/T5NewsSummary_ds_weights_30_lr-0.0001.pt"
model.load_state_dict(torch.load(path))

In [None]:
model.state_dict()

In [None]:
#Load the df_test
# which is the df_crawled articles in our pipeline
column_names = ["Keyword", "Crawled Article Title", "Crawled Article Text", "Crawled Article Link", "Crawled Article Summary", "Crawled Article Keywords"]
df_test = pd.read_csv('Datasets/crawled_articles.tsv',sep='\t')
df_test.columns=column_names
df_test.head()

In [None]:
# Keep only the title and body of each crawled article. The explicit copy makes the
# column changes below act on an independent frame instead of a selection of df_test
# (which would raise SettingWithCopyWarning).
df = df_test[['Crawled Article Title','Crawled Article Text']].copy()
df.columns = ['text','ctext']
# Every model input is prefixed with 'summarize: '.
df['ctext'] = 'summarize: ' + df['ctext']
df.head()

In [None]:
# Drop the articles whose title is the placeholder 'No Title' and renumber the rows.
# reset_index returns a new frame, so the filtered slice is never modified in place.
df_test = df[df.text != 'No Title'].reset_index(drop=True)
df_test

In [None]:
#Create test dataset
test_dataset=df_test.reset_index(drop=True)

In [None]:
test_dataset

In [None]:
#Create Test Set
test_set = CustomDataset(test_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

In [None]:
#create test dataloader
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

In [None]:
test_loader = DataLoader(test_set, **test_params)

In [None]:
test_loader

In [None]:
def generate(epoch, tokenizer, model, device, loader):
    """Generate text for every batch in ``loader`` and decode it next to the references.

    Args:
        epoch: accepted for call compatibility; not used.
        tokenizer: provides ``decode`` for turning token ids into text.
        model: network providing ``eval`` and ``generate``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``target_ids``, ``source_ids`` and
            ``source_mask`` tensors.

    Returns:
        ``(predictions, actuals)``: two lists of decoded strings in loader order.
    """
    decode_options = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)

    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            ids = batch['source_ids'].to(device, dtype=torch.long)
            mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            preds = [tokenizer.decode(g, **decode_options) for g in generated_ids]
            target = [tokenizer.decode(t, **decode_options) for t in reference_ids]

            # Progress message every 100 batches.
            if batch_idx % 100 == 0:
                print(f'Completed {batch_idx}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
TEST_EPOCHS = 1

In [None]:
summaries_df = {}
print('Now generating summaries on our fine tuned model for the test dataset and saving it in a dataframe')
for epoch in range(TEST_EPOCHS):
    print("Generating Summaries")
    generated_text, actual_text = generate(epoch, tokenizer, model, device, test_loader)
    summaries_df = pd.DataFrame({'Generated Text':generated_text,'Actual Text':actual_text})
    #final_df.to_csv('predictions.csv')
    print("Summaries generated")

In [None]:
summaries_df

In [None]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL', 'rougeLsum'], use_stemmer=True)

In [None]:
# Print the ROUGE-1, ROUGE-L and ROUGE-Lsum scores of every (reference, generated) pair.
# The two columns are walked in parallel, so the result does not depend on index labels.
for actual, generated in zip(summaries_df['Actual Text'], summaries_df['Generated Text']):
    score = scorer.score(str(actual), str(generated))
    print(score['rouge1'])
    print(score['rougeL'])
    print(score['rougeLsum'])
    print("\n")

# Generate Summaries with pre-trained T5

In [None]:
# Load the original (not fine-tuned) t5-base weights as the comparison baseline.
pre_trained_model = T5ForConditionalGeneration.from_pretrained("t5-base")
device = torch.device('cuda')
# Move the freshly loaded baseline itself to the device. Re-binding the name to
# `model.to(device)` would make the baseline an alias of the fine-tuned model.
pre_trained_model = pre_trained_model.to(device)

In [None]:
psummaries_df = {}
print('Now generating summaries on our pre tuned model for the test dataset and saving it in a dataframe')
for epoch in range(TEST_EPOCHS):
    print("Generating Summaries")
    generated_text, actual_text = generate(epoch, tokenizer, pre_trained_model, device, test_loader)
    psummaries_df = pd.DataFrame({'Generated Text':generated_text,'Actual Text':actual_text})
    #final_df.to_csv('predictions.csv')
    print("Summaries generated")

In [None]:
# Print the ROUGE-1, ROUGE-L and ROUGE-Lsum scores of every pair produced in the pre-trained run.
for actual, generated in zip(psummaries_df['Actual Text'], psummaries_df['Generated Text']):
    pscore = scorer.score(str(actual), str(generated))
    for metric in ('rouge1', 'rougeL', 'rougeLsum'):
        print(pscore[metric])
    print("\n")

In [None]:
psummaries_df['Generated Text'][0]

In [None]:
summaries_df['Generated Text'][0]

In [None]:
# NOTE(review): `final_df1` is not defined anywhere in the visible notebook, so this and
# the following cells fail on a fresh kernel. It is bound here to the fine-tuned summaries
# of the crawled articles on the assumption that these are the intended pipeline output — confirm.
final_df1 = summaries_df
final_df1.to_csv('PipelineOutputs/T5_Summarization_output.csv')

In [None]:
final_df1

In [None]:
final_df1['Generated Text'][2]

In [None]:
final_df1['Actual Text'][2]

# Summarization Approach Evaluation

#### Dataset is LIAR-PLUS test set (golden dataset)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import sklearn as sk
from rouge_score import rouge_scorer

In [None]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL', 'rougeLsum'], use_stemmer=True)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

In [None]:
import os
import torch
# Expose the GPUs with indices "2" and "3" to this process.
# NOTE(review): the first part of the notebook sets this variable to "7" and already
# queries CUDA; presumably the new value only takes effect in a fresh kernel — confirm
# that the evaluation part is meant to run after a kernel restart.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
torch.cuda.current_device()
device = torch.device('cuda')

In [None]:
TEST_EPOCHS = 1
TRAIN_BATCH_SIZE = 2    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 30        # number of epochs to train (default: 10)
VAL_EPOCHS =  4
LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
SEED = 42               # random seed (default: 42)
MAX_LEN = 512
SUMMARY_LEN = 150 

In [None]:
# Import the LIAR test file (tab-separated).
df = pd.read_csv('Datasets/ds_liar_test.tsv', sep='\t', encoding='latin-1')
df.columns = ['label', 'text', 'ctext']
# The label column is not needed here; only the text pair is kept.
df = df.drop(columns='label')
# Every model input is prefixed with 'summarize: '.
df['ctext'] = 'summarize: ' + df['ctext']
df.head()

In [None]:
len(df.ctext)

In [None]:
test_dataset=df.reset_index(drop=True)
print(len(test_dataset))

## Dataset preparation and generate functions

In [None]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):
    """Torch dataset that tokenizes (source, target) text pairs taken from a dataframe.

    The ``ctext`` column is encoded as the model input and the ``text`` column as
    the target sequence.

    Args:
        dataframe: frame exposing ``text`` and ``ctext`` columns.
        tokenizer: object providing ``batch_encode_plus`` that returns
            ``input_ids`` and ``attention_mask`` tensors.
        source_len: ``max_length`` used when encoding ``ctext``.
        summ_len: ``max_length`` used when encoding ``text``.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        """Return the number of rows (text pairs) in the dataset."""
        return len(self.text)

    def _encode(self, raw, max_length):
        """Collapse whitespace runs in ``raw`` and tokenize it, padded to ``max_length``."""
        cleaned = ' '.join(str(raw).split())
        return self.tokenizer.batch_encode_plus([cleaned], max_length=max_length, pad_to_max_length=True, return_tensors='pt')

    def __getitem__(self, index):
        """Return the tokenized pair stored at row position ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (a second reference to the target ids), all as
            ``torch.long`` tensors.
        """
        # Positional lookup: the DataLoader hands out positions 0..len-1, which only
        # coincide with index labels when the frame index has been reset.
        source = self._encode(self.ctext.iloc[index], self.source_len)
        target = self._encode(self.text.iloc[index], self.summ_len)

        # The tokenizer returns a batch of one; squeeze() drops that batch dimension.
        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
#Create Test Set
test_set = CustomDataset(test_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

#create test dataloader
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

test_loader = DataLoader(test_set, **test_params)

In [None]:
def generate(epoch, tokenizer, model, device, loader):
    """Generate text for every batch in ``loader`` and decode it next to the references.

    Args:
        epoch: accepted for call compatibility; not used.
        tokenizer: provides ``decode`` for turning token ids into text.
        model: network providing ``eval`` and ``generate``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``target_ids``, ``source_ids`` and
            ``source_mask`` tensors.

    Returns:
        ``(predictions, actuals)``: two lists of decoded strings in loader order.
    """
    decode_options = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)

    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            ids = batch['source_ids'].to(device, dtype=torch.long)
            mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            preds = [tokenizer.decode(g, **decode_options) for g in generated_ids]
            target = [tokenizer.decode(t, **decode_options) for t in reference_ids]

            # Progress message every 100 batches.
            if batch_idx % 100 == 0:
                print(f'Completed {batch_idx}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
def get_rouge_score(summaries_df):
    """Score every (reference, generated) pair and collect the results in a dataframe.

    Uses the module-level ``scorer``. The rows are gathered in a list and the frame
    is built once at the end: growing a frame with ``DataFrame.append`` copies it on
    every iteration, and that method no longer exists in pandas 2.x.

    Args:
        summaries_df: frame with ``'Actual Text'`` and ``'Generated Text'`` columns.

    Returns:
        DataFrame with one row per pair and the columns ``Rouge1``, ``RougeL``
        and ``RougeLSum`` (also present when the input has no rows).
    """
    columns = ['Rouge1', 'RougeL', 'RougeLSum']
    records = []
    # Walk both columns in parallel so the result does not depend on index labels.
    for actual, generated in zip(summaries_df['Actual Text'], summaries_df['Generated Text']):
        rscore = scorer.score(str(actual), str(generated))
        # Element 2 of each score tuple is stored
        # (presumably the F-measure of rouge_score's Score tuple — confirm).
        records.append({
            'Rouge1': rscore['rouge1'][2],
            'RougeL': rscore['rougeL'][2],
            'RougeLSum': rscore['rougeLsum'][2],
        })
    return pd.DataFrame(records, columns=columns)

## Pre-trained T5 Model

In [None]:
pre_trained_T5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
device = torch.device('cuda')
pre_trained_T5_model = pre_trained_T5_model.to(device)

In [None]:
torch.cuda.empty_cache()

In [None]:
pretrainedT5_liar_summaries_df = {}
print('Now generating summaries on our pretrained model for the test dataset and saving it in a dataframe')
for epoch in range(TEST_EPOCHS):
    print("Generating Summaries")
    generated_text, actual_text = generate(epoch, tokenizer, pre_trained_T5_model, device, test_loader)
    pretrainedT5_liar_summaries_df = pd.DataFrame({'Generated Text':generated_text,'Actual Text':actual_text})
    #final_df.to_csv('predictions.csv')
    print("Summaries generated")

In [None]:
print(len(pretrainedT5_liar_summaries_df['Actual Text']))

In [None]:
pretrainedT5_liar_summaries_df['Actual Text'][429]

In [None]:
pretrainedT5_liar_summaries_df['Generated Text'][429]

In [None]:
pretrainedT5_RougeScore_df = get_rouge_score(pretrainedT5_liar_summaries_df, )

In [None]:
pretrainedT5_RougeScore_df

In [None]:
pretrainedT5_RougeScore_df['Rouge1'].nlargest(5)

In [None]:
pretrainedT5_RougeScore_df['RougeL'].nlargest(5)

In [None]:
pretrainedT5_RougeScore_df['RougeLSum'].nlargest(5)

## Fine-Tuned T5 Model

In [None]:
fine_tuned_T5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
fine_tuned_T5_model = fine_tuned_T5_model.to(device)

In [None]:
torch.cuda.empty_cache()

In [None]:
# Load the fine-tuned T5 weights.
# map_location places the stored tensors on the device selected for this session,
# whichever device they were saved from.
# NOTE(review): the training part saves under a name starting with
# 'TrainedModels/T5Summary_ds_weights', while this path starts with
# 'TrainedModels/T5NewsSummary_ds_weights' — confirm which checkpoint is meant here.
path = "TrainedModels/T5NewsSummary_ds_weights_30_lr-0.0001.pt"
fine_tuned_T5_model.load_state_dict(torch.load(path, map_location=device))

In [None]:
torch.cuda.empty_cache()

In [None]:
finetunedT5_liar_summaries_df = {}
print('FineTuned T5 Model')
for epoch in range(TEST_EPOCHS):
    print("Generating Summaries")
    generated_text, actual_text = generate(epoch, tokenizer, fine_tuned_T5_model, device, test_loader)
    finetunedT5_liar_summaries_df = pd.DataFrame({'Generated Text':generated_text,'Actual Text':actual_text})
    #final_df.to_csv('predictions.csv')
    print("Summaries generated")

In [None]:
print(len(finetunedT5_liar_summaries_df['Actual Text']))

In [None]:
finetunedT5_RougeScore_df = get_rouge_score(finetunedT5_liar_summaries_df)

In [None]:
finetunedT5_RougeScore_df

In [None]:
finetunedT5_RougeScore_df['Rouge1'].nlargest(25)

In [None]:
finetunedT5_RougeScore_df['RougeL'].nlargest(25)

In [None]:
finetunedT5_RougeScore_df['RougeLSum'].nlargest(25)

In [None]:
finetunedT5_liar_summaries_df['Actual Text'][605]

In [None]:
finetunedT5_liar_summaries_df['Generated Text'][605]

## Pre-Trained GPT2 Model

In [None]:
from transformers import pipeline, set_seed

In [None]:
# text Generate using pre-trained GPT2
generator = pipeline('text-generation', model='gpt2')
set_seed(42)

In [None]:
InputClaimFromUser = df.text[429] #text from LIAR

In [None]:
InputClaimFromUser

In [None]:
generated_justification = generator(InputClaimFromUser, max_length=150, num_return_sequences=1)

In [None]:
generated_justification[0]['generated_text']

In [None]:
generated_text = generated_justification[0]['generated_text']
actual_text = df.ctext[429]  # ctext from LIAR
# df.ctext was prefixed with 'summarize: ' when the test file was loaded; that
# prefix is not part of the reference text and would distort the ROUGE score.
if isinstance(actual_text, str) and actual_text.startswith('summarize: '):
    actual_text = actual_text[len('summarize: '):]

In [None]:
actual_text

In [None]:
pretrainedGPT2_RougeScore_df = scorer.score(str(actual_text), str(generated_text))

In [None]:
pretrainedGPT2_RougeScore_df

## Fine-Tuned GPT2 Model

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

In [None]:
output_dir = "TrainedModels/GPT/"

In [None]:
#Load a trained model and vocabulary that you have fine-tuned
GPT_finetuned_model = GPT2LMHeadModel.from_pretrained(output_dir)
GPT_finetuned_tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
GPT_finetuned_model.to(device)

In [None]:
GPT_finetuned_model.eval()

In [None]:
prompt = "<|startoftext|>"

In [None]:
InputClaimFromUser = df.text[429]

In [None]:
generated = torch.tensor(GPT_finetuned_tokenizer.encode(InputClaimFromUser)).unsqueeze(0)
generated = generated.to(device)

In [None]:
sample_outputs = GPT_finetuned_model.generate(
                                generated, 
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,   
                                top_k=50, 
                                max_length = 300,
                                top_p=0.95, 
                                num_return_sequences=3
                                )

In [None]:
actual_text = df.ctext[429]  # ctext from LIAR
# df.ctext was prefixed with 'summarize: ' when the test file was loaded; that
# prefix is not part of the reference text and would distort the ROUGE score.
if isinstance(actual_text, str) and actual_text.startswith('summarize: '):
    actual_text = actual_text[len('summarize: '):]

In [None]:
# Decode each sampled sequence once, print it, then print its ROUGE score against the reference.
for i, sample_output in enumerate(sample_outputs):
    generated_text = GPT_finetuned_tokenizer.decode(sample_output, skip_special_tokens=True)
    print("{}: {}\n\n".format(i, generated_text))
    score = scorer.score(str(actual_text), str(generated_text))
    print(str(i) + ": Rouge Score: " + str(score) + "\n" )