In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from rouge import Rouge
import torch

Preprocess the data

In [2]:
# Read the complete WikiHow dump and report how many rows it holds.
wikihowAll_data = pd.read_csv("wikihowAll.csv")
print("The original dataset's size is", len(wikihowAll_data))

The original dataset's size is 215365


In [None]:
# Remove every row that is missing one of the three columns used downstream.
wikihowAll_drop_NA = (
    wikihowAll_data
    .dropna(subset=['headline', 'title', 'text'])
    .reset_index(drop=True)
)

# Remove duplicate rows. The index reset has to be applied to the de-duplicated
# frame itself; resetting `wikihowAll_drop_NA` here would throw the
# de-duplication away and keep every duplicate.
wikihowAll_data_clean = wikihowAll_drop_NA.drop_duplicates().reset_index(drop=True)

# Remove short articles: keep only texts with at least three sentences.
sentence_counts = wikihowAll_data_clean['text'].map(
    lambda article: len(nltk.sent_tokenize(article))
)
wikihowAll_data_clean = wikihowAll_data_clean[sentence_counts >= 3]

print("After clean, the dataset's size is", wikihowAll_data_clean.shape[0])

In [None]:
# Positional (unshuffled) 70% / 15% / 15% split into train, dev and test parts.
num_rows = len(wikihowAll_data_clean)

train_rows = int(num_rows * 0.70)
dev_rows = int(num_rows * 0.15)
test_rows = num_rows - (train_rows + dev_rows)  # everything left after train and dev

# Each part is a contiguous row range; its index is renumbered from zero.
train_data = wikihowAll_data_clean.iloc[:train_rows].reset_index(drop=True)
dev_data = wikihowAll_data_clean.iloc[train_rows:train_rows + dev_rows].reset_index(drop=True)
test_data = wikihowAll_data_clean.iloc[train_rows + dev_rows:].reset_index(drop=True)

In [None]:
# Write each partition to its own CSV file, leaving the row index out.
for split_name, split_frame in (("train", train_data), ("dev", dev_data), ("test", test_data)):
    split_frame.to_csv(f"wikihowAll_{split_name}.csv", index=False)

Load the dataset

In [2]:
from datasets import load_dataset

# Map every split name to the list of CSV files that back it.
data_files = {split: [f'wikihowAll_{split}.csv'] for split in ('train', 'dev', 'test')}

# Load all three splits through the generic 'csv' loader.
wikihowAll_dataset = load_dataset('csv', data_files=data_files)

Found cached dataset csv (/home/zexian/.cache/huggingface/datasets/csv/default-3865e2b4c57b20b0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Pull the three splits out of the loaded dataset dictionary in one step.
train_dataset, dev_dataset, test_dataset = (
    wikihowAll_dataset[split] for split in ('train', 'dev', 'test')
)

In [4]:
# Display the test split's summary (its feature names and row count).
test_dataset

Dataset({
    features: ['headline', 'title', 'text'],
    num_rows: 29849
})

In [5]:
# Use the CUDA device when one is available, otherwise fall back to the CPU,
# and report which of the two was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

Using GPU: NVIDIA GeForce RTX 3050 Laptop GPU


Load the pre-trained model and test it by using pipeline

In [12]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline

# Pretrained t5-small tokenizer and model, wrapped in a summarization pipeline.
# Note: despite its name, `summarizer_pegasus` is backed by the T5 model loaded here.
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="t5-small")
summarizer_pegasus = pipeline(task="summarization", model=model, tokenizer=tokenizer)

In [37]:
# Article body of the first test example, used as a quick smoke-test input.
text = test_dataset[0]['text']

In [41]:
# Summarize the sample article with the pipeline; min_length / max_length bound
# the length of the generated summary.
# NOTE(review): the summarization pipeline may already prepend the prefix stored
# in the model config, in which case 'summarize: ' is applied twice here —
# confirm against the installed transformers version.
test_summarize = summarizer_pegasus(f'summarize: {text}', min_length=16, max_length=150)

In [42]:
# Display the generated summary text of the single pipeline result.
test_summarize[0]['summary_text']

'you will need to choose a program which allows you to create "tables" some common computer programs which serve this function great are Microsoft word, Print Shop, and google docs . Depending on how large/small you want your bingo card to be will determine how you shift the table .'

In [43]:
# ROUGE of the pipeline summary (hypothesis) against the reference headline
# of the same test article.
rouge = Rouge()
pipeline_scores = rouge.get_scores(
    hyps=test_summarize[0]['summary_text'],
    refs=test_dataset[0]['headline'],
    avg=True,
)
pipeline_scores

{'rouge-1': {'f': 0.17777777280000015,
  'p': 0.16666666666666666,
  'r': 0.19047619047619047},
 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
 'rouge-l': {'f': 0.2058823480968859, 'p': 0.175, 'r': 0.25}}

Load the pretrained model

In [11]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AdamW, get_linear_schedule_with_warmup

# Fresh t5-small tokenizer and model for fine-tuning, plus the seq2seq collator
# that batches examples for that model.
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="t5-small")
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

Convert the dataset to inputs and labels

In [7]:
def tokenize_t5(examples):
  """Tokenize a batch of articles and headlines for T5 fine-tuning.

  Args:
    examples: batch mapping with a 'text' list (articles) and a 'headline'
      list (reference summaries), as supplied by `Dataset.map(batched=True)`.

  Returns:
    The tokenizer output for the prefixed articles (padded/truncated to 512
    tokens) with an extra "labels" entry holding the headline token ids
    (padded/truncated to 150 tokens, padding positions set to -100).
  """
  inputs = [f'summarize: {text}' for text in examples['text']]
  labels = examples["headline"]
  model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
  labels = tokenizer(labels, max_length=150, truncation=True, padding="max_length", return_tensors="pt").input_ids
  # The loss ignores only label id -100. Left as pad_token_id, the padding of
  # every headline would be scored as if it were real target text.
  labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
  model_inputs["labels"] = labels
  return model_inputs

train_dataset = train_dataset.map(tokenize_t5, batched=True)
dev_dataset = dev_dataset.map(tokenize_t5, batched=True)

Loading cached processed dataset at /home/zexian/.cache/huggingface/datasets/csv/default-3865e2b4c57b20b0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-bf12cb378d5425a1.arrow
Loading cached processed dataset at /home/zexian/.cache/huggingface/datasets/csv/default-3865e2b4c57b20b0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-a4b790ca3c6d4497.arrow


In [None]:
# Ablation: measure how the model performs without the prompt prefix.
# Result of that test: without the prefix, T5 can't generate a valid summary.
def tokenize_t5_without_prefix(examples):
  """Tokenize a batch like `tokenize_t5`, but without the 'summarize: ' prefix.

  Args:
    examples: batch mapping with a 'text' list (articles) and a 'headline'
      list (reference summaries), as supplied by `Dataset.map(batched=True)`.

  Returns:
    The tokenizer output for the raw articles (padded/truncated to 512 tokens)
    with an extra "labels" entry holding the headline token ids
    (padded/truncated to 150 tokens, padding positions set to -100).
  """
  inputs = examples['text']
  labels = examples["headline"]
  model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
  labels = tokenizer(labels, max_length=150, truncation=True, padding="max_length", return_tensors="pt").input_ids
  # The loss ignores only label id -100. Left as pad_token_id, the padding of
  # every headline would be scored as if it were real target text.
  labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
  model_inputs["labels"] = labels
  return model_inputs

train_dataset_without_prefix = train_dataset.map(tokenize_t5_without_prefix, batched=True)
dev_dataset_without_prefix = dev_dataset.map(tokenize_t5_without_prefix, batched=True)

In [16]:
# Sanity check: number of input token ids stored for the first training example.
len(train_dataset[0]['input_ids'])

512

Define the optimizer and scheduler

In [9]:
def get_optimizer_and_scheduler(model, training_args, num_train_examples=None, learning_rate=1e-5, weight_decay=0.01):
    """Build the AdamW optimizer and linear warmup/decay schedule for fine-tuning.

    Args:
        model: model whose named parameters are optimized.
        training_args: training arguments; `max_steps`, `train_batch_size`,
            `gradient_accumulation_steps`, `num_train_epochs` and
            `warmup_ratio` are read from it.
        num_train_examples: number of training examples used to derive the
            schedule length when `max_steps` is not set. Defaults to
            `len(train_dataset)` of the notebook-level training split.
        learning_rate: peak learning rate handed to AdamW.
        weight_decay: decay applied to every parameter except biases and
            layer-norm weights.

    Returns:
        Tuple `(optimizer, scheduler)`.
    """
    import math

    # T5 names its normalization layers `layer_norm`, so the BERT-style
    # "LayerNorm.weight" pattern alone would never exempt them from decay.
    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
    decayed, not_decayed = [], []
    for name, param in model.named_parameters():
        if any(pattern in name for pattern in no_decay):
            not_decayed.append(param)
        else:
            decayed.append(param)
    optimizer_grouped_parameters = [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": not_decayed, "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

    if training_args.max_steps > 0:
        num_training_steps = training_args.max_steps
    else:
        if num_train_examples is None:
            num_train_examples = len(train_dataset)
        # One scheduler step happens per optimizer update, i.e. once every
        # `gradient_accumulation_steps` batches of `train_batch_size` examples.
        # Dividing by the accumulation steps alone overstates the schedule
        # length by the batch size, so the learning rate never fully decays.
        batches_per_epoch = math.ceil(num_train_examples / training_args.train_batch_size)
        updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
        num_training_steps = math.ceil(updates_per_epoch * training_args.num_train_epochs)
    num_warmup_steps = int(num_training_steps * training_args.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    return optimizer, scheduler


Set the training args and trainer

In [13]:
# Fine-tuning configuration: two epochs, evaluated and checkpointed once per epoch.
training_args = Seq2SeqTrainingArguments(
    # output locations and reporting
    output_dir="./output",
    logging_dir="./logs",
    report_to="none",
    logging_steps=5,
    # schedule and checkpointing
    num_train_epochs=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # batching: 4 examples per device step, accumulated over 8 steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    # optimization
    weight_decay=0.01,
    fp16=False,
)

# Custom optimizer / LR schedule, handed to the trainer instead of its defaults.
optimizer, scheduler = get_optimizer_and_scheduler(model, training_args)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    optimizers=(optimizer, scheduler),
)



Train the model

In [14]:
# Run fine-tuning with the trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss


Load in the fine-tuned model

In [14]:
# Directory holding the fine-tuned checkpoint to evaluate.
# NOTE(review): the training arguments write to "./output" — confirm that
# "./checkpoint-2" is where the fine-tuned weights actually live.
trained_model_name = "./checkpoint-2"
trained_tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path=trained_model_name)
trained_model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path=trained_model_name)

In [None]:
# Generate summaries for the whole test split with the fine-tuned model.
batch_size = 12
trained_summaries = []

# Fetch the text column once instead of on every loop iteration.
test_texts = test_dataset['text']

# Run generation on the device selected earlier (GPU when available)
# instead of silently staying on the CPU.
trained_model = trained_model.to(device)
trained_model.eval()

for i in range(0, len(test_texts), batch_size):

    texts = [f'summarize: {text}' for text in test_texts[i:i + batch_size]]
    inputs = trained_tokenizer(texts, return_tensors="pt", max_length=512, truncation=True, padding="max_length").to(device)
    summary_ids = trained_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=16,
        num_beams=4,
        early_stopping=True,
    )

    summary_texts = trained_tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    trained_summaries.extend(summary_texts)


In [None]:
# Average ROUGE of the fine-tuned model's summaries against the test headlines.
rouge = Rouge()
trained_metric_scores = rouge.get_scores(
    hyps=trained_summaries,
    refs=test_dataset['headline'],
    avg=True,
)
trained_metric_scores

Load in the original T5 model to compare with the fine-tuned model

In [29]:
# Untouched pretrained t5-small, used as the baseline for the comparison.
original_tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")
original_model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Generate baseline summaries for the whole test split with the original model.
batch_size = 12
summaries = []

# Fetch the text column once instead of on every loop iteration.
test_texts = test_dataset['text']

# Run generation on the device selected earlier (GPU when available).
original_model = original_model.to(device)
original_model.eval()

for i in range(0, len(test_texts), batch_size):

    texts = [f'summarize: {text}' for text in test_texts[i:i + batch_size]]
    # Truncate at 512 tokens, the same limit the fine-tuned model is evaluated
    # with, so both models see identical inputs in the comparison.
    inputs = original_tokenizer(texts, return_tensors="pt", max_length=512, truncation=True, padding="max_length").to(device)

    summary_ids = original_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=16,
        num_beams=4,
        early_stopping=True,
    )

    summary_texts = original_tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    summaries.extend(summary_texts)

In [None]:
# Average ROUGE of the baseline model's summaries against the test headlines.
rouge = Rouge()
original_scores = rouge.get_scores(
    hyps=summaries,
    refs=test_dataset['headline'],
    avg=True,
)
original_scores

Combining TextRank with the T5 model: I tried this approach, but it did not give better performance. It is explained in the "Other Things We Tried" section of the final report.

In [None]:
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import numpy as np

# English stop words, kept in a set so the per-word `not in stop_words`
# membership test is a hash lookup rather than a scan of the whole list.
stop_words = set(stopwords.words('english'))

In [None]:
# Build the word -> vector lookup from the GloVe text file: each line holds a
# token followed by its vector components.
glove_file = "glove.6B.50d.txt"
word_embeddings = {}
with open(glove_file, mode='r', encoding='utf-8') as glove_handle:
    for raw_line in glove_handle:
        fields = raw_line.strip().split()
        token, components = fields[0], fields[1:]
        word_embeddings[token] = np.asarray(components).astype("float")

In [None]:
def TextRank_summary(text, num_sentence, embedding_dim=50):
    """Build an extractive summary of `text` from its top TextRank sentences.

    Each sentence is reduced to lower-case alphabetic words without stop words
    and represented by the mean of its word embeddings. Sentences are then
    ranked with PageRank over their pairwise cosine similarities.

    Args:
        text: article to summarize.
        num_sentence: maximum number of sentences to keep.
        embedding_dim: dimensionality of the vectors in `word_embeddings`.

    Returns:
        The selected sentences joined by single spaces, highest-ranked first.
        All sentences are returned when the text has fewer than
        `num_sentence`; an empty string when the text has no sentences.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return ""

    cleaned_tokens = []
    for sentence in sentences:
        # remove punctuation and numbers, then set all letters to lower case
        clean_sentence = re.sub('[^a-zA-Z]', ' ', sentence).lower()
        # remove the stop words
        cleaned_tokens.append(
            [word for word in nltk.word_tokenize(clean_sentence) if word not in stop_words]
        )

    # One vector per sentence: mean of its word vectors. Unknown words and
    # sentences left without any word contribute a zero vector.
    zero_vector = np.zeros((embedding_dim,))
    sentence_vectors = np.array([
        np.mean([word_embeddings.get(word, zero_vector) for word in tokens], axis=0)
        if tokens else zero_vector
        for tokens in cleaned_tokens
    ])

    # Full similarity matrix in a single call; self-similarity is zeroed so the
    # graph has no self-loops.
    similarity_mat = cosine_similarity(sentence_vectors)
    np.fill_diagonal(similarity_mat, 0.0)

    sentence_graph = nx.from_numpy_array(similarity_mat)
    sentence_scores = nx.pagerank(sentence_graph, max_iter=500, tol=1e-5, nstart={node: 1.0 for node in sentence_graph.nodes()})

    ranked_sentences = sorted(
        ((sentence_scores[i], sentence) for i, sentence in enumerate(sentences)),
        reverse=True,
    )

    # Keep as many sentences as requested, or all of them when the text is
    # shorter than that; no sentence is dropped just because the text is short.
    keep = max(0, min(num_sentence, len(ranked_sentences)))
    summary = " ".join(sentence for _, sentence in ranked_sentences[:keep])

    return summary

In [None]:
# Number of top-ranked sentences TextRank_summary keeps for each article.
# Author's rationale: the average sentence length in the WikiHow dataset is 100.68.
# NOTE(review): the unit of 100.68 is not stated — confirm how it leads to 5.
num_sentence = 5

In [None]:
def tokenize_t5_textRank(examples):
  """Tokenize a batch whose articles are first condensed with TextRank.

  Args:
    examples: batch mapping with a 'text' list (articles) and a 'headline'
      list (reference summaries), as supplied by `Dataset.map(batched=True)`.

  Returns:
    The tokenizer output for the prefixed TextRank extracts (padded/truncated
    to 512 tokens) with an extra "labels" entry holding the headline token ids
    (padded/truncated to 150 tokens, padding positions set to -100).
  """
  summaries = [TextRank_summary(text, num_sentence) for text in examples['text']]
  inputs = [f'summarize: {text}' for text in summaries]
  labels = examples["headline"]
  model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
  labels = tokenizer(labels, max_length=150, truncation=True, padding="max_length", return_tensors="pt").input_ids
  # The loss ignores only label id -100. Left as pad_token_id, the padding of
  # every headline would be scored as if it were real target text.
  labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
  model_inputs["labels"] = labels
  return model_inputs

train_dataset_textRank = train_dataset.map(tokenize_t5_textRank, batched=True)
dev_dataset_textRank = dev_dataset.map(tokenize_t5_textRank, batched=True)

In [None]:
# Start the TextRank experiment from fresh pretrained weights with its own
# optimizer and LR schedule. Reusing `model`, `optimizer` and `scheduler` from
# the previous run would continue training the already fine-tuned model with a
# schedule that run has already stepped through.
model = T5ForConditionalGeneration.from_pretrained("t5-small")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
optimizer, scheduler = get_optimizer_and_scheduler(model, training_args)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset_textRank,
    eval_dataset=dev_dataset_textRank,
    data_collator=data_collator,
    optimizers = (optimizer, scheduler)
)
trainer.train()

In [None]:
# Reload tokenizer and model from the checkpoint saved by the TextRank run.
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path="./output/checkpoint-8704")
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="./output/checkpoint-8704")

In [None]:
# Generate summaries for the whole test split with the TextRank + T5 model.
batch_size = 12
summaries = []

# Fetch the text column once instead of on every loop iteration.
test_texts = test_dataset['text']

# Run generation on the device selected earlier (GPU when available).
model = model.to(device)
model.eval()

for i in range(0, len(test_texts), batch_size):

    # This model was fine-tuned on TextRank extracts truncated to 512 tokens,
    # so the test articles go through exactly the same preprocessing.
    extracts = [TextRank_summary(text, num_sentence) for text in test_texts[i:i + batch_size]]
    texts = [f'summarize: {text}' for text in extracts]
    inputs = tokenizer(texts, return_tensors="pt", max_length=512, truncation=True, padding="max_length").to(device)

    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=16,
        num_beams=4,
        early_stopping=True,
    )

    # Decode with the tokenizer loaded from the same checkpoint as the model,
    # not with the baseline's `original_tokenizer`.
    summary_texts = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    summaries.extend(summary_texts)

In [None]:
# Average ROUGE of the TextRank + T5 summaries against the test headlines.
rouge = Rouge()
scores = rouge.get_scores(
    hyps=summaries,
    refs=test_dataset['headline'],
    avg=True,
)
scores