# About

Computes ROUGE metrics on our test dataset using the baseline BART model, without any fine-tuning on our data.

# Setup

In [2]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score rouge-score nltk
# rouge-score is the google version
!pip install pyarrow
!pip install -q sentencepiece

clear_output()

In [4]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk
nltk.download('punkt')

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

clear_output()

In [5]:
# sign into huggingface
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [6]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


# Load data

In [7]:
# specify your path to the repo here:
repo_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization'

In [8]:
%%time
# Mount Google Drive so the repo's data folder is reachable from Colab.
from google.colab import drive
drive.mount('/content/gdrive')

# Work from inside the split folder; the parquet reads below use bare file names.
data_path = os.path.join(repo_path, 'data/reddit_parquet/train_test_split')
os.chdir(data_path)

# Names in the split folder that mention "reddit".
files = list(filter(lambda name: re.search("reddit", name), os.listdir(data_path)))

# Load the three splits in train / test / validation order.
train, test, valid = map(
    pd.read_parquet,
    ('reddit_train.parquet', 'reddit_test.parquet', 'reddit_validation.parquet'),
)

Mounted at /content/gdrive
CPU times: user 948 ms, sys: 353 ms, total: 1.3 s
Wall time: 18.5 s


In [9]:
train.head(3)

Unnamed: 0,content,summary,subreddit,subreddit_group
779138,My dad got sick of the neighbors dog and went ...,neighbor married a piece of shit wife and she ...,AskReddit,advice/story
405453,"Is this the case? No, not entirely. First of a...",Most Christians in the United States don't bel...,TrueAtheism,media/lifestyle/sports
2422458,So after listening to [this]( nonstop I've dec...,"Could anyone with experience in choppy, call-a...",edmproduction,other


# Modeling

In [56]:
# Candidate checkpoints; exactly one `model_checkpoint` assignment should be active.

# BART checkpoints (notes from manual spot checks):
# model_checkpoint = 'facebook/bart-base' # keeps returning the first sentence; extractive.
# model_checkpoint = 'facebook/bart-large-mnli' # same as above: only returns first sentences; extractive.
# model_checkpoint = 'sshleifer/distilbart-cnn-12-6' # works a bit better, but still seems to produce extractive summaries.
# 'sshleifer/distilbart-xsum-6-6' was recommended; it produces abstractive summaries pretty well and works best of the above so far.
model_checkpoint = 'sshleifer/distilbart-xsum-6-6'

# Pegasus checkpoints:
# model_checkpoint = "google/pegasus-xsum" # works really well
# model_checkpoint = 'google/pegasus-reddit_tifu' # also works really well

In [57]:
# Load the tokenizer and seq2seq model for the chosen checkpoint, plus the ROUGE metric.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package — confirm against the installed version.
metric = load_metric("rouge")

# Clear this cell's output.
clear_output()

In [58]:
# Wrap the pandas splits as Hugging Face `Dataset` objects inside one `DatasetDict`.
def _frame_to_dataset(frame):
    """Build a `Dataset` from the four columns of `frame` that we keep."""
    kept_columns = ('content', 'summary', 'subreddit', 'subreddit_group')
    return Dataset.from_dict({column: frame[column] for column in kept_columns})

raw_datasets = DatasetDict({
    'train': _frame_to_dataset(train),
    'test': _frame_to_dataset(test),
    'valid': _frame_to_dataset(valid),
})

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 5000
    })
    valid: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 5000
    })
})

In [59]:
# tokenize everything
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of posts and their reference summaries.

    Args:
        examples: batch mapping with a "content" list (model inputs) and a
            "summary" list (targets).

    Returns:
        The tokenizer output for the contents, with the summaries' token ids
        stored under "labels".
    """
    documents = list(examples["content"])
    encoded = tokenizer(documents, max_length=max_input_length, truncation=True)

    # Targets are tokenized inside the tokenizer's target-mode context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            examples["summary"], max_length=max_target_length, truncation=True
        )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [60]:
# Trainer configuration. Only consumed by the Seq2SeqTrainer cell, which is
# currently commented out because this notebook evaluates the model as-is.
args = Seq2SeqTrainingArguments(
    "BART-reddit-baseline",  # output directory; plain string, no f-prefix needed
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4, # 16
    per_device_eval_batch_size=4, #16
    weight_decay=0.01,
    save_total_limit=3,  # cap on the number of checkpoints kept on disk
    num_train_epochs=1,
    predict_with_generate=True,  # evaluate on generated text so ROUGE can be computed
    fp16=True,
    push_to_hub=True,
)

In [61]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    """Score generated summaries against reference summaries with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        dict mapping each ROUGE key to its mid F-measure scaled to 0-100, plus
        ``gen_len`` (mean number of non-pad tokens per prediction). All values
        are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is an ignore/padding marker rather than a vocabulary id, so it cannot
    # be decoded. Swap it for the pad token in predictions as well as labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (pad tokens excluded).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [62]:
%%time
# run this to train, which we won't do at the moment
# trainer = Seq2SeqTrainer(
#     model,
#     args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["valid"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

# trainer.train()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [63]:
# optional to save to huggingface
#trainer.push_to_hub()

In [64]:
# then load model back in
# model = AutoModelForSeq2SeqLM.from_pretrained("trevorj/model_name")

In [65]:
%%time
ind = 1

# Fetch the example once. Indexing the row first avoids materialising an entire
# column of the test split for every field we look at.
test_example = tokenized_datasets['test'][ind]

# generate one prediction
output = model.generate(
    torch.tensor([test_example['input_ids']]),  # leading batch dimension of 1
    num_beams=2,
    # length_penalty=0.001, # doesn't seem to do anything
    max_length=60,
    min_length=2,
    no_repeat_ngram_size=3
)

print("Input text:")
pprint(test_example['content'])

print("\nTrue summary:")
pprint(test_example['summary'])

summary1 = tokenizer.decode(output.squeeze(), skip_special_tokens=True)
print(f"\nPredicted summary (n words = {len(summary1.split(' '))}):")
pprint(summary1)

print('\nRouge metrics')
rouge_metrics_summary1 = metric.compute(predictions=[summary1], references=[test_example['summary']])
pprint(rouge_metrics_summary1)

Input text:
("I'm currently employed by a temp service that's contracted with a local "
 'factory which happens to rhyme with one of the places we nuked in WW2. I '
 'wish to find something in a different field (I would rather do telemarketing '
 "than this), but I'm at a loss on how to phrase that I'd be leaving my "
 'current job because I simply can\'t handle the whole "lift and move a 20-30 '
 'pound piece of metal 3 feet every 10 seconds for eight hours" part. I '
 "don't  hate  my job, but I feel my talents (I used to do freelance tech "
 "support before I had to move) are going to waste, the job isn't "
 'intellectually stimulating or challenging, and the whole "Oh god the pain '
 'please kill me now" is starting to get out of hand. \n'
 " How do I phrase this without sounding like a whiny, entitled brat? I don't "
 'think the work is "below me", etc., but it\'s certainly not a job I see '
 'myself at in five years.')

True summary:
'Job is painful and unfulfilling, how to phras

Making predictions with this BART model takes about 1.75 min for 20 observations, or ~5.5 sec per observation. At that rate, predicting on all 5k test observations should take about 7.5 hrs. Started at 11:04 am.
- It ended up taking 4 h 58 min (~5 hrs) for the 5k observations.

In [102]:
%%time
# batch predict and write to disk
def model_predict(model, input_ids, **generate_kwargs):
    """Generate a summary for one tokenized example and decode it to text.

    Args:
        model: seq2seq model exposing ``generate``.
        input_ids: list of token ids for a single example (no batch dimension).
        **generate_kwargs: optional overrides for the generation settings.
            Defaults are num_beams=2, max_length=60, min_length=2,
            no_repeat_ngram_size=3.

    Returns:
        The decoded summary string, with special tokens removed. Decoding uses
        the module-level ``tokenizer``.
    """
    options = {"num_beams": 2, "max_length": 60, "min_length": 2, "no_repeat_ngram_size": 3}
    options.update(generate_kwargs)

    # Build the single-example batch on the model's own device so this also
    # works when the model has been moved to a GPU. Falls back to the default
    # device when the model does not expose ``device``.
    batch = torch.tensor([input_ids], device=getattr(model, "device", None))
    output = model.generate(batch, **options)
    return tokenizer.decode(output.squeeze(), skip_special_tokens=True)

# Number of test rows to predict on. None means the full split (the full run
# took about 5 hours); set e.g. 20 for a quick smoke test.
n_predict = None

test_split = tokenized_datasets['test']
if n_predict is not None:
    test_split = test_split.select(range(n_predict))

df_results = pd.DataFrame({
    'content': test_split['content'],
    'y': test_split['summary'],
    'input_ids': test_split['input_ids']
})

# One generate() call per row; tqdm shows progress for this long-running loop.
df_results['yhat'] = [
    model_predict(model, input_ids=ids)
    for ids in tqdm(df_results['input_ids'], desc='predicting')
]
# Keep only the text columns; the token ids are not needed downstream.
df_results = df_results[['content', 'y', 'yhat']]

CPU times: user 4h 58min 22s, sys: 1min 39s, total: 5h 2s
Wall time: 4h 58min 52s


In [103]:
# maybe learn parallelizing this later, since we have 2 cores.

In [104]:
%%time
# write results to disk
# Derive the output folder from `repo_path` so it follows the configured repo
# location instead of repeating the absolute path.
out_path = os.path.join(repo_path, "data/model_outputs/bart_baseline_1/")
os.makedirs(out_path, exist_ok=True)  # make sure the folder exists before writing
f1 = os.path.join(out_path, "bart_baseline_preds.parquet")
df_results.to_parquet(f1)

CPU times: user 43 ms, sys: 15 ms, total: 58 ms
Wall time: 75.5 ms


In [105]:
# read back in and calc results
df_results_final = pd.read_parquet(f1)
df_results_final

Unnamed: 0,content,y,yhat
0,As the title says. I'm looking to fly some PvP...,What are some good Caldari ships right now for...,I'm a fan of the World of Warcraft video game...
1,I'm currently employed by a temp service that'...,"Job is painful and unfulfilling, how to phrase...",I'm a former soldier who has been working in ...
2,What? No. A few very large banks started giv...,bad banking policy and a lack of governmental ...,The BBC News website looks at what happened t...
3,I'm pretty sure they just look at IDs and give...,I'm pretty sure you can get in. If anybody as...,The University of South Africa (GSA) is hosti...
4,Computer Science is one of the worst departmen...,brighter days ahead for the CS department. The...,I'm a computer science student at the Univers...
...,...,...,...
4995,I was driving down fairly empty 55mph side str...,I crashed into a giant black guy's pick up tru...,A young girl in the US state of New Jersey ha...
4996,"Alright, that comment makes much more sense....","for ""Fahrenheit 451"", then try to discuss it w...",I've been talking to one of the people who ha...
4997,"My friend, Lance, just came back from a week-l...",friend wants to relax after a long fishing tri...,It's a bit of a strange situation when you're...
4998,"I will offer you the following advice, as prov...","Practice, network, audition, and be patient. I...","I'm a classical music teacher, and I want to ..."


In [108]:
# ROUGE over all saved test predictions (`yhat`) against the reference summaries (`y`).
# Single-example version, kept for reference:
# rouge_metrics_summary1 = metric.compute(predictions=[summary1], references=[tokenized_datasets['test']['summary'][ind]])
# pprint(rouge_metrics_summary1)
# NOTE(review): `compute_metrics` above passes use_stemmer=True and newline-separates
# sentences before scoring; this call does neither, so these numbers may not be
# directly comparable with metrics produced through the trainer — confirm which
# convention the reported results should use.
test_metrics = metric.compute(predictions=df_results_final['yhat'].tolist(), references=df_results_final['y'].tolist())
test_metrics

{'rouge1': AggregateScore(low=Score(precision=0.1607160671404885, recall=0.14929780341232818, fmeasure=0.13654078232765968), mid=Score(precision=0.16431575983543217, recall=0.15269417915827405, fmeasure=0.13914921052413262), high=Score(precision=0.16819449958926114, recall=0.155943663272228, fmeasure=0.14193615212245322)),
 'rouge2': AggregateScore(low=Score(precision=0.022066451198676323, recall=0.019934475305264352, fmeasure=0.01808121882864185), mid=Score(precision=0.023321365638406793, recall=0.02138406420792303, fmeasure=0.01913616881813648), high=Score(precision=0.024562110552903225, recall=0.022895696580133663, fmeasure=0.020152761615947534)),
 'rougeL': AggregateScore(low=Score(precision=0.12149415852538717, recall=0.11930613456063449, fmeasure=0.1056675680998527), mid=Score(precision=0.1242764608890442, recall=0.1218382072367182, fmeasure=0.10752325378402133), high=Score(precision=0.12685297292633488, recall=0.12452328871002091, fmeasure=0.10942146930574335)),
 'rougeLsum': Ag