# About

Computing rouge metrics by using the baseline BART model without any fine tuning on our test dataset. 

# Setups

In [2]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score rouge-score nltk
# rouge-score is the google version
!pip install pyarrow
!pip install -q sentencepiece

clear_output()

In [3]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk
nltk.download('punkt')

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

clear_output()

In [4]:
# sign into huggingface
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [5]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


# Load data

In [6]:
# specify your path to the repo here:
repo_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization'

In [7]:
%%time
from google.colab import drive
drive.mount('/content/gdrive')
data_path = os.path.join(repo_path, 'data/reddit_parquet/train_test_split')
os.chdir(data_path)
files = [i for i in os.listdir(data_path) if re.search("reddit", i)]

train = pd.read_parquet('reddit_train.parquet')
test = pd.read_parquet('reddit_test.parquet')
valid = pd.read_parquet('reddit_validation.parquet')

Mounted at /content/gdrive
CPU times: user 1.41 s, sys: 439 ms, total: 1.85 s
Wall time: 35.5 s


In [8]:
train.head(3)

Unnamed: 0,content,summary,subreddit,subreddit_group
779138,My dad got sick of the neighbors dog and went ...,neighbor married a piece of shit wife and she ...,AskReddit,advice/story
405453,"Is this the case? No, not entirely. First of a...",Most Christians in the United States don't bel...,TrueAtheism,media/lifestyle/sports
2422458,So after listening to [this]( nonstop I've dec...,"Could anyone with experience in choppy, call-a...",edmproduction,other


# Modeling

In [9]:
# load model, tokenizer, and rouge metric
model_checkpoint = 'facebook/bart-base'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
metric = load_metric("rouge")

clear_output()

In [10]:
# convert data to torch Dataset
raw_datasets = DatasetDict({
    'train': Dataset.from_dict({
        'content': train['content'],
        'summary': train['summary'],
        'subreddit': train['subreddit'],
        'subreddit_group': train['subreddit_group']
    }), 

    'test': Dataset.from_dict({
        'content': test['content'],
        'summary': test['summary'],
        'subreddit': test['subreddit'],
        'subreddit_group': test['subreddit_group']
    }), 

    'valid': Dataset.from_dict({
        'content': valid['content'],
        'summary': valid['summary'],
        'subreddit': valid['subreddit'],
        'subreddit_group': valid['subreddit_group']
    })
})

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 5000
    })
    valid: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 5000
    })
})

In [11]:
# tokenize everything
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    inputs = [doc for doc in examples["content"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)



  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [12]:
args = Seq2SeqTrainingArguments(
    f"BART-reddit-baseline",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4, # 16
    per_device_eval_batch_size=4, #16
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

In [13]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
%%time
# run this to train, which we won't do at the moment
# trainer = Seq2SeqTrainer(
#     model,
#     args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["valid"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

# trainer.train()

In [14]:
# optional to save to huggingface
#trainer.push_to_hub()

In [15]:
# then load model back in
# model = AutoModelForSeq2SeqLM.from_pretrained("trevorj/model_name")

In [70]:
%%time
# generate one prediction
output = model.generate(
    torch.tensor([tokenized_datasets['test']['input_ids'][0]]),
    num_beams=2, 
    # length_penalty=2.0, # doesn't seem to do anything. 
    max_length=100, #200, 
    min_length=2,
    no_repeat_ngram_size=3
)

print("Input text:")
print(tokenized_datasets['test']['content'][0])

print("\nTrue summary:")
print(tokenized_datasets['test']['summary'][0])

summary1 = tokenizer.decode(output.squeeze(), skip_special_tokens=True)
print(f"\nPredicted summary (n words = {len(summary1.split(' '))}):")
print(summary1)

print('\nRouge metrics')
rouge_metrics_summary1 = metric.compute(predictions=[summary1], references=[tokenized_datasets['test']['summary'][0]])
print(rouge_metrics_summary1)

Input text:
As the title says. I'm looking to fly some PvP in the near future. If I want to PvE, I can fit a Tengu or RNI or something. Normally, I'd explore and decide for myself, but my play time is limited these days, so I can't do much more than theory-craft before jumping into the thick of it. 
 I  can  fly other ships, but I greatly prefer Caldari boats. I just came back from a long absence, and everyone seems to be flying vastly different ships from when I left. No idea what to fly anymore, and all of my searches are coming up a year old or older. 
 Used to fly mostly Manticores, Feroxes, Nagas, and Merlins (with the occasional Hawk and Harpy when I felt like losing some ISK.) I can fly any Caldari sub-cap, except Golems (I have no need to train Marauders at the moment.) Losing something like a Tengu would hurt, but it wouldn't be anything I couldn't immediately replace. 
 I just bought one of those new Orthruses and a Jackdaw. Both seem like fun, even though the Orthrus isn't  

In [72]:
torch.tensor([tokenized_datasets['test']['input_ids'][0]]).shape

torch.Size([1, 310])

In [None]:
tokenized_datasets['test'].select(range(5))['input_ids']

In [87]:
torch.tensor(tokenized_datasets['test'].select(range(5))['input_ids'])

ValueError: ignored

In [86]:
torch.tensor([tokenized_datasets['test']['input_ids'][0]]).shape

torch.Size([1, 310])

In [93]:
# read this guide tomorrow to fix issue: https://huggingface.co/course/chapter2/5?fw=pt
# generate 5 summaries
output = model.generate(
    torch.tensor(tokenized_datasets['test'].select(range(5))['input_ids']),
    # tokenized_datasets['test'].select(range(3)),
    num_beams=2, 
    # length_penalty=2.0, # doesn't seem to do anything. 
    max_length=100, #200, 
    min_length=2,
    no_repeat_ngram_size=3
)

ValueError: ignored

In [49]:
# generate all test dataset summaries


In [56]:
model_output_path = os.path.join(repo_path, 'data/model_outputs/bart_baseline_1')
model_output_path

'/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_baseline_1'