# Setups

In [1]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score nltk
!pip install -q sentencepiece # need for pegasus
!pip install rouge-score # google package version

clear_output()

In [2]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import nltk

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart
from transformers import TFAutoModelForSequenceClassification
# package from google research: https://github.com/google-research/google-research/tree/master/rouge
from rouge_score import rouge_scorer

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

In [3]:
# torch train says it needs this
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Read data

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')
# data_path ="/content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/reddit_parquet/"
data_path ="/content/gdrive/MyDrive/w266_reddit_summarization/data/reddit_parquet/"

os.chdir(data_path)
files = [i for i in os.listdir(data_path) if re.search("reddit_data", i)]

Mounted at /content/gdrive


In [5]:
df = pd.read_parquet(files[0])
df = df.iloc[:55000]

In [6]:
train, test = train_test_split(df, test_size=5/55, random_state=1)

print(train.shape)
print(test.shape)

(50000, 3)
(5000, 3)


In [8]:
def group_subreddit(subreddit):

  if subreddit in ['buildapc', 'LifeProTips', 'IAmA', 'DoesAnybodyElse',
                   ] or re.search('advice|ask|relationship|explain|question', subreddit.lower()):
    x = 'advice'
  elif subreddit in ['leagueoflegends', 'DotA2', 'starcraft', 'magicTCG', 
                     'Guildwars2', 'DestinyTheGame', 'pcmasterrace', 
                     'Planetside', 'rpg', 'pokemon', 'smashbros', 'swtor', 
                     'runescape', 'battlefield3', 'DarkSouls2', 'LeagueofLegendsMeta', 
                     'WorldofTanks', 'darksouls', 'gamedev', 'Minecraft', 'Diablo', 
                     'DnD', 'skyrim', 'halo', 'PS4', 'xboxone', 'battlefield_4', 
                     'ShouldIbuythisgame', 'Pathfinder_RPG', 'elderscrollsonline', 
                     'Fallout', 'GrandTheftAutoV'
                     ] or re.search('gaming|games', subreddit.lower()):
    x = 'gaming'
  elif subreddit in ['tifu', 'TwoXChromosomes', 'offmychest', 'todayilearned', 
                     'fffffffuuuuuuuuuuuu', 'TalesFromRetail', 'JusticePorn', 
                     'confession']:
    x = 'story'
  elif re.search('funny|comedy|changemyview|news|politic|atheis|religion|christian|islam|mormon', subreddit.lower()):
    x = 'news/life'
  elif re.search('sport|baseball|soccer|golf|football|basketball|nfl|nba|mlb', subreddit.lower()):
    x = 'sports'
  elif re.search('pics|videos', subreddit.lower()):
    x = 'pics/videos'
  else:
    x = 'other'
  
  return x

train['subreddit_group'] = train['subreddit'].map(group_subreddit)
test['subreddit_group'] = test['subreddit'].map(group_subreddit)

# Baseline pegasus

In [9]:
# download pegasus
pega_model = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

# model = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-reddit_tifu", from_pt=True)
# tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-reddit_tifu")

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

All the layers of TFPegasusForConditionalGeneration were initialized from the model checkpoint at google/pegasus-xsum.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFPegasusForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

In [11]:
model = pega_model
model.summary()

Model: "tf_pegasus_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFPegasusMainLayer)  multiple                  569748480 
                                                                 
Total params: 569,844,583
Trainable params: 569,748,480
Non-trainable params: 96,103
_________________________________________________________________


In [12]:
def pega_predict(text, model=model, tokenizer=tokenizer):
  inputs = tokenizer(text, max_length=1024, truncation=True, padding="max_length", return_tensors="tf")
  summary_ids = model.generate(inputs["input_ids"], num_beams=1, no_repeat_ngram_size=1, min_length=10, max_length=100)
  yhat = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
  return yhat

In [13]:
%%time
# make preds on a subset
test_small = test.iloc[:100]
test_small['yhat'] = test_small['content'].map(pega_predict)

CPU times: user 26min 30s, sys: 57.8 s, total: 27min 28s
Wall time: 26min 42s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
# takes 15:36 for 100 obs to run
print(15.5 * 60 / 100) # 9.3 sec per obs

# 2nd attempt took even longer: 26.5 min
print(26.5 * 60 / 100) # 16 sec per obs

9.3
15.9


In [19]:
# compare and compute rouge
yhat = test_small['yhat'].iloc[0]
y = test_small['summary'].iloc[0]

print("true:")
pprint(y)

print("\n\nprediction:")
pprint(yhat)

print("\n\nMetrics:")
# set up type of scores you want
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rouge3', 'rougeL'], use_stemmer=True)
scores = scorer.score(target=y, prediction=yhat)
scores

true:
'whiny emo bullshit/ humblebrag'


prediction:
('"Work yourself into a good state of mind and body", that\'s what I heard '
 'from my friend who works in the same industry as me.')


Metrics:


{'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rouge3': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.0, recall=0.0, fmeasure=0.0)}

In [20]:
# compute metrics for all test obs
metrics_results = {
    "rouge1_precision": []
    ,"rouge2_precision": []
    ,"rouge3_precision": []
    ,"rougeL_precision": []
    ,"rouge1_recall": []
    ,"rouge2_recall": []
    ,"rouge3_recall": []
    ,"rougeL_recall": []
    ,"rouge1_f1": []
    ,"rouge2_f1": []
    ,"rouge3_f1": []
    ,"rougeL_f1": []
}


scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rouge3', 'rougeL'], use_stemmer=True)
for y,yhat in zip(test_small['summary'].tolist(), test_small['yhat'].tolist()):

  res = scorer.score(target=y, prediction=yhat)
  metrics_results['rouge1_precision'].append(res['rouge1'][0])
  metrics_results['rouge2_precision'].append(res['rouge2'][0])
  metrics_results['rouge3_precision'].append(res['rouge3'][0])
  metrics_results['rougeL_precision'].append(res['rougeL'][0])
  metrics_results['rouge1_recall'].append(res['rouge1'][1])
  metrics_results['rouge2_recall'].append(res['rouge2'][1])
  metrics_results['rouge3_recall'].append(res['rouge3'][1])
  metrics_results['rougeL_recall'].append(res['rougeL'][1])
  metrics_results['rouge1_f1'].append(res['rouge1'][2])
  metrics_results['rouge2_f1'].append(res['rouge2'][2])
  metrics_results['rouge3_f1'].append(res['rouge3'][2])
  metrics_results['rougeL_f1'].append(res['rougeL'][2])


In [21]:
for key in metrics_results.keys():
  print(f'{key}: {np.mean(metrics_results[key]):.4f}')

rouge1_precision: 0.1627
rouge2_precision: 0.0166
rouge3_precision: 0.0027
rougeL_precision: 0.1251
rouge1_recall: 0.1790
rouge2_recall: 0.0128
rouge3_recall: 0.0019
rougeL_recall: 0.1472
rouge1_f1: 0.1361
rouge2_f1: 0.0133
rouge3_f1: 0.0022
rougeL_f1: 0.1071


# fine tune pegasus on subreddit (tensorflow)

In [22]:
train['subreddit_group'].value_counts()

other          21644
advice         16133
gaming          5124
news/life       3971
pics/videos     1251
story           1181
sports           696
Name: subreddit_group, dtype: int64

In [None]:
train_advice_small = train[train['subreddit_group'] == 'advice'].iloc[:10]

In [None]:
model_advice = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

All the layers of TFPegasusForConditionalGeneration were initialized from the model checkpoint at google/pegasus-xsum.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFPegasusForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

In [None]:
model_advice.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)



# fails
# x_input = tokenizer(train_advice_small['content'].tolist(), max_length=1024, truncation=True, padding="max_length", return_tensors="tf")
# y_input = tokenizer(train_advice_small['summary'].tolist(), max_length=1024, truncation=True, padding="max_length", return_tensors="tf")
# model_advice.fit(x=np.array(x_input['input_ids']), y=np.array(y_input['input_ids']), epochs=3) # fails
# model_advice.fit(x=x_input['input_ids'], y=y_input['input_ids'], epochs=3) # fails

# fails
# model_advice.fit(x=train_advice_small['content'].tolist(), y=train_advice_small['summary'].tolist(), epochs=3) # fails
# model_advice.fit(x=np.array(train_advice_small['content'].tolist()), y=np.array(train_advice_small['summary'].tolist()), epochs=3) # fails

# fails
# x_input = [np.array(i) for i in train_advice_small['content'].tolist()]
# y_input = [np.array(i) for i in train_advice_small['summary'].tolist()]
# model_advice.fit(x=x_input, y=y_input, epochs=3)

# try using tokenizer.as_target_tokenizer() for y
x_input = tokenizer(train_advice_small['content'].tolist(), max_length=1024, truncation=True, padding="max_length", return_tensors="tf")
y_input = tokenizer(train_advice_small['summary'].tolist(), max_length=1024, truncation=True, padding="max_length", return_tensors="tf")
model_advice.fit(x=np.array(x_input['input_ids']), y=np.array(y_input['input_ids']), epochs=3) # fails

model_advice.fit()

# Try pytorch

In [23]:
model_checkpoint = "google/pegasus-xsum"
# tokenizer = PegasusTokenizer.from_pretrained(model_checkpoint)
# model = TFPegasusForConditionalGeneration.from_pretrained(model_checkpoint)

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

train_advice_small = train[train['subreddit_group'] == 'advice'].iloc[:10]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

In [24]:
all_data = DatasetDict({
    'train': Dataset.from_dict({
        'content': train_advice_small['content'],
        'summary': train_advice_small['summary'],
        'subreddit': train_advice_small['subreddit'],
        'subreddit_group': train_advice_small['subreddit_group']
    })
})

all_data

DatasetDict({
    train: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 10
    })
})

In [25]:
model_checkpoint = 'google/pegasus-xsum'

# tokenize summary and content
max_input_length = 1024
max_target_length = 128

# if we're using t5, append "summarize: " in front
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

def preprocess_function(examples, xvar='content', yvar='summary', max_input_length=max_input_length, max_target_length=max_target_length):
    inputs = [prefix + doc for doc in examples[xvar]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples[yvar], max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# now tokenize our data
all_data_tokenized = all_data.map(preprocess_function, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [26]:
# set hyperparms
args = Seq2SeqTrainingArguments(
    output_dir = 'pegasus_round1', # file path to save checkpoints of the model. 
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3, # only make 3 checkpoint saves maximum through training process. 
    num_train_epochs=1,
    predict_with_generate=True, # use this to predict summaries. 
    # fp16=True, # default is false, whether to use 16-bit precision training instead of 32. Can only be used w/ CUDA error (gpu). 
    # push_to_hub=True,
)

# define how to compute metrics from preds. 
# We'll use rouge from the 'metric' object we defined above
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

# need this collator to pad the examples
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [27]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=all_data_tokenized['train'],
    eval_dataset=all_data_tokenized['train'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `PegasusForConditionalGeneration.forward` and have been ignored: content, summary, subreddit, subreddit_group. If content, summary, subreddit, subreddit_group are not expected by `PegasusForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1


RuntimeError: ignored

In [1]:
print(model)

NameError: ignored