In [1]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score nltk

# !pip install -q sentencepiece
# !pip install rouge-score

clear_output()

In [9]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# Hugging Face `datasets` types (Dataset / DatasetDict) and metric loader
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric

import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

In [3]:
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/gdrive.
drive.mount('/content/gdrive')
# Directory whose contents are listed in the next cell to find the data files.
data_dir = "/content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/tensorflow_datasets/reddit/1.0.0"
# Make data_dir the working directory: later cells hand the bare file names returned by
# os.listdir to tf.data.TFRecordDataset, so they must resolve relative to this directory.
os.chdir(data_dir)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
# Define the train/test split from the file names only; no data is read yet.
# os.listdir returns names in arbitrary order, so sort them: the next cell takes
# positional slices of `train`/`test`, which must select the same files on every run.
file_names = sorted(os.listdir(data_dir))
metadata_files = [name for name in file_names if not name.startswith('reddit-train')]

data_files = np.array([name for name in file_names if name.startswith('reddit-train')])
# Characters 22..26 of each file name are parsed as an integer -- presumably the shard
# index in names like "reddit-train.tfrecord-NNNNN-of-MMMMM" (TODO confirm against the
# actual file names). Files whose index is a multiple of 5 form the test split.
split_ind = np.array([int(name[22:27]) % 5 for name in data_files])
test = data_files[split_ind == 0]
train = data_files[split_ind != 0]

In [5]:
# Work with only a few files per split for now, to keep everything simple.
# The whole right-hand side is evaluated before any name is rebound, so `valid`
# is still sliced from the full training list.
train, valid, test = train[:3], train[4:5], test[:2]

In [30]:
start = time.time()
def parse_file(serialized_example, return_xy=False):
  """Parse one serialized example into a dict of string features.

  Args:
    serialized_example: the serialized record passed in by ``Dataset.map``.
    return_xy: accepted for compatibility; this function does not use it.

  Returns:
    The value returned by ``tf.io.parse_single_example`` for the feature
    spec below (one entry per field name).
  """
  field_names = (
      'author', 'body', 'content', 'id',
      'normalizedBody', 'subreddit', 'subreddit_id', 'summary',
  )
  # Every field is declared the same way: a fixed-length string feature of shape [].
  feature_spec = {name: tf.io.FixedLenFeature([], tf.string) for name in field_names}
  return tf.io.parse_single_example(serialized_example, feature_spec)


# One TFRecordDataset per split, each mapped through parse_file.
train_tf, valid_tf, test_tf = [
    tf.data.TFRecordDataset(split_files).map(parse_file)
    for split_files in (train, valid, test)
]

# Vectorized bytes -> str decoder (UTF-8).
decode_string = np.vectorize(lambda raw: raw.decode('utf-8'))

# load data into memory into dictionary. 
# figure out how to bypass this
## we're doing tf -> np -> pt. Want to go tf -> pt
def tf_to_dict(tf_item, columns=('content', 'summary', 'subreddit')):
  """Collect selected string fields of a parsed dataset into Python lists.

  Args:
    tf_item: iterable of dict-like records; each requested field must expose
      ``.numpy()`` returning UTF-8 encoded ``bytes``.
    columns: names of the fields to extract. Defaults to the three fields
      this notebook uses.

  Returns:
    dict mapping each column name to a list of decoded ``str`` values, in
    iteration order. Every list is empty when ``tf_item`` yields nothing.

  Raises:
    UnicodeDecodeError: if a value is not valid UTF-8.
  """
  dataset_dict = {column: [] for column in columns}
  for item in tf_item:
    for column, values in dataset_dict.items():
      # Decode the bytes directly; no need to route every single value through
      # np.vectorize and str() just to turn bytes into text.
      values.append(item[column].numpy().decode('utf-8'))
  return dataset_dict
# def tf_to_dict(tf_item):
#   dataset_dict = {'content': [], 'summary': [], 'subreddit': []}
#   for item in tf_item: 
#     dataset_dict['content'].append(item['content'])
#     dataset_dict['summary'].append(item['summary'])
#     dataset_dict['subreddit'].append(item['subreddit'])
#   return dataset_dict

# Turn each split into a `datasets.Dataset` (built from the in-memory dict that
# tf_to_dict returns) and group the three splits in a DatasetDict.
all_data = DatasetDict({
    split_name: Dataset.from_dict(tf_to_dict(split_tf))
    for split_name, split_tf in (
        ('train', train_tf),
        ('valid', valid_tf),
        ('test', test_tf),
    )
})

# Report how long the cell took since `start` was recorded.
seconds_elapsed = time.time() - start
print(f"{seconds_elapsed/60} minutes elapsed")

0.7704395135243733 minutes elapsed


In [31]:
# visualize some examples
import random
import pandas as pd
from IPython.display import display, HTML


def show_random_elements(dataset, num_examples=5):
    """Display a random sample of distinct rows of ``dataset`` as an HTML table.

    Args:
        dataset: object supporting ``len()``, indexing with a list of row
            indices (the result is passed to ``pd.DataFrame``), and a
            ``features`` mapping of column name -> feature type.
        num_examples: number of distinct rows to show; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: if ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in one call, replacing the
    # draw-and-retry loop that re-rolled on every duplicate.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Replace integer class ids with the names listed in typ.names.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(df.to_html()))


show_random_elements(all_data['train'])

Unnamed: 0,content,summary,subreddit
0,"Personally, I don't think camping is as big of a problem as it was in Reach due to the new ordnance drops. On maps like Hemorrhage, where the sniper rifle routinely spawned in the same location, people would camp for the entire match just to get their hands on it. There aren't many long, open maps in Halo 4 , so that sort of eliminates any chance of camping in the back of maps and sniping all the way across.",Maps in Halo 4 are less wide-open than in Reach and it's harder to camp with a sniper and go 27-0.,halo
1,"The thing is, what's viable/competitive now won't necessarily be viable/competitive weeks or months from now, so it's hard to make recommendations based on that. The best thing would be just to try champions during free weeks and buy the ones you find fun. A lot of people get hung up over who the flavour of the month champions are, or what champions are ""hard counters"" to other champions, but that doesn't really count for much if you're playing a champion you're bad at just because someone says they're good.",play what you find fun,leagueoflegends
2,"I was in the super bowl in offline franchise and it went to overtime. The cpu got to choose to reciever first, so that leaves me with the code of which side to defend. \n Now the wind has played a huge factor so far in the first 4 quarters so I knew I had to kick with the wind for a gane winning field goal. \n Here's the scenario, my options are defend left or defend right. \n These option are on top of eachother square being defend left, underneath it is defend right. \n And you wanna know what the fucking wind direction says? It's an arrow point at 280 degrees (down, slightly to the right). Oh and the camera view is of one touchdown in the background.",SO MY QUESTION IS HOW WAS I SUPPOSED TO KNOW WHICH WAY THE WIND IS BLOWING? \n I ended up getting the 16 mph wind blowing right in my fucking face bc I didn't know what to choose and I only had 3 seconds to figure out ea's bullshit. So my kickoff barely made it to their 10 and they had the wind at their back for longer field goals. Thats fucking stupid,Madden
3,We need to let Bethesda know the combat zone could be so much more. Adding a quest to turn it into a legit thing apart of any upcoming dlc or maybe even a free update either one really. Looking back one of my favorite things to do in Oblivion was the arena oh boy was that amazing. Fallout 3 had the pitt dlc which was okayish. Now looking forward Skyrim was missing an arena(they had one that got scrapped) and maybe they did have plans for the combat zone who knows.,Make combat zone a workable arena pls.,fo4
4,"Alright, a little background info here. My lovely Partner and I are live-in care assistants to a disabled friend of the family, Boss. She's had a few strokes and is very physically disabled, but is still as mentally sharp as she used to be. \n Now, she knows I'm a transman. She claims she's a big ally of the entire LGBTQILMNOP community, so when we first started working for her I felt comfortable telling her. Turns out she was doing a term paper for her graduate psych class on LGBT youth or something like that, and decided she wanted me and Partner to be her main interviewed subjects. Now, she knows my story (I came out to a very unreceptive audience and have generally had a rough time with being out), but in this interview she decided to make me relive every single moment of my trans-life. All those things you hate to be asked? She asked them. ""What was your old name?"" ""Have you had any surgeries?"" ""Have you been assaulted?"" ""Are your relatives okay with your choice to transition?"" You know, all those old cans of worms you spend so long trying to suppress. \n Along with that glaring example of shenanigans, there's also a bunch of just little remarks that she makes that just stab right through me. Here's just a few examples. \n \n The other day the 3 of us went to the store the other day. We went by the feminine products aisle, and Boss asked ""Do we need to get any of this stuff?"" Partner said ""No, I'm good"" and left it at that (she keeps up with that kind of stuff for the two of us). Boss went on to ask ""How about you, Phil? Do you need anything?"" in front of at least 4 or 5 other people on that particular aisle. I was so taken aback I couldn't even reply. \n One of those commercials for testosterone supplements aimed at middle aged men came on the TV. I joked to Partner ""That's my problem, low testosterone! Get me the phone, I'll order some."" Boss decides to pipe up ""You know they won't sell any to people like you, right?"" Oh shit, really? 
Well golly gee, I didn't figure, thanks for saving me from a ton of embarrassment! \n A while back I was feeling very ill so I decided not to wear my binder around the house (cause a binder plus a respiratory infection equals pain). I hadn't left mine and Partern's room all day cause she'd been taking care of me, but I was after a glass of water. So I decide to scuttle to the kitchen to grab it and run right back. I had on 2 shirts and a black hoodie, so it's not like errything was out and about. Boss sees me and decides to exclaim loudly ""Phil! Look at that, you're not wearing your binder!"" then proceeded to ask me a dozen questions about it, ignoring my very obvious discomfort. \n She also likes to give me tips on how to pass. Now, I pass pretty well. I'd put it at maybe 90% success rate (#winning). And yet she always has a little protip about passing. ""If you get a hair cut, you'll probably look more masculine."" ""Maybe you should put on another baggy shirt today."" ""You look like you've gained some weight, maybe you should lose it to look more masculine."" \n \n I dunno, maybe I'm just overreacting and real sensitive about the whole thing, but it's driven me to the point of madness. It just seems like the whole transgender thing is so prevalent in her eyes. I'm not a man, I'm a TRANSman, and she won't seem to let me forget it. Do y'all think I'm overreacting about this whole thing? And how can I politely bring up the fact that she's pissing me off and makin an ass out of herself without being rude about it? \n Edit for ze updates: Well, we had a nice chat about it. She didn't seem to understand before (or after) our conversation that what she was saying was pretty much stupid and making me grossly uncomfortable. It seems that she believes that since she claims to be bisexual (I am skeptical), she's automatically a savant in the area of all things LGBT. 
She's still being pretty ignorant at the speed of light (honest to God asked Partner if the sex was okay since I don't have manly bits [spoiler alert: I do, they just come with a harness]). I think this is gonna be an ongoing project here. But a huge thanks for all your advice, y'all!",Boss can say very rude and ignorant things and I can't tell if I'm being over sensitive or if she's really a jerk. What do?,transgender


# Load BART

In [32]:
from transformers import BartTokenizer, TFBartForConditionalGeneration
# Load the pretrained "facebook/bart-large" model first, then its tokenizer.
model, tokenizer = (
    loader.from_pretrained("facebook/bart-large")
    for loader in (TFBartForConditionalGeneration, BartTokenizer)
)

# ROUGE is the metric this notebook uses.
metric = load_metric('rouge')

Downloading:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [34]:
# One example, no fine-tuning.
# Index the row first: all_data['train']['content'][0] would load the entire
# 'content' column just to read a single string.
example_text = all_data['train'][0]['content']
inputs = tokenizer(example_text, max_length=1024, truncation=True, return_tensors="tf", padding='max_length')
summary_ids = model.generate(inputs["input_ids"],
                              # The input is padded to max_length, so pass the mask
                              # explicitly instead of leaving it to be inferred.
                              attention_mask=inputs["attention_mask"],
                              num_beams=1,
                              no_repeat_ngram_size=1,
                              min_length=1,
                              max_length=150)
pprint(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False), compact=True)

['Here is something Olympic middle distance runner Nick Willis wrote about '
 'fellow Kiwi Sean Adams (Pitt)  .. Basketball fans in New Zealand look once '
 'again to have a kiwhi who will be able play for the Detroit Pistons, and '
 'become one of their own on American hardwood! In terms or NBA draft history '
 'though – it’s been quite some time since we had an Australian player go pro; '
 'as far back at least 15 years ago when Kirk Penny was drafted by Miami '
 'Heat…and up until now I would say that aside from Steven “The Kid From '
 'Rotorua″ Smith ,New Zealander basketball fan here has never seen anything '
 'like this before: Up till recently–in fact over 20 seasons old-that']


In [81]:
# tokenize data to train the model
def tokenize_fun(x):
  """Tokenize the 'content' field, truncating and padding to 1024 tokens.

  Args:
    x: a single example (``x['content']`` is a string) or a batch
      (``x['content']`` is a list of strings).

  Returns:
    The tokenizer output for ``x['content']``.
  """
  return tokenizer(x['content'], max_length=1024, truncation=True, padding='max_length') #return_tensors="tf"

# batched=True hands the tokenizer a list of texts per call instead of invoking it
# once per example, matching how preprocess_function is mapped further down.
all_data_tokenized = all_data.map(tokenize_fun, batched=True)

  0%|          | 0/45097 [00:00<?, ?ex/s]

  0%|          | 0/15032 [00:00<?, ?ex/s]

  0%|          | 0/30066 [00:00<?, ?ex/s]

In [82]:
all_data_tokenized

DatasetDict({
    train: Dataset({
        features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask'],
        num_rows: 45097
    })
    valid: Dataset({
        features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask'],
        num_rows: 15032
    })
    test: Dataset({
        features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask'],
        num_rows: 30066
    })
})

In [35]:
# Second tokenization pass: this one also tokenizes the summary.
max_input_length, max_target_length = 1024, 128

model_checkpoint = 'facebook/bart-large'

# For the T5 checkpoints listed here, "summarize: " is put in front of every input;
# any other checkpoint gets an empty prefix.
prefix = (
    "summarize: "
    if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]
    else ""
)

def preprocess_function(examples, xvar='content', yvar='summary', max_input_length=1024, max_target_length=128):
    """Tokenize a batch of documents together with their summaries.

    Args:
        examples: batch mapping column names to lists of strings.
        xvar: column holding the input documents.
        yvar: column holding the target summaries.
        max_input_length: truncation length applied to the inputs.
        max_target_length: truncation length applied to the targets.

    Returns:
        The tokenizer output for the (prefixed) inputs, with the target token
        ids stored under the "labels" key.
    """
    source_texts = []
    for doc in examples[xvar]:
        source_texts.append(prefix + doc)
    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # The summaries are tokenized inside the tokenizer's target-tokenizer context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(examples[yvar], max_length=max_target_length, truncation=True)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

# now tokenize our data
all_data_tokenized2 = all_data.map(preprocess_function, batched=True)



  0%|          | 0/46 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/31 [00:00<?, ?ba/s]

In [36]:
# inspect. so now the input_ids are tokenized inputs. and the labels are tokenized summaries. 
all_data_tokenized2['train']

Dataset({
    features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 45097
})

In [44]:
# set params
batch_size = 8
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

# NOTE: only the first `debug_subset_size` rows of each split are used, to speed
# this up. Set it to None to use the full splits.
debug_subset_size = 10

# collate_fn handed to to_tf_dataset; it is configured to return TF tensors
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")


def _as_tf_dataset(split, shuffle):
    """Convert one tokenized split to a tf.data dataset with the shared settings.

    Both splits go through this helper so they always use the same `batch_size`,
    columns and collator; only `shuffle` differs between them.
    """
    if debug_subset_size is not None:
        # Never ask for more rows than the split actually has.
        split = split.select(range(min(debug_subset_size, len(split))))
    return split.to_tf_dataset(
        batch_size=batch_size,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
    )


train_dataset = _as_tf_dataset(all_data_tokenized2["train"], shuffle=True)
validation_dataset = _as_tf_dataset(all_data_tokenized2["valid"], shuffle=False)

In [48]:
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)

# Compile with the optimizer configured above (it was previously created but never
# used, so `learning_rate` and `weight_decay` had no effect). No explicit Keras loss
# is passed: the batches already carry "labels", and the Transformers TF models
# fall back to their own task loss when compile() is given none.
model.compile(optimizer=optimizer)

# NOTE(review): the recorded run of this cell ended in ResourceExhaustedError,
# presumably accelerator memory -- if it recurs, try a smaller `batch_size` or
# shorter inputs (TODO confirm the cause).
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)

ResourceExhaustedError: ignored