In [None]:
from IPython.display import clear_output

# Install the third-party packages used by this notebook via a shell escape.
# NOTE(review): no versions are pinned here, so a fresh runtime may pull newer
# releases than the ones this notebook was written against.
!pip install datasets transformers rouge_score nltk

# !pip install -q sentencepiece
# !pip install rouge-score # google package version

# Remove the pip log from the cell output once the install has finished.
clear_output()

In [None]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric

import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

In [None]:
# Mount Google Drive (Colab only) and switch the working directory to the
# folder holding the TFRecord shards, so the bare file names produced by
# os.listdir(data_dir) in the next cell can be opened without a path prefix.
from google.colab import drive
drive.mount('/content/gdrive')
# NOTE(review): absolute, user-specific Drive path; adjust before running elsewhere.
data_dir = "/content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/tensorflow_datasets/reddit/1.0.0"
os.chdir(data_dir)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Define the train/test split from the shard file names only; no records are
# read yet.
# os.listdir returns entries in arbitrary order, so the names are sorted: the
# positional slices taken from `train` and `test` later would otherwise select
# different shards from one run to the next.
file_names = sorted(os.listdir(data_dir))
metadata_files = [name for name in file_names if not name.startswith('reddit-train')]

data_files = np.array([name for name in file_names if name.startswith('reddit-train')])
# Characters 22-26 of each shard name are parsed as the shard index
# (assumes names shaped like 'reddit-train.tfrecord-00000-of-NNNNN' -- TODO confirm).
# Every fifth shard (index % 5 == 0) is held out for testing.
split_ind = np.array([int(name[22:27]) % 5 for name in data_files])
test = data_files[split_ind == 0]
train = data_files[split_ind != 0]

In [None]:
# Shorten the data for now, to make everything simpler.
# The subsets are sliced from the full split (data_files + split_ind) rather
# than from `train`/`test` themselves, so re-running this cell is safe: slicing
# `train[4:5]` after `train` has already been cut to three shards would yield
# an empty validation set.
train_files_all = data_files[split_ind != 0]
valid = train_files_all[4:5]
train = train_files_all[:3]
test = data_files[split_ind == 0][:2]

In [None]:
# Wall-clock start for this cell; read back at the bottom of the cell to
# report how many minutes loading and conversion took.
start = time.time()
def parse_file(serialized_example, return_xy=False):
  """Parse one serialized record into a dict of scalar string tensors.

  Args:
    serialized_example: a single serialized example, as handed over by the
      TFRecordDataset this function is mapped onto.
    return_xy: accepted for interface compatibility; the body does not use it.

  Returns:
    Whatever tf.io.parse_single_example produces for the eight string
    features listed below.
  """
  field_names = (
      'author', 'body', 'content', 'id',
      'normalizedBody', 'subreddit', 'subreddit_id', 'summary',
  )
  # Each field is declared as a scalar (shape []) string feature.
  feature_spec = {
      field: tf.io.FixedLenFeature([], tf.string) for field in field_names
  }
  return tf.io.parse_single_example(serialized_example, feature_spec)


# Build one TFRecord pipeline per split, each mapped through parse_file.
train_tf, valid_tf, test_tf = [
    tf.data.TFRecordDataset(shards).map(parse_file)
    for shards in (train, valid, test)
]

# Element-wise UTF-8 decoder for bytes values.
decode_string = np.vectorize(lambda raw: raw.decode('utf-8'))

# Load the data into memory as a dictionary of lists.
# TODO: figure out how to bypass this in-memory step.
## We currently go tf -> np -> pt; we want to go tf -> pt directly.
def tf_to_dict(tf_item, columns=('content', 'summary', 'subreddit')):
  """Collect selected string fields of every record into Python lists.

  Args:
    tf_item: iterable of mappings whose values expose ``.numpy()`` returning
      UTF-8 encoded ``bytes`` (for example a parsed dataset iterated eagerly).
    columns: names of the fields to extract. Defaults to the three fields
      used for summarization.

  Returns:
    Dict mapping each name in ``columns`` to a list of ``str``, one entry per
    record, in iteration order. An empty iterable gives empty lists.
  """
  dataset_dict = {column: [] for column in columns}
  for item in tf_item:
    for column in columns:
      # Decode the bytes directly rather than round-tripping through a NumPy
      # unicode array, which would drop trailing NUL characters and needs a
      # module-level helper.
      dataset_dict[column].append(item[column].numpy().decode('utf-8'))
  return dataset_dict
# def tf_to_dict(tf_item):
#   dataset_dict = {'content': [], 'summary': [], 'subreddit': []}
#   for item in tf_item: 
#     dataset_dict['content'].append(item['content'])
#     dataset_dict['summary'].append(item['summary'])
#     dataset_dict['subreddit'].append(item['subreddit'])
#   return dataset_dict

# Materialize every split into a Hugging Face Dataset and group the three
# splits in one DatasetDict.
all_data = DatasetDict({
    split_name: Dataset.from_dict(tf_to_dict(records))
    for split_name, records in (
        ('train', train_tf),
        ('valid', valid_tf),
        ('test', test_tf),
    )
})

# Report how long the whole load-and-convert cell took.
seconds_elapsed = time.time() - start
print(f"{seconds_elapsed/60} minutes elapsed")

1.1519737203915914 minutes elapsed


In [None]:
# visualize some examples
import random
import pandas as pd
from IPython.display import display, HTML


def show_random_elements(dataset, num_examples=5):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to feature type.
        num_examples: number of distinct rows to show.

    Raises:
        AssertionError: if `num_examples` exceeds the number of rows.
        ValueError: if `num_examples` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct positions in a single call, replacing the
    # retry-until-unique loop over a growing list.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show class names instead of integer ids; `names` is bound as a
            # default argument so the lambda does not depend on the loop variable.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(df.to_html()))


# Eyeball a few random training rows (default: 5) as a sanity check on the loaded data.
show_random_elements(all_data['train'])

Unnamed: 0,content,summary,subreddit
0,"I'd rather go on a subreddit where people are themselves. If r/trees is 50% happier and nicer than your average subreddit then great, that says something about trees and you can enjoy it. \n But just because you're on r/trees you don't have to ACT chill and ""uptoke"" things and pretend everyone is awesome just because they smoke. It's just sad and honestly a little bit creepy that you can only be like some sterotypical smoker from some movie or be exiled. \n Maybe I've strayed a little away from ent and more towards r/trees mentality but I hope you got the point anyway.",Be yourself and not some creepy super positive needy person just because that's what people on r/trees pretend to be for fun.,trees
1,"We used to be 4 of the best friends ever: me [40M], my friend ""Bob"" [38M], the woman I would eventually marry, ""Sheila"" [39F], and our former friend ""Bertha"" [31F]. We were basically an inseparable four and would do almost everything together. We all met each other around the same time, Sheila and I have always been romantically involved since the very beginning, and the other connections have always only been friendships. \n But those friendships were (or seemed to be) so tight that when Sheila and I were able to buy a house, we bought a duplex, largely so that Bob and Bertha, who were definitely not so well off, could have a comfortable place to live at reduced rent until they got back on their feet. Bertha moved into the second apartment at about half of what it would go for on the open market. Bob moved into the much larger main apartment with us and was using up one bedroom rent-free. \n No good deed goes unpunished, I suppose, and after an episode where Sheila and I both got angry at Bertha for having a very loud party at 3 AM on a weeknight, fucking up a common area without cleaning it up, risking ruining my brand new hardwood floors, she chose to retaliate by telling everyone in town (or at least in our social circle) that I was constantly trying to fuck her (you know, in the house I shared with my wife), and demanding her rent in blowjobs. \n When this came to light, I felt utterly violated. I kicked her out of the house immediately (yes, legally--- giving her 30 days notice, which is the minimum that was required where we live) and never spoke to her again. \n Bob remains friendly (although maybe less than before) with Bertha, and this seems to be my problem at the moment. Bertha's uncalled-for allegations were violating enough, so much that I emotionally feel like anyone who knows what happened and nonetheless gives her ""aid and comfort"" must also be my enemy. 
I don't want to lose Bob as a friend, but every time he sees Bertha or talks to her (the three of us ran into her at the bar last night and he struck up a conversation with her), I lose respect for him. I've gotten to the point where I've been thinking of asking him to leave the house. And... that would be really sad, given the friendship, nay bromance, we once had. \n I don't want that to happen, but I don't want to control him either by telling him who he's allowed to be friends with. But I also feel like he can't be friends with her AND expect my friendship as well. He likes to play both sides of issues... but that won't work for me this time.","Don't want to lose best friend who continues friendship with former friend who betrayed me. But don't want to tell him who he can or can't be friends with, either. \n EDIT: thanks for all the replies, everyone. I really appreciate the variety of outlooks and opinions. Just writing it down makes me feel a bit better. If the topic comes up, or if I feel shitty about it again, I'll know what to say. Everyone involved is a redditor, and this post is climbing, so I feel I need to take it down, but thank you again; I appreciate everyone's input.",relationships
2,"An OSO can't do anything for you until you're enrolled in college, really. I didn't call my OSO until a few months into my first semester of college. \n The application process for a commissioning program usually takes a couple months and consists of a PFT, background check, medical screening, interview with an OSO, etc. Eventually all of this is sent as a package to a selection board, who will decide whether or not you will be accepted into your chosen commissioning program. From there your next step will be to attend Officer Candidates School. \n The 'wait' time that you will encounter with officer programs is getting to TBS after you've commissioned. So you'll go to OCS at some point during college (or immediately after) and commission as soon as you have both graduated college and completed OCS. Once you commission your next step will be The Basic School in Quantico, VA (also where OCS is located). And that's where the waiting game comes into play. When I first enrolled in the program, wait times for TBS were anywhere from 12-18 months. Now they're down to 3-4 months. There's no way of telling what those wait times will look like 6 years from now when you're ready to go to TBS, though. During this time you're a 2nd Lt, but you are not active duty, so you will not be paid. Most LTs waiting to go to TBS find a job someplace. Time spent waiting for TBS counts towards the reserve portion of your contract. A typical contract is 4 years active duty, 4 years reserve. So if you have to wait a year for TBS, you will only have to do 4 years active, 3 years reserve once you get to TBS.","It's not going to hurt to talk to an OSO now, but there's not going to be much they can do for you while you're in high school. Focus on getting into college first.",USMCboot
3,"So my boyfriend and I have been together for around 7 months now....it's pretty serious and we're really in love. I can't imagine being with anyone besides him. We get along beautifully and he always knows how to make me feel better. \n However, the first two weeks of school, one when I was here and one when I wasn't, my boyfriend got so incredibly black out drunk and cheated on me twice.\nHe has a history of alcoholism in his family and he has always been a heavy drinker, so his blacking out is not uncommon.\nIn the first instance he was at school a week before it started so he was just partying. He got so drunk that a girl who raped him last year (before we were together) I guess found him again and the same situation played out. They had sex but he really didn't remember until this past week. \nThe other situation was him being blackout when I stayed in and his phone died so he couldn't answer my messages or calls and he called me at 4 AM from the floor of his apartment looking for me asking me where he had been all night and I, of course, couldn't answer. Turns out he had seen his ex girlfriend and she gave him a blowjob.\nIt honestly disgusts me so much that that happened but I have always chalked it up to his drinking. He's getting better with it, but now it just causes all of our fights because he is so ridiculous when he drinks. \n I am not sure what to do....","my boyfriend has history of alcoholism in his family and drinks heavily, we have great relationship except when he drinks and has cheated on me twice while under the influence; what the fuck do I do?",relationships
4,"Hello everyone! \n I have many questions for those on here and require much relationship advice and help in this situation. So buckle up because this is a long story.. \n First things first. I am 21 y/o and in my senior year of college. I dated this girl for a year and 8 months. We met at school. This past summer we broke up around July 6-8. There was a moment of time where I felt accepting of it because that's what u need to do after a breakup in order to move on. She would ask me many times to get back together but all I could say was ""I don't know yet"". But I knew all along and was just scared. \n Shortly after those talks, I texted her saying that I wanted to see her to try things again. She then responded saying ""I don't think that's a good idea"" I couldn't believe it. I was heartbroken. Heartbroken from the beginning. I committed heart suicide, I broke my own heart along with hers. \n A couple months later I couldn't take it anymore. She lives in New York and I live in Rhode Island. I took a day off of work and drove to New York, bought her roses and surprised her at her house. We talked and I told her everything and apologized for everything. I didn't expect her to get back with me right then and I told her that. She was very receptive of everything I had to say and we both cried to each other. However, she ultimately told me she wasn't ready because she was afraid I would break her heart again. I Understood. Before I left back for those island we hugged. I kissed her on the forehead and before I drove off she said ""I love you"". \n Weeks went by and I would call her and reach out to her saying things like ""how have you been?"", ""I miss you"" ""I've been thinking about you a lot lately"". You know, stuff like that. I wouldn't get much of a response like ""I miss you too"" instead I would get something like ""no worries, it's okay"". This then got me confused. \n Now it's about August 15. 
I have been in school already for a few days now for football camp and still haven't stopped thinking about her and thinking about myself and everything that I have done that I need to fix and change in order for us to be happy again. I wrote her a letter. It was extensive so long story short, the letter said ""I have learned a lot, I love you with everything and more, let me show and not just tell you."" \n This letter was left at her dorm as a surprise. She read the letter and texted me and said ""thank you for the letter, I appreciate it, it was very sweet."" That's all. \n The next day I went to go talk to her face to face. I laid it all on the line, my manhood, balls and heart. Trying with everything I could to convince her that I am different and I made a mistake by breaking up with her and that I learned so much from everything. She was receptive. However, She basically repeated the same things, this time though with a bunch of mannerisms that weren't her, there were pieces of her friend that showed. Is her friend convincing her and standing in our way? \n I took everything for what it is. She said no. But implemented words like ""not right now""... Those words are meaningful and provide a sense of hope for us. Skip another week. I went to go talk to her again. Asking her to go on a walk and just get away from everything. She didn't want to go. So we talked in her room again. I brought up all of our good memories and reminded her of everything we had that was great together. All she could do in rebuttal to those memories was bring up all of the bad memories. She felt as if I was forcing her to get back with me, I can understand that however I had no such intentions. I just wanted to show her how much I care and truly want to make things work. Still got a no, but again with the words ""not right now"". \n Now we reach today when in writing this. We've seen each other through school (small school). I say hi, she doesn't say hi to me. It's awkward. 
There is still some sort of strong connection there. I can feel it. She stares at me and watches me walk past her until I'm out of sight. I have caught her staring at me twice as well and she immediately looks away once I catch her. \n What is going on? What do I do? How much longer do I need to wait? Any advice, anything at all will help me. I have been an emotional mess since the break up. I made a mistake and pushed away true love and now trying to get it back.",made a mistake and broke up with girlfriend of year and 8 months. Chased her to get her back for almost 2 months. Not much progress. Been receiving mixed signals from her. Such as Words she said and her staring at me. Still feel a strong connection when I look at her. Is it reciprocal?,relationships


# Load BART

In [None]:
# NOTE(review): both classes are already imported in the notebook's import cell.
from transformers import BartTokenizer, TFBartForConditionalGeneration
# Checkpoint name shared by the model and the tokenizer loaded below.
checkpoint = 'facebook/bart-base' # "facebook/bart-large"
# "t5-small" # try t5 small later
model = TFBartForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = BartTokenizer.from_pretrained(checkpoint)

# load the metric we'll use which is rouge
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package -- confirm against the installed version.
metric = load_metric('rouge')

Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/532M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
# one example, no fine tuning
# Index the row first so only one record is read, instead of materializing the
# entire 'content' column just to take its first element.
sample_post = all_data['train'][0]['content']
# A single example needs no padding; truncation still caps it at max_length tokens.
inputs = tokenizer(sample_post, max_length=1024, truncation=True, return_tensors="tf")
# Pass the attention mask explicitly alongside the ids rather than relying on
# generate() to infer it.
# NOTE(review): no_repeat_ngram_size=1 forbids any token from appearing twice
# in the output -- confirm this is intended.
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    num_beams=1,
    no_repeat_ngram_size=1,
    min_length=1,
    max_length=150,
)
pprint(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False), compact=True)

['Here is something Olympic middle distance runner Nick Willis wrote about '
 'fellow Kiwi Sean Adams (Pitt) Â In the history of basketball in New Zealand, '
 'it has been a long time since there have ever had two players reach this '
 'level. The first was Steven Marks from Portland State University who went '
 'un-drafted and never played for an NBA team over his 11 season career with '
 'them until he made enough to be drafted by Boston College as well on their '
 'roster during that same year’s draft class at age 19 years old:Sean marks '
 'also reached great heights when playing college ball under coach Jamie '
 'Dixon; Mark Davis Jr., another former Oregon native came out victorious '
 'against Detroit Pistons after only one game while still being selected #1 '
 'overall pick']


In [None]:
# Length caps (in tokens) for the tokenized posts and the tokenized summaries.
max_input_length = 1024
max_target_length = 128

model_checkpoint = checkpoint

# T5 checkpoints get the task prefix "summarize: " on every input; any other
# checkpoint gets an empty prefix.
prefix = (
    "summarize: "
    if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]
    else ""
)

def preprocess_function(examples, xvar='content', yvar='summary', max_input_length=1024, max_target_length=128):
    """Tokenize a batch of documents together with their reference summaries.

    Args:
        examples: mapping from column name to a list of strings.
        xvar: column holding the source documents.
        yvar: column holding the target summaries.
        max_input_length: truncation length for the documents, in tokens.
        max_target_length: truncation length for the summaries, in tokens.

    Returns:
        The tokenizer output for the documents, with the summaries' token ids
        stored under the extra key "labels".

    Uses the module-level `tokenizer` and `prefix`.
    NOTE(review): `as_target_tokenizer` is deprecated in newer transformers
    releases -- confirm against the installed version.
    """
    documents = [prefix + text for text in examples[xvar]]
    model_inputs = tokenizer(documents, truncation=True, max_length=max_input_length)

    # Tokenize the summaries while the tokenizer is in target mode.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples[yvar], truncation=True, max_length=max_target_length
        )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

# Tokenize every split. With batched=True the function receives a batch of
# rows per call (preprocess_function iterates over examples[xvar] accordingly).
# The result adds 'input_ids', 'attention_mask' and 'labels' columns.
all_data_tokenized2 = all_data.map(preprocess_function, batched=True)



  0%|          | 0/46 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/31 [00:00<?, ?ba/s]

In [None]:
# Inspect the result: `input_ids` hold the tokenized posts and `labels` hold
# the tokenized summaries.
all_data_tokenized2['train']

Dataset({
    features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 45097
})

In [None]:
# Training hyperparameters.
batch_size = 8
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

# NOTE: each split is truncated to `debug_rows` rows to speed up iteration.
# Raise this (or drop the .select() calls) for a real run.
debug_rows = 10
# Columns handed to the model; shared by both splits so they cannot drift apart.
model_columns = ["input_ids", "attention_mask", "labels"]

# The collator pads each batch and returns TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

train_rows = all_data_tokenized2["train"]
valid_rows = all_data_tokenized2["valid"]

# min(...) keeps .select() valid when a split has fewer than `debug_rows` rows.
train_dataset = train_rows.select(range(min(debug_rows, len(train_rows)))).to_tf_dataset(
    batch_size=batch_size,
    columns=model_columns,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation uses the same `batch_size` setting instead of a second hard-coded 8.
validation_dataset = valid_rows.select(range(min(debug_rows, len(valid_rows)))).to_tf_dataset(
    batch_size=batch_size,
    columns=model_columns,
    shuffle=False,
    collate_fn=data_collator,
)

In [None]:
from transformers import AdamWeightDecay

# Optimizer built from the hyperparameters set in the params cell.
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)

# Compile with the configured optimizer and no explicit Keras loss, so the
# model's built-in sequence-to-sequence loss is used. The batches carry their
# `labels` inside the feature dict, and the collator pads labels with -100,
# which a plain SparseCategoricalCrossentropy / SparseCategoricalAccuracy
# does not mask out.
model.compile(optimizer=optimizer)

# Train for the configured number of epochs rather than a second hard-coded value.
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)



<keras.callbacks.History at 0x7f39fbce9710>

In [None]:
# Show the first validation post, then its token ids decoded one id at a time.
print("Input post:")
first_valid_row = all_data_tokenized2["valid"][0]
pprint(first_valid_row['content'])
print("\ndetokenizing")
decoded_tokens = tokenizer.batch_decode(
    first_valid_row['input_ids'],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
pprint(decoded_tokens, compact=True)

Input post:
('Since I am blessed with the ability to use /gamemode, I plan to be in '
 'survival as much as possible when 0.9.0 comes out. I tried with this current '
 'world, but the temptation was too great!  Personally, I like to challenge '
 'myself with odd and gigantic builds. It helps keep me focused, and (I hope ) '
 'serves as an example of what could be done by other players who still think '
 'in terms of rectangular structures. Up until now, creative has been the '
 'fastest and most efficient way to do this, while still being able to police '
 'the world and help others with my limited playing time.')

detokenizing
['<s>', 'Since', ' I', ' am', ' blessed', ' with', ' the', ' ability', ' to',
 ' use', ' /', 'gam', 'em', 'ode', ',', ' I', ' plan', ' to', ' be', ' in',
 ' survival', ' as', ' much', ' as', ' possible', ' when', ' 0', '.', '9', '.',
 '0', ' comes', ' out', '.', ' I', ' tried', ' with', ' this', ' current',
 ' world', ',', ' but', ' the', ' temptation', ' was', 

Note: the compiled model is not working for generation yet; both `model.generate` cells below end in an `AttributeError`.

In [None]:
# make prediction:
# maybe figure out how to pass in the pre encoded input: all_data_tokenized2
valid_example = all_data_tokenized2["valid"][0]
# A single example needs no padding; truncation still caps it at max_length tokens.
inputs = tokenizer(valid_example['content'], max_length=1024, truncation=True, return_tensors="tf")
# generate() expects the tensor of input ids, not the tokenizer's whole output
# object, so the ids and attention mask are passed explicitly.
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    num_beams=1,
    no_repeat_ngram_size=1,
    min_length=1,
    max_length=150,
)
print("Predicted summary:")
pprint(tokenizer.batch_decode(summary_ids,
                              skip_special_tokens=True, clean_up_tokenization_spaces=False), compact=True)

print("\nTrue summary:")
pprint(valid_example['summary'])

# print (not pprint) so the heading is shown as text rather than as a quoted repr.
print("\nTrue summary decoded:")
# decode() turns the one id sequence back into one string; batch_decode on a
# flat id list would decode every token separately.
pprint(tokenizer.decode(valid_example['labels'],
                        skip_special_tokens=True, clean_up_tokenization_spaces=False))

AttributeError: ignored

In [None]:
# generate() works on tensors, not on a tf.data.Dataset, so walk the batches
# and generate for each one. Results stay a list with one tensor of token ids
# per batch, because batches can be padded to different lengths.
# Assumes every element of validation_dataset is a dict of features (it was
# built with `columns=[...]` and no `label_cols`) -- TODO confirm.
summary_ids = [
    model.generate(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],
        num_beams=1,
        no_repeat_ngram_size=1,
        min_length=1,
        max_length=150,
    )
    for batch in validation_dataset
]


AttributeError: ignored

# Try pytorch

In [None]:
# we already have our tokenized dataset to use here: all_data_tokenized2
# restart model:
# From here on `model` and `tokenizer` are rebound to freshly loaded objects
# for the same checkpoint, replacing the TensorFlow model trained above.
checkpoint = 'facebook/bart-base' 
#model = TFBartForConditionalGeneration.from_pretrained(checkpoint)
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# NOTE(review): all_data_tokenized2 was produced with the BartTokenizer loaded
# earlier; presumably the Auto tokenizer shares its vocabulary -- confirm.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/532M [00:00<?, ?B/s]

In [None]:
# set hyperparams
import torch

args = Seq2SeqTrainingArguments(
    output_dir = 'finetuned_bart_v1', # file path to save checkpoints of the model.
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3, # only make 3 checkpoint saves maximum through training process.
    num_train_epochs=1,
    predict_with_generate=True, # use this to predict summaries.
    # 16-bit mixed precision needs a CUDA device, so enable it only when one is
    # available; on a CPU-only runtime this falls back to 32-bit training
    # instead of failing while the arguments are built.
    fp16=torch.cuda.is_available(),
    # push_to_hub=True,
)

In [None]:
# define how to compute metrics from preds. 
# We'll use rouge from the 'metric' object we defined above
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first item holds the
            generated ids.

    Returns:
        Dict mapping each key reported by the module-level `metric` to its
        mid F-measure as a percentage, plus "gen_len" (mean number of non-pad
        tokens per prediction); every value is rounded to 4 decimals.

    Uses the module-level `tokenizer` and `metric`.
    NOTE(review): nltk.sent_tokenize needs NLTK's sentence-tokenizer data to
    be present; no download call is visible in this notebook -- confirm.
    """
    predictions, labels = eval_pred
    # Some models hand back (generated_ids, extra_outputs); keep only the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding sentinel that cannot be decoded, so swap it for the pad
    # token. Depending on the transformers version the predictions may carry
    # it too, hence both arrays are cleaned.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep the mid F-measure of every reported score, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (pad tokens excluded).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# need this collator to pad the examples
# This rebinds `data_collator`: the earlier one was created with
# return_tensors="tf", while this one is built around the reloaded model and
# leaves return_tensors at the library default.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Assemble the trainer from the reloaded model, the training arguments, the
# tokenized train/valid splits, the padding collator and the ROUGE metric
# function defined above.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=all_data_tokenized2["train"],
    eval_dataset=all_data_tokenized2["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# NOTE(review): the saved output of this cell is a RuntimeError whose traceback
# was stripped, so the cause cannot be read from the notebook. One possibility
# is that the TensorFlow model loaded earlier still holds GPU memory -- unconfirmed.
trainer.train()

RuntimeError: ignored