# Setups

In [1]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score nltk
# !pip install -q sentencepiece
# !pip install rouge-score # google package version

clear_output()

In [2]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

import nltk

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

In [3]:
# The PyTorch training walkthrough says to run this.
# nltk.sent_tokenize is called in compute_metrics further down; presumably it
# needs the 'punkt' resource fetched here -- TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Read data

In [4]:
# Timestamp used at the end of this cell to report how long loading took.
start = time.time()

from google.colab import drive
drive.mount('/content/gdrive')
data_dir = "/content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/tensorflow_datasets/reddit/1.0.0"
# The shard lists built below hold bare file names (no directory), so the
# later TFRecord reads rely on data_dir being the working directory.
os.chdir(data_dir)

# define train/test split from the data. but don't read the data yet.
# os.listdir returns entries in arbitrary order; sort them so the positional
# train/valid split and the downsampling below select the same shards on
# every run.
file_names = sorted(os.listdir(data_dir))
metadata_files = [i for i in file_names if re.search('^(?!reddit-train)', i)]

data_files = np.array([i for i in file_names if re.search('^reddit-train', i)])
# Characters 22..26 of each data file name are parsed as an integer
# (presumably the shard number -- verify against the file naming scheme).
# Files whose number is a multiple of 5 become the test set.
split_ind = np.array([int(x[22:27]) % 5 for x in data_files])
test_files = data_files[split_ind == 0]
train_files = data_files[split_ind != 0]

# split further into train/valid: first 80% (rounded up) of the remaining
# files stay in train, the rest become validation.
train_cutoff = int(np.ceil(len(train_files) * .8))
valid_files = train_files[train_cutoff:]
train_files = train_files[:train_cutoff]

# downsample to speed things up: keep only the first `subset_size` fraction
# (rounded up) of the files in each split.
subset_size = .02
train_files = train_files[:int(np.ceil(len(train_files) * subset_size))]
test_files = test_files[:int(np.ceil(len(test_files) * subset_size))]
valid_files = valid_files[:int(np.ceil(len(valid_files) * subset_size))]

# parse files
def parse_file(serialized_example, return_xy=False):
  """Parse one serialized example into a dict of scalar string features.

  Only the 'content', 'subreddit' and 'summary' features are requested.
  Other fields (author, body, id, normalizedBody, subreddit_id) are
  deliberately left out of the spec.

  Args:
    serialized_example: a single serialized record, passed straight to
      tf.io.parse_single_example.
    return_xy: accepted for interface compatibility; not used.

  Returns:
    Whatever tf.io.parse_single_example returns for the three-feature spec.
  """
  string_scalar = tf.io.FixedLenFeature([], tf.string)
  feature_spec = {
      'content': string_scalar,
      'subreddit': string_scalar,
      'summary': string_scalar,
  }
  return tf.io.parse_single_example(serialized_example, feature_spec)

# Build one parsed TFRecord dataset per split (train / valid / test).
train_tf, valid_tf, test_tf = [
    tf.data.TFRecordDataset(shard_files).map(parse_file)
    for shard_files in (train_files, valid_files, test_files)
]

# Element-wise UTF-8 decoder for bytes values.
decode_string = np.vectorize(lambda raw: raw.decode('utf-8'))

# Load each split fully into memory as a dictionary of lists.
# TODO: figure out how to bypass this in-memory step.
## We currently convert tf -> np -> pt; we want to go tf -> pt directly.
def tf_to_dict(tf_item):
  """Materialize an iterable of parsed records as a dict of string lists.

  Args:
    tf_item: iterable of records; each record maps 'content', 'summary' and
      'subreddit' to an object whose .numpy() returns UTF-8 encoded bytes.

  Returns:
    dict with keys 'content', 'summary' and 'subreddit'; each value is a
    list of str holding one entry per record, in iteration order. An empty
    input yields three empty lists.
  """
  fields = ('content', 'summary', 'subreddit')
  dataset_dict = {field: [] for field in fields}
  for item in tqdm(tf_item):
    for field in fields:
      # Decode the scalar bytes value directly instead of routing it through
      # a vectorized numpy function and str().
      dataset_dict[field].append(item[field].numpy().decode('utf-8'))
  return dataset_dict
# def tf_to_dict(tf_item):
#   dataset_dict = {'content': [], 'summary': [], 'subreddit': []}
#   for item in tf_item: 
#     dataset_dict['content'].append(item['content'])
#     dataset_dict['summary'].append(item['summary'])
#     dataset_dict['subreddit'].append(item['subreddit'])
#   return dataset_dict

# Wrap each in-memory split in a `Dataset` and group them in one DatasetDict.
all_data = DatasetDict({
    split_name: Dataset.from_dict(tf_to_dict(split_tf))
    for split_name, split_tf in (
        ('train', train_tf),
        ('valid', valid_tf),
        ('test', test_tf),
    )
})

# Report the wall-clock time since `start` was recorded at the top of the cell.
seconds_elapsed = time.time() - start
print(f"{seconds_elapsed/60} minutes elapsed")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
1.1387099107106526 minutes elapsed


In [None]:
# download model and tokenizer
# Both are loaded from the same checkpoint name; model_checkpoint is also read
# later to decide whether a task prefix is prepended to the inputs.
model_checkpoint = 'facebook/bart-base' 
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/532M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

# Tokenize data and fine-tune

In [None]:
# Display the DatasetDict to check its split names, columns and row counts.
all_data

DatasetDict({
    train: Dataset({
        features: ['content', 'summary', 'subreddit'],
        num_rows: 135292
    })
    valid: Dataset({
        features: ['content', 'summary', 'subreddit'],
        num_rows: 30064
    })
    test: Dataset({
        features: ['content', 'summary', 'subreddit'],
        num_rows: 45099
    })
})

In [None]:
# Token-length caps for the tokenized content (input) and summary (target).
max_input_length = 1024
max_target_length = 128

# When the checkpoint is one of the T5 variants, "summarize: " is put in front
# of every input document; any other checkpoint gets no prefix.
prefix = (
    "summarize: "
    if model_checkpoint in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
    else ""
)

def preprocess_function(examples, xvar='content', yvar='summary', max_input_length=1024, max_target_length=128):
    """Tokenize a batch of documents together with their reference summaries.

    Args:
        examples: mapping of column name to a list of strings.
        xvar: column holding the input documents.
        yvar: column holding the target summaries.
        max_input_length: truncation length passed to the tokenizer for inputs.
        max_target_length: truncation length passed to the tokenizer for targets.

    Returns:
        The tokenizer output for the (prefixed) documents, with an extra
        "labels" entry holding the "input_ids" of the tokenized summaries.
    """
    # Prepend the module-level task prefix (may be empty) to every document.
    source_texts = [prefix + text for text in examples[xvar]]
    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Summaries are tokenized inside the tokenizer's target-mode context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(examples[yvar], max_length=max_target_length, truncation=True)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

# now tokenize our data
# batched=True: preprocess_function is written for batches (it iterates over
# the list of documents in examples[xvar]), so rows are passed in groups.
all_data_tokenized = all_data.map(preprocess_function, batched=True)



  0%|          | 0/136 [00:00<?, ?ba/s]

  0%|          | 0/31 [00:00<?, ?ba/s]

  0%|          | 0/46 [00:00<?, ?ba/s]

In [None]:
# Now 'content' has been converted to 'input_ids' and 'attention_mask',
# and 'summary' has been converted to 'labels'.
all_data_tokenized

DatasetDict({
    train: Dataset({
        features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 135292
    })
    valid: Dataset({
        features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 30064
    })
    test: Dataset({
        features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 45099
    })
})

In [None]:
# Hyperparameters and run settings handed to the Seq2SeqTrainer.
args = Seq2SeqTrainingArguments(
    # --- output / checkpoints ---
    output_dir='finetuned_bart_v1',  # directory where model checkpoints are saved
    save_total_limit=3,  # keep at most 3 checkpoints over the whole run
    # --- optimization ---
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- evaluation ---
    evaluation_strategy="epoch",
    predict_with_generate=True,  # generate summaries when predicting/evaluating
    # fp16=True,  # defaults to False; 16-bit instead of 32-bit training, only usable with CUDA (GPU)
    # push_to_hub=True,
)

# define how to compute metrics from preds.
# ROUGE scorer consumed by compute_metrics below (`load_metric` is imported
# from `datasets` at the top of the notebook).
metric = load_metric("rouge")


def compute_metrics(eval_pred):
    """Score generated summaries against reference summaries with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. When
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict mapping each ROUGE key to its mid F-measure scaled by 100, plus
        ``"gen_len"`` (mean count of non-pad tokens per prediction); every
        value is rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the "ignore" filler and cannot be decoded; swap it for the pad
    # token in both arrays so decoding works and padding is not counted as
    # generated length.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# need this collator to pad the examples
# (preprocess_function tokenizes with truncation only and requests no padding,
# so padding is left to the collator)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Quick run on a tiny subset: only the first 10 training rows and the first
# 5 validation rows are used, to keep this fast.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=all_data_tokenized['train'].select(range(10)),
    eval_dataset=all_data_tokenized['valid'].select(range(5)),
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, content, subreddit. If summary, content, subreddit are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, content, subreddit. If summary, content, subreddit are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 16


LookupError: ignored

In [None]:
# Display the trainer object for inspection.
trainer

In [None]:
# says to run this:
# NOTE(review): this repeats the `import nltk` and 'punkt' download already
# done in the setup cells at the top of the notebook; it sits after the
# training cell, so it cannot help a top-to-bottom run -- consider removing.
import nltk
nltk.download('punkt')