#Load Dataset

In [None]:
## The dataset consists of 3,848,330 posts with an average length of 270 words for content, and 28 words for the summary.
## Min = 20 words Max = 50 words
import tensorflow_datasets as tfds 
data = tfds.load(name='reddit')

#Import and Parse Dataset

In [2]:
# all imports
import os
import re
import tensorflow as tf
import numpy as np 
import pandas as pd
from pprint import pprint
from tqdm import tqdm

import tensorflow_datasets as tfds 
from transformers import TFAutoModelForSequenceClassification
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration


In [3]:
!pip install -q transformers

In [4]:
!pip install -q sentencepiece

In [5]:
!pip install -q datasets

In [6]:
# so now data is in here
# the data sits on disk, and not memory
path = '/root/tensorflow_datasets/reddit/1.0.0/'
os.chdir(path)

file_names = os.listdir(path)
metadata_files = [i for i in file_names if re.search('^(?!reddit-train)', i)]
train_records = [i for i in file_names if re.search('^reddit-train', i)]

# create small subset for inspecting
train_records_small = train_records[:10]

In [7]:
# Parsing it into a json format
raw_dataset = tf.data.TFRecordDataset(train_records_small)

# limit 2 examples
for i, raw_record in enumerate(raw_dataset.take(1)):
  example = tf.train.Example()
  example.ParseFromString(raw_record.numpy())
  print(example)
  if i > 1:
    break


features {
  feature {
    key: "author"
    value {
      bytes_list {
        value: "moleculariant"
      }
    }
  }
  feature {
    key: "body"
    value {
      bytes_list {
        value: "Japan is basically a thin strip of islands on the edge of Asia. While rich in culture, it is important to understand that overpopulation is likely, given that Japanese heritage is not only coveted by cultures across the globe, it is famous. When I say famous, I mean to express the concept that during and after WW2, America came to know and love the culture of Japan in a way we never understood prior to the government\'s clashing. It makes sense now that while Japan remains a sovereign government, western ideas and ideologies have been adopted, yet the land mass of Japan cannot sustain this America/Euro proclivity, so the Japanese government has promoted breeding, and as a result has led to overpopulation.\n\nTL;DR Japanese people rock, but there is only so much Japan."
      }
    }
  }
  feat

In [8]:
# parse the output by defining the var types
# this seems like the best way so far.
def parse_file(serialized_example):

  file_scruct = {
      'author': tf.io.FixedLenFeature([], tf.string),
      'body': tf.io.FixedLenFeature([], tf.string),
      'content': tf.io.FixedLenFeature([], tf.string),
      'id': tf.io.FixedLenFeature([], tf.string),
      'normalizedBody': tf.io.FixedLenFeature([], tf.string),
      'subreddit': tf.io.FixedLenFeature([], tf.string),
      'subreddit_id': tf.io.FixedLenFeature([], tf.string),
      'summary': tf.io.FixedLenFeature([], tf.string),
  }

  example1 = tf.io.parse_single_example(serialized_example, file_scruct)
  return example1

# parse:
dataset = tf.data.TFRecordDataset(train_records_small).map(parse_file)


# print it out
# this prints out too many
# for item in dataset:
#   print(item['author'], item['summary'], item['subreddit'])

# just print a few
for i, item in enumerate(dataset):
  tup1 = (item['author'], item['body'], item['content'], item['summary'], item['subreddit'])
  print(tup1)
  if i >= 10:
    break

(<tf.Tensor: shape=(), dtype=string, numpy=b'moleculariant'>, <tf.Tensor: shape=(), dtype=string, numpy=b"Japan is basically a thin strip of islands on the edge of Asia. While rich in culture, it is important to understand that overpopulation is likely, given that Japanese heritage is not only coveted by cultures across the globe, it is famous. When I say famous, I mean to express the concept that during and after WW2, America came to know and love the culture of Japan in a way we never understood prior to the government's clashing. It makes sense now that while Japan remains a sovereign government, western ideas and ideologies have been adopted, yet the land mass of Japan cannot sustain this America/Euro proclivity, so the Japanese government has promoted breeding, and as a result has led to overpopulation.\n\nTL;DR Japanese people rock, but there is only so much Japan.">, <tf.Tensor: shape=(), dtype=string, numpy=b"Japan is basically a thin strip of islands on the edge of Asia. While

In [9]:
# get data ready for summarizing

dataset = tf.data.TFRecordDataset(train_records_small).map(parse_file)
dataset_dict = {'content': [], 'summary': []}

decode_string = np.vectorize(lambda x: x.decode('utf-8'))

for item in dataset: 
  dataset_dict['content'].append(decode_string(item['content'].numpy()))
  dataset_dict['summary'].append(decode_string(item['summary'].numpy()))

#Fine-tune Pegasus Model

In [10]:
## Tokenize dataset to fine-tune model (https://huggingface.co/transformers/v4.9.2/training.html)
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

In [11]:
##Is this step necessary?
#def tokenize_function(examples):
#    return tokenizer(examples["content"], padding="max_length", truncation=True)

#tokenized_datasets = dataset_dict.map(tokenize_function)

In [None]:
##Pretrain and compile model
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration

pega_model = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

pega_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

pega_model.fit(dataset_dict['content'], validation_data=dataset_dict['summary'], epochs=3)

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

All the layers of TFPegasusForConditionalGeneration were initialized from the model checkpoint at google/pegasus-xsum.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFPegasusForConditionalGeneration for predictions without further training.


In [None]:
pega_model.save_pretrained("my_pega_model")

In [13]:
pega_model.summary()

Model: "tf_pegasus_for_conditional_generation_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFPegasusMainLayer)  multiple                  569748480 
                                                                 
Total params: 569,844,583
Trainable params: 569,748,480
Non-trainable params: 96,103
_________________________________________________________________


#Test Model

In [14]:
inputs = tokenizer(np.array2string(dataset_dict['content'][1]), max_length=1024, truncation=True, return_tensors="tf")
inputs['input_ids'].shape

TensorShape([1, 138])

In [19]:
# Generate Summary
summary_ids = pega_model.generate(inputs["input_ids"], 
                              num_beams=4,
                              no_repeat_ngram_size=2,
                              min_length=20,
                              max_length=50)
print("content:")
print(dataset_dict['content'][1])

print("\n\true:")
print(dataset_dict['summary'][1])

print("\n\nprediction:")
pprint(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0], compact=True)


\content:
Depends on how you measure. Adjusting levels to make the nip-outline a little more clear indicates that the right nipple (including what appears to be the areola) is a circle with a radius of 8 or 9 pixels. If we measure the radius of clinton's eye from the center of the pupil to the outside of the iris, I find get a circle with a radius of 6 or 7 pixels. Of course there are confounding factors at play here. Portman's nipples appear to be slightly closer to the camera, and I'm measuring the radius of the shadow on cast on the shirt which is merely a correlate of nipple size.

	rue:
Natalie's nips are (pending further investigation) slightly larger than Hillary's irises 
 EDIT: By "right" nipple, I mean right with respect to the viewer. In other words, Natalie Portman's left nipple 
 EDIT EDIT: Further research has turned up the following information. This  study . Even more interesting, they found a proportion of 1:3.4 for Breast/Areola size. Thus if someone could find the a