#Steps to save data as parquet

In [3]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score nltk
!pip install pyarrow
# !pip install -q sentencepiece
# !pip install rouge-score # google package version

clear_output()

In [4]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')
write_path ="/content/gdrive/MyDrive/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
start = time.time()
raw_datasets = load_dataset("reddit")
print(f"{(time.time() - start)/60} minutes elapsed")

Using custom data configuration default
Reusing dataset reddit (/root/.cache/huggingface/datasets/reddit/default/1.0.0/98ba5abea674d3178f7588aa6518a5510dc0c6fa8176d9653a3546d5afcb3969)


  0%|          | 0/1 [00:00<?, ?it/s]

0.06976009209950765 minutes elapsed


In [7]:
# slice it up and save it in chunks
# doing 500k chunks
total_obs = len(raw_datasets['train']['subreddit'])
subset_chunk = np.arange(0, total_obs, step=500000)
subset_chunk = np.append(subset_chunk, total_obs)


In [8]:
def write_chunk(start, stop, filename):
  pd.DataFrame({
      'content': raw_datasets['train']['content'][start:stop], 
      'summary': raw_datasets['train']['summary'][start:stop], 
      'subreddit': raw_datasets['train']['subreddit'][start:stop]})\
    .to_parquet(filename)


In [9]:
%%time
#write_path ="/content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/reddit_parquet/"
write_path ="/content/gdrive/MyDrive/"

for i in range(len(subset_chunk)-1):
  print(f"{i+1} of {len(subset_chunk)}")
  filename = write_path + 'reddit_data_0' + str(i) + '.parquet'
  write_chunk(subset_chunk[i], subset_chunk[i+1], filename=filename)



1 of 9
2 of 9
3 of 9
4 of 9
5 of 9
6 of 9
7 of 9
8 of 9
CPU times: user 3min 35s, sys: 1min 31s, total: 5min 6s
Wall time: 6min 57s


In [8]:
%cd "/content/gdrive/MyDrive"

/content/gdrive/MyDrive


In [18]:
#Get the first parquet
import pandas as pd
df = pd.read_parquet('reddit_data_01.parquet')
str(df['content'])

"0         Two posts and someone already linked that guid...\n1         You run an email server at your house.  Total ...\n2         Another funny dmt story:  My friend and i were...\n3         I have been using the patch for five years. I ...\n4         Dude, are you people still talking about this?...\n                                ...                        \n499995    In HTML (which is the skeleton of web pages), ...\n499996    No offense, but this assumption makes you one ...\n499997    meh, yeah i'll accept that. It helps in that c...\n499998    Is it still going on? If so I would recommend ...\n499999    I'm getting pain in my left knee, on the outsi...\nName: content, Length: 500000, dtype: object"

In [54]:
#Create lists of the content and summary columns
cont_list = df['content'].tolist()
summ_list_small = df['summary'][:100].tolist()
cont_list_small = df['content'][:100].tolist()
summ_list = df['summary'].tolist()


In [2]:
# all imports
import os
import re
import tensorflow as tf
import numpy as np 
import pandas as pd
from pprint import pprint
from tqdm import tqdm

import tensorflow_datasets as tfds 
#from transformers import TFAutoModelForSequenceClassification
#from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration

In [12]:
!pip install -q transformers

In [13]:
!pip install -q sentencepiece

In [14]:
!pip install -q datasets

#Fine-tune Pegasus Model

In [14]:
## Tokenize dataset to fine-tune model (https://huggingface.co/transformers/v4.9.2/training.html)
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

In [68]:
##Is this step necessary?
#import nltk
#nltk.download('punkt')

#results = []
#for sentence in cont_list_small:
#    sentence_results = []
#    for s in sentence:
#        sentence_results.append(nltk.word_tokenize(sentence))
#    results.append(sentence_results)

In [None]:
##Pretrain and compile model

pega_model = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

pega_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

pega_model.fit(cont_list, validation_data=summ_list, epochs=3)

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

All the layers of TFPegasusForConditionalGeneration were initialized from the model checkpoint at google/pegasus-xsum.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFPegasusForConditionalGeneration for predictions without further training.


In [38]:
pega_model.save_pretrained("my_pega_model")

In [31]:
pega_model.summary()

Model: "tf_pegasus_for_conditional_generation_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFPegasusMainLayer)  multiple                  569748480 
                                                                 
Total params: 569,844,583
Trainable params: 569,748,480
Non-trainable params: 96,103
_________________________________________________________________


#Test Model

In [61]:
inputs = tokenizer(np.array2string(np.array(cont_list[0])), max_length=1024, truncation=True, return_tensors="tf")
inputs['input_ids'].shape

TensorShape([1, 295])

In [67]:
# Generate Summary
summary_ids = pega_model.generate(inputs["input_ids"], 
                              num_beams=4,
                              no_repeat_ngram_size=2,
                              min_length=20,
                              max_length=50)

In [66]:
print("content:")
print(cont_list[0])

print("\n\true:")
print(summ_list[0])

print("\n\nprediction:")
pprint(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0], compact=True)

content:
Two posts and someone already linked that guide? I've gotta get faster! 
 Completely agree with the above, spending a little extra goes a long way with DVD media. I've never had Taiyo Yuden (I'm in the UK and they aren't easy to get hold of) but I have had very good experiences with Verbatim and Sony. The reliability of the Sony discs versus the Infiniti discs I was using before actually paid for itself in how many burn failures I had (none in Sony's case). 
 Some notes on DVD media - there are a lot of factors at play when determining reliability or even what discs you have. Some drives don't like certain manufacturer's discs, so don't assume what works for you will work for other people, or vice-versa. The reliability on the linked guide is determined from a lot of samples. 
 The other point, which is important to bear in mind as you read the tables in that guide, is that most manufacturer's outsource manufacturing. You could buy two spindles with the same brand on the label