In [1]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score nltk
!pip install pyarrow
# !pip install -q sentencepiece
# !pip install rouge-score # google package version

clear_output()

In [2]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

In [None]:
# Download and prepare the TFDS build of the Reddit dataset (later calls reuse the prepared copy).
data = tfds.load('reddit')

[1mDownloading and preparing dataset reddit/1.0.0 (download: 2.93 GiB, generated: 18.09 GiB, total: 21.01 GiB) to /root/tensorflow_datasets/reddit/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]






0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/reddit/1.0.0.incompleteP7KVZC/reddit-train.tfrecord


  0%|          | 0/3848330 [00:00<?, ? examples/s]

[1mDataset reddit downloaded and prepared to /root/tensorflow_datasets/reddit/1.0.0. Subsequent calls will reuse this data.[0m


In [7]:
import os
from google.colab import drive

# Directory where TFDS prepared the Reddit dataset.
path = '/root/tensorflow_datasets/reddit/1.0.0/'

# Mount Google Drive so the data can be saved to a Drive folder.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Recursively copy the whole TFDS data directory into the project's data folder under /content/gdrive.
!cp -r /root/tensorflow_datasets /content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/

In [None]:
# If the data is re-downloaded each session it lives on Colab's temporary disk,
# which lasts about 12 hours on the free tier.
path = '/root/tensorflow_datasets/reddit/1.0.0/'
os.chdir(path)

# Separate the training record shards from every other file in the directory.
file_names = os.listdir(path)
metadata_files = [i for i in file_names if re.search('^(?!reddit-train)', i)]
train_records = [i for i in file_names if re.search('^reddit-train', i)]

# define train/test split
# The shard names must be a NumPy array for the boolean masks below to work;
# sorting makes the split independent of os.listdir's arbitrary ordering.
data_files = np.array(sorted(train_records))
# Characters 22-26 are read as a numeric shard index; every fifth shard goes to test.
# NOTE(review): assumes names shaped like 'reddit-train.tfrecord-NNNNN-of-MMMMM' — confirm.
split_ind = np.array([int(x[22:27]) % 5 for x in data_files])
test = data_files[split_ind == 0]
train = data_files[split_ind != 0]

# Download the Hugging Face `datasets` (PyTorch) version

In [None]:
# Needs re-running about every 12 hours; the download takes roughly 20 minutes.
start = time.time()
raw_datasets = load_dataset("reddit")

# Report how long the load took, in minutes.
elapsed_minutes = (time.time() - start)/60
print(f"{elapsed_minutes} minutes elapsed")

In [None]:
# List the contents of the Hugging Face cache directory for the reddit dataset.
!ls /root/.cache/huggingface/datasets/reddit/default/1.0.0/98ba5abea674d3178f7588aa6518a5510dc0c6fa8176d9653a3546d5afcb3969/
# Listing noted by the author (presumably from an earlier run):
# cache-67e8a944275bfaa7.arrow  dataset_info.json
# cache-79ae0f34bc57c861.arrow  reddit-train.arrow

# Download and save as parquet

In [3]:
# Time the Hugging Face download/preparation of the reddit dataset.
start = time.time()
raw_datasets = load_dataset("reddit")

elapsed_minutes = (time.time() - start)/60
print(f"{elapsed_minutes} minutes elapsed")

Downloading builder script:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset reddit/default (download: 2.93 GiB, generated: 17.64 GiB, post-processed: Unknown size, total: 20.57 GiB) to /root/.cache/huggingface/datasets/reddit/default/1.0.0/98ba5abea674d3178f7588aa6518a5510dc0c6fa8176d9653a3546d5afcb3969...


Downloading data:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3848330 [00:00<?, ? examples/s]

Dataset reddit downloaded and prepared to /root/.cache/huggingface/datasets/reddit/default/1.0.0/98ba5abea674d3178f7588aa6518a5510dc0c6fa8176d9653a3546d5afcb3969. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

20.439121941725414 minutes elapsed


In [10]:
# Slice the training split into fixed-size chunks so each can be saved separately.
CHUNK_SIZE = 500000  # rows per chunk

# len() of the split is its row count; this avoids pulling the entire
# 'subreddit' column out of the dataset just to count it.
total_obs = len(raw_datasets['train'])

# Chunk boundaries 0, CHUNK_SIZE, 2*CHUNK_SIZE, ..., total_obs.
# Chunk i covers rows [subset_chunk[i], subset_chunk[i+1]).
subset_chunk = np.arange(0, total_obs, step=CHUNK_SIZE)
subset_chunk = np.append(subset_chunk, total_obs)

In [17]:
from google.colab import drive

# Drive folder that receives the parquet files.
write_path = "/content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/reddit_parquet/"

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [20]:
def write_chunk(start, stop, filename, dataset=None):
  """Write rows [start, stop) of a dataset to a parquet file.

  Only the 'content', 'summary' and 'subreddit' columns are written.

  Args:
    start: Index of the first row to include.
    stop: Index one past the last row to include.
    filename: Destination path, passed to DataFrame.to_parquet.
    dataset: Optional source; slicing it must return a mapping from column
      name to the values of that slice. Defaults to raw_datasets['train'].
  """
  if dataset is None:
    dataset = raw_datasets['train']
  # Slice the dataset once so only the requested rows are pulled out, instead
  # of extracting each full column and slicing it afterwards.
  rows = dataset[start:stop]
  columns = ('content', 'summary', 'subreddit')
  pd.DataFrame({name: rows[name] for name in columns}).to_parquet(filename)

In [21]:
%%time
write_path ="/content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/reddit_parquet/"

for i in range(len(subset_chunk)-1):
  print(f"{i+1} of {len(subset_chunk)}")
  filename = write_path + 'reddit_data_0' + str(i) + '.parquet'
  write_chunk(subset_chunk[i], subset_chunk[i+1], filename=filename)

1 of 9
2 of 9
3 of 9
4 of 9
5 of 9
6 of 9
7 of 9
8 of 9
CPU times: user 3min 20s, sys: 1min 22s, total: 4min 43s
Wall time: 7min 23s


In [16]:
%%time
# convert to pandas
pd.DataFrame({
    'content': raw_datasets['train']['content'][subset_chunk[0]:subset_chunk[1]], 
    'summary': raw_datasets['train']['summary'][subset_chunk[0]:subset_chunk[1]], 
    'subreddit': raw_datasets['train']['subreddit'][subset_chunk[0]:subset_chunk[1]]})\
    .to_parquet(write_path + 'reddit_data_01.parquet')

CPU times: user 18.5 s, sys: 9.07 s, total: 27.5 s
Wall time: 46.2 s


In [18]:
%%time
# write chunk as parquet
# NOTE(review): `pd_data` is not assigned in any cell visible above this one —
# confirm where it is created before relying on Restart & Run All.
pd_data.to_parquet(write_path + 'reddit_data_01.parquet')

CPU times: user 3.98 s, sys: 1.13 s, total: 5.12 s
Wall time: 7.15 s


In [19]:
# List the files currently in the Drive parquet output folder.
!ls /content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/reddit_parquet/

reddit_data_01.parquet


In [5]:
# Shuffle every row with a fixed seed, renumber the index, and compute the
# row position that separates the first 80% (train) from the rest (test).
pd_data = (
    pd_data
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
split_point = int(len(pd_data) * .8)
# The split itself is applied where the files are written:
# train = pd_data[:split_point]
# test = pd_data[split_point:]

In [6]:
from google.colab import drive

# Drive folder that receives the train/test parquet files.
write_path = "/content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/reddit_parquet/"

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%%time
# write train as parquet
pd_data[:split_point].to_parquet(write_path + 'reddit_train.parquet')

In [None]:
%%time
# write test as parquet
pd_data[split_point:].to_parquet(write_path + 'reddit_test.parquet')