# Steps to save data as parquet

First, install the dependencies.

In [1]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score nltk
!pip install pyarrow
# !pip install -q sentencepiece
# !pip install rouge-score # google package version

clear_output()

In [2]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

Mount Google Drive (customize `write_path` to point at your own save location).

In [7]:
from google.colab import drive

# Mount Google Drive so the parquet files persist outside the Colab runtime.
drive.mount('/content/gdrive')

# Directory the parquet chunks are written to; the trailing slash matters
# because file names are appended to this string directly.
write_path ="/content/gdrive/MyDrive/w266_reddit_summarization/data/reddit_parquet/"

# Create the target directory up front so the parquet writes do not fail on a
# Drive where the folder does not exist yet.
os.makedirs(write_path, exist_ok=True)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Download data

In [4]:
# Download (or reuse the cached copy of) the "reddit" dataset and report how
# long the call took, in minutes.
start = time.time()
raw_datasets = load_dataset("reddit")
elapsed_minutes = (time.time() - start) / 60
print(f"{elapsed_minutes} minutes elapsed")

Downloading builder script:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset reddit/default (download: 2.93 GiB, generated: 17.64 GiB, post-processed: Unknown size, total: 20.57 GiB) to /root/.cache/huggingface/datasets/reddit/default/1.0.0/98ba5abea674d3178f7588aa6518a5510dc0c6fa8176d9653a3546d5afcb3969...


Downloading data:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3848330 [00:00<?, ? examples/s]

Dataset reddit downloaded and prepared to /root/.cache/huggingface/datasets/reddit/default/1.0.0/98ba5abea674d3178f7588aa6518a5510dc0c6fa8176d9653a3546d5afcb3969. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

15.107490309079488 minutes elapsed


Slice the data into chunks and write each piece separately. The data had to be split up; otherwise, writing it all at once exhausts the available memory.

In [5]:
# Slice the train split into chunks of CHUNK_SIZE rows so that each chunk can
# be written to its own file.
CHUNK_SIZE = 500000

# len() on the split returns its row count directly, without first pulling an
# entire column out of the dataset just to measure it.
total_obs = len(raw_datasets['train'])

# Chunk boundaries: 0, CHUNK_SIZE, 2*CHUNK_SIZE, ... with total_obs appended
# as the final stop, so consecutive pairs give each chunk's [start, stop).
subset_chunk = np.arange(0, total_obs, step=CHUNK_SIZE)
subset_chunk = np.append(subset_chunk, total_obs)

In [6]:
def write_chunk(start, stop, filename):
  """Write rows ``[start, stop)`` of the train split to a parquet file.

  Reads from the module-level ``raw_datasets`` and keeps only the
  ``content``, ``summary`` and ``subreddit`` columns.

  Args:
    start: Index of the first row to write (inclusive).
    stop: Index one past the last row to write (exclusive).
    filename: Destination path of the parquet file.
  """
  # Slice the rows first so that only this chunk is pulled out of the dataset.
  # Indexing a column and slicing afterwards can load the whole column for
  # every chunk. int() converts NumPy integer boundaries to plain Python ints.
  rows = raw_datasets['train'][int(start):int(stop)]
  pd.DataFrame({
      'content': rows['content'],
      'summary': rows['summary'],
      'subreddit': rows['subreddit']})\
    .to_parquet(filename)

In [8]:
%%time

for i in range(len(subset_chunk)-1):
  print(f"{i+1} of {len(subset_chunk)}")
  filename = write_path + 'reddit_data_0' + str(i) + '.parquet'
  write_chunk(subset_chunk[i], subset_chunk[i+1], filename=filename)

1 of 9
2 of 9
3 of 9
4 of 9
5 of 9
6 of 9
7 of 9
8 of 9
CPU times: user 2min 45s, sys: 44.1 s, total: 3min 29s
Wall time: 3min 48s


The parquet files are now saved under `write_path`.