In [1]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score nltk
!pip install pyarrow
# !pip install -q sentencepiece
# !pip install rouge-score # google package version

clear_output()

In [2]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

import nltk

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

The next cell runs out of memory when reading in the full dataset.

In [None]:
start = time.time()

from google.colab import drive
drive.mount('/content/gdrive')
data_dir = "/content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/tensorflow_datasets/reddit/1.0.0"
# The shard names collected below are relative, so later readers rely on this
# working directory.
os.chdir(data_dir)

# define train/test split from the data. but don't read the data yet.
# os.listdir returns entries in arbitrary order; sort them so the positional
# train/valid cut and the downsampling prefixes below are reproducible.
file_names = sorted(os.listdir(data_dir))
metadata_files = [i for i in file_names if re.search('^(?!reddit-train)', i)]

data_files = np.array([i for i in file_names if re.search('^reddit-train', i)])
# Characters 22..26 of the file name are read as an integer; every file whose
# number is divisible by 5 becomes test data.
# NOTE(review): presumably this is the 5-digit shard index that follows
# "reddit-train.tfrecord-" -- confirm against the actual file names.
split_ind = np.array([int(x[22:27]) % 5 for x in data_files])
test_files = data_files[split_ind == 0]
train_files = data_files[split_ind != 0]

# split further into train/valid: first 80% of the remaining files train,
# the rest validate
valid_split = int(np.ceil(len(train_files) * .8))
valid_files = train_files[valid_split:]
train_files = train_files[:valid_split]

# downsample to speed things up (keeps the leading 5% of each file list)
downsample = False
if downsample:
  subset_size = .05
  train_files = train_files[:int(np.ceil(len(train_files) * subset_size))]
  test_files = test_files[:int(np.ceil(len(test_files) * subset_size))]
  valid_files = valid_files[:int(np.ceil(len(valid_files) * subset_size))]

# parse files
def parse_file(serialized_example, return_xy=False):
  """Parse one serialized example into a dict of scalar string features.

  Only the 'content', 'subreddit' and 'summary' fields are requested; the
  other fields the file listed as disabled (author, body, id, normalizedBody,
  subreddit_id) are not parsed.

  Args:
    serialized_example: a single serialized record, as passed by
      `Dataset.map`.
    return_xy: accepted for compatibility; it is not used by this function.

  Returns:
    The result of `tf.io.parse_single_example` for the three fields above.
  """
  wanted_fields = ('content', 'subreddit', 'summary')
  feature_spec = {
      field: tf.io.FixedLenFeature([], tf.string) for field in wanted_fields
  }
  return tf.io.parse_single_example(serialized_example, feature_spec)

# One TFRecord dataset per split, each record run through parse_file.
train_tf, valid_tf, test_tf = [
    tf.data.TFRecordDataset(shard_files).map(parse_file)
    for shard_files in (train_files, valid_files, test_files)
]

# Element-wise UTF-8 decoder for bytes values.
decode_string = np.vectorize(lambda raw: raw.decode('utf-8'))

# load data into memory into dictionary. 
# figure out how to bypass this
## we're doing tf -> np -> pt. Want to go tf -> pt
def tf_to_dict(tf_item):
  """Materialize parsed examples as a dict of Python string lists.

  Args:
    tf_item: iterable of parsed examples. Each example is indexed by
      'content', 'summary' and 'subreddit', and each value exposes `.numpy()`
      returning UTF-8 encoded bytes.

  Returns:
    dict with the keys 'content', 'summary' and 'subreddit'; each value is a
    list of `str` with one entry per example, in iteration order.

  Raises:
    UnicodeDecodeError: if a value is not valid UTF-8.
  """
  columns = ('content', 'summary', 'subreddit')
  dataset_dict = {column: [] for column in columns}
  for item in tqdm(tf_item):
    for column in columns:
      # Decode the bytes directly. Going through np.vectorize and str() of a
      # 0-d array yields the same text but does far more work per record, and
      # this loop runs once for every example in the split.
      dataset_dict[column].append(item[column].numpy().decode('utf-8'))
  return dataset_dict
# def tf_to_dict(tf_item):
#   dataset_dict = {'content': [], 'summary': [], 'subreddit': []}
#   for item in tf_item: 
#     dataset_dict['content'].append(item['content'])
#     dataset_dict['summary'].append(item['summary'])
#     dataset_dict['subreddit'].append(item['subreddit'])
#   return dataset_dict

# Materialize every split and wrap it in a `datasets.Dataset`, collected in a DatasetDict.
all_data = DatasetDict({
    split_name: Dataset.from_dict(tf_to_dict(split_tf))
    for split_name, split_tf in (
        ('train', train_tf),
        ('valid', valid_tf),
        ('test', test_tf),
    )
})

# Report wall-clock time since `start` was set.
seconds_elapsed = time.time() - start
print(f"{seconds_elapsed/60} minutes elapsed")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


2465335it [25:10, 1632.67it/s]


Re-download the dataset with `load_dataset`, because the cell above crashes.

In [None]:
start = time.time()
from datasets import Dataset, load_metric, load_dataset

# Load the "reddit" dataset and report how many minutes the call took.
raw_datasets = load_dataset("reddit")
minutes_elapsed = (time.time() - start) / 60
print(f"{minutes_elapsed} minutes elapsed")

In [39]:
# Index the row first: column-first access (['body'][1]) can load the whole
# column just to read a single value.
raw_datasets['train'][1]['body']

'Art is about the hardest thing to categorize in terms of good and bad. To consider one work or artist as dominate over another comes down to personal opinion. Sure some things maybe blatantly better than other works, but it ultimately lies with the individual. I personally enjoy the work of "street artists" (using quotations not to be sarcastic, but mainly because this is in a different category than graffiti and since my background is not in art I don\'t know what the "proper" term is , if there is one), but I do see where you are coming from. CLET tends to use the same images continuously (to a point where one could say "Is this it?") as do most street artists (I do think this term is thrown around a lot more than it should be, I agree with you there) and it can be annoying.\n\ntl;dr: Personal opinions \'n shit.'

In [40]:
# Index the row first: column-first access (['content'][1]) can load the whole
# column just to read a single value.
raw_datasets['train'][1]['content']

'Art is about the hardest thing to categorize in terms of good and bad. To consider one work or artist as dominate over another comes down to personal opinion. Sure some things maybe blatantly better than other works, but it ultimately lies with the individual. I personally enjoy the work of "street artists" (using quotations not to be sarcastic, but mainly because this is in a different category than graffiti and since my background is not in art I don\'t know what the "proper" term is , if there is one), but I do see where you are coming from. CLET tends to use the same images continuously (to a point where one could say "Is this it?") as do most street artists (I do think this term is thrown around a lot more than it should be, I agree with you there) and it can be annoying.'

In [41]:
# Index the row first: column-first access (['subreddit'][1]) can load the
# whole column just to read a single value.
raw_datasets['train'][1]['subreddit']

'funny'

In [42]:
# Index the row first: column-first access (['summary'][1]) can load the whole
# column just to read a single value.
raw_datasets['train'][1]['summary']

"Personal opinions 'n shit."

In [44]:
# Copy the three columns used downstream from the train split into a DataFrame.
train_split = raw_datasets['train']
pd_data = pd.DataFrame({
    column: train_split[column]
    for column in ('content', 'summary', 'subreddit')
})

In [45]:
# Preview the assembled frame (columns: content, summary, subreddit).
pd_data

Unnamed: 0,content,summary,subreddit
0,I think it should be fixed on either UTC stand...,Shifting seasonal time is no longer worth it.,math
1,Art is about the hardest thing to categorize i...,Personal opinions 'n shit.,funny
2,Ask me what I think about the Wall Street Jour...,insults and slack ass insight. \n Wall Street ...,Borderlands
3,"In Mechwarrior Online, I have begun to use a m...","Yes, Joysticks in modern games have apparently...",gamingpc
4,"You are talking about the Charsi imbue, right?...",Class only items dropped from high-lvl monsters.,Diablo
...,...,...,...
3848325,I've finally gotten around to initiating plans...,"hate my own feet, and don't know how to give a...",sex
3848326,"Long time lurker, first time poster here. I'm ...","want to win cash prize, need answer for radio ...",AskReddit
3848327,"Long time lurker, first time poster here. I'm ...","want cash prize, need answer for radio contest...",AskReddit
3848328,My xbox hasn't been in the best of health rece...,my xbox has died only a few days before launch...,battlefield3


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
# NOTE: this rebinds `data_dir`, which an earlier cell pointed at the TFRecord folder.
data_dir = "/content/gdrive/MyDrive/Classes/W266_NLP/w266_reddit_summarization/data/reddit_parquet/"

# Create the output folder if it is missing so the write below does not fail
# on a fresh Drive.
os.makedirs(data_dir, exist_ok=True)

# convert to parquet
import pyarrow as pa
table = pa.Table.from_pandas(pd_data)

# write a single parquet file
import pyarrow.parquet as pq
pq.write_table(table, os.path.join(data_dir, 'reddit_data.parquet'))

Mounted at /content/gdrive


In [34]:
# Row-first lookup of the first example's subreddit. The original repeated the
# same statement twice and indexed the column first.
raw_datasets['train'][0]['subreddit']

'math'

In [18]:
subreddit_df = pd.DataFrame({'subreddit': raw_datasets['train']['subreddit']})
# Posts per subreddit, most frequent first, with columns ['subreddit', 'N'].
# The column names are set explicitly because the name pandas gives the counts
# depends on its version (0 / unnamed before 2.0, 'count' from 2.0 on), so
# renaming column 0 does not reliably produce 'N'.
subreddit_counts = (
    subreddit_df['subreddit']
    .value_counts()
    .rename_axis('subreddit')
    .reset_index(name='N')
)

In [30]:
def group_subreddit(subreddit):
  """Map a subreddit name to a coarse category label.

  Exact (case-sensitive) name lists are checked first; names containing
  'animals' or 'news' (case-insensitive) are matched by substring. Anything
  that matches no rule is labelled 'other'.

  Args:
    subreddit: subreddit name as a string.

  Returns:
    One of 'advice', 'relationships', 'gaming', 'story', 'religion',
    'animals', 'comedy', 'news' or 'other'.
  """
  # Exact-name rules, evaluated in order before any substring rule.
  exact_rules = (
      (('AskReddit', 'AskMen'), 'advice'),
      (('relationships', 'relationship_advice'), 'relationships'),
      (('leagueoflegends', 'gaming', 'DotA2'), 'gaming'),
      (('tifu', 'TwoXChromosomes', 'offmychest'), 'story'),
      (('atheism', 'religion'), 'religion'),
  )
  for names, category in exact_rules:
    if subreddit in names:
      return category

  if re.search('animals', subreddit.lower()):
    return 'animals'
  if subreddit in ('funny',):
    return 'comedy'
  if subreddit in ('politics',) or re.search('news', subreddit.lower()):
    return 'news'
  return 'other'


# Label every subreddit row with its coarse category.
subreddit_names = subreddit_counts['subreddit']
subreddit_counts['category'] = subreddit_names.apply(group_subreddit)

In [31]:
from IPython.display import display, HTML

# Render the table inside a fixed-height, scrollable <div> so the scrollbar
# sits right next to the DataFrame.
scroll_open = "<div style='height: 200px; overflow: auto; width: fit-content'>"
scroll_close = "</div>"
display(HTML("".join([scroll_open, subreddit_counts.to_html(), scroll_close])))

Output hidden; open in https://colab.research.google.com to view.

In [26]:
# Sanity check: the 'news' pattern is found inside 'worldnews'.
news_match = re.search('news', 'worldnews')
if news_match:
  print('yes')

yes
