In [7]:
# all imports
import tensorflow_datasets as tfds 
import os
import re
import tensorflow as tf
import numpy as np 
import pandas as pd

In [2]:
# Download and prepare the reddit dataset through TensorFlow Datasets.
# Dataset card: https://github.com/huggingface/datasets/tree/main/datasets/reddit
#
# Notes:
# - the download takes roughly 45 minutes on my computer
# - no train/test/val splits are predetermined; the user must split the data
# - about 3.8M observations (posts)
# - content averages 270 words, the summary averages 28 words
dataset_name = 'reddit'
data = tfds.load(name=dataset_name)

[1mDownloading and preparing dataset reddit/1.0.0 (download: 2.93 GiB, generated: 18.09 GiB, total: 21.01 GiB) to /root/tensorflow_datasets/reddit/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]






0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/reddit/1.0.0.incompleteEPXMV3/reddit-train.tfrecord


  0%|          | 0/3848330 [00:00<?, ? examples/s]

[1mDataset reddit downloaded and prepared to /root/tensorflow_datasets/reddit/1.0.0. Subsequent calls will reuse this data.[0m


In [15]:
# The prepared dataset sits on disk (not in memory) in this directory.
path = '/root/tensorflow_datasets/reddit/1.0.0/'
# The file names collected below are bare names relative to `path`, so the
# working directory is switched for the later cells that open them.
os.chdir(path)

# os.listdir returns entries in arbitrary order; sort them so the file lists,
# and the subset sliced off below, are identical on every run.
file_names = sorted(os.listdir(path))
train_prefix = 'reddit-train'
metadata_files = [name for name in file_names if not name.startswith(train_prefix)]
train_records = [name for name in file_names if name.startswith(train_prefix)]

# Small subset (the first 10 record files) for inspecting.
train_records_small = train_records[:10]

In [17]:
# The FeaturesDict() layout comes from the TensorFlow catalog page for this
# dataset: https://www.tensorflow.org/datasets/catalog/reddit
# Reading external TFRecords is described in this guide:
# https://www.tensorflow.org/datasets/external_tfrecord
# (In the end this spec was never needed in exactly this form.)

# These are the keys of each record in the data files.
feature_keys = [
    'author',
    'body',
    'content',
    'id',
    'normalizedBody',
    'subreddit',
    'subreddit_id',
    'summary',
]
# Every key is declared as a string feature.
features = tfds.features.FeaturesDict({key: tf.string for key in feature_keys})

# Reading in the data in several ways

In [19]:
# Peek at the first 10 raw (still serialized) records.
raw_dataset = tf.data.TFRecordDataset(train_records_small)
first_raw_records = raw_dataset.take(10)
for serialized in first_raw_records:
  print(repr(serialized))

<tf.Tensor: shape=(), dtype=string, numpy=b'\n\x99]\n\x17\n\x06author\x12\r\n\x0b\n\tmetnavman\n\xa8\x1a\n\x07content\x12\x9c\x1a\n\x99\x1a\n\x96\x1aGreetings all! I\'m going to tell you about Raidkraft! Raidkraft is a Tekkit server that was fired up earlier this year by the folks who run the Raidkraft "vanilla minecraft" server. This 100-slot, UK-based server is run by a group of very serious and dedicated individuals who strive to make your tekkit experience an awesome one! Now then, let\'s get down to the nitty-gritty! \n The Awesome stuff \n \n Mutli-world Towny server. \n Worlds are divided into a Mining, Nether, and Town world. \n Mining/Nether world is completely FFA PvP/raiding/griefing(couple rules, but not many). \n Town world is completely protected and designed to let players build in a plot-system. If you\'ve played with Towny, you\'ll get the idea. If not, swing by, and we\'ll teach you! \n Multiple towns support a growing player economy, and majorly expanded trade and ad

In [23]:
# Decode raw records into tf.train.Example protos and print them
# (print() renders the proto in protobuf text format, not JSON).
raw_dataset = tf.data.TFRecordDataset(train_records_small)

# take(1) already limits the loop to a single record, so no manual counter or
# break is needed (the previous `if i > 1: break` could never trigger).
for raw_record in raw_dataset.take(1):
  example = tf.train.Example()
  example.ParseFromString(raw_record.numpy())
  print(example)

features {
  feature {
    key: "author"
    value {
      bytes_list {
        value: "metnavman"
      }
    }
  }
  feature {
    key: "body"
    value {
      bytes_list {
        value: "Greetings all! I\'m going to tell you about Raidkraft! Raidkraft is a Tekkit server that was fired up earlier this year by the folks who run the Raidkraft \"vanilla minecraft\" server. This 100-slot, UK-based server is run by a group of very serious and dedicated individuals who strive to make your tekkit experience an awesome one! Now then, let\'s get down to the nitty-gritty!\n\n\n**The Awesome stuff**\n\n* Mutli-world Towny server.\n* Worlds are divided into a Mining, Nether, and Town world.\n* Mining/Nether world is completely FFA PvP/raiding/griefing(couple rules, but not many).\n* Town world is completely protected and designed to let players build in a plot-system. If you\'ve played with Towny, you\'ll get the idea. If not, swing by, and we\'ll teach you!\n* Multiple towns support a growing

In [25]:
# Parse each record against an explicit feature spec.
# Of the approaches tried so far, this seems like the best one.
def parse_file(serialized_example):
  """Parse one serialized record using a fixed, all-string feature spec.

  Args:
    serialized_example: the serialized record to parse; it is handed
      unchanged to `tf.io.parse_single_example`.

  Returns:
    The result of `tf.io.parse_single_example`, keyed by feature name.
  """
  string_fields = (
      'author',
      'body',
      'content',
      'id',
      'normalizedBody',
      'subreddit',
      'subreddit_id',
      'summary',
  )
  # Every field is declared as a single (shape []) string value.
  feature_spec = {
      field: tf.io.FixedLenFeature([], tf.string) for field in string_fields
  }
  return tf.io.parse_single_example(serialized_example, feature_spec)

# Build the parsed dataset from the small subset of record files.
dataset = tf.data.TFRecordDataset(train_records_small).map(parse_file)

# Iterating over the full dataset prints far too many rows, so only the
# first 11 records (positions 0 through 10) are shown.
shown_fields = ('author', 'body', 'content', 'summary', 'subreddit')
for record in dataset.take(11):
  print(tuple(record[field] for field in shown_fields))

(<tf.Tensor: shape=(), dtype=string, numpy=b'metnavman'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Greetings all! I\'m going to tell you about Raidkraft! Raidkraft is a Tekkit server that was fired up earlier this year by the folks who run the Raidkraft "vanilla minecraft" server. This 100-slot, UK-based server is run by a group of very serious and dedicated individuals who strive to make your tekkit experience an awesome one! Now then, let\'s get down to the nitty-gritty!\n\n\n**The Awesome stuff**\n\n* Mutli-world Towny server.\n* Worlds are divided into a Mining, Nether, and Town world.\n* Mining/Nether world is completely FFA PvP/raiding/griefing(couple rules, but not many).\n* Town world is completely protected and designed to let players build in a plot-system. If you\'ve played with Towny, you\'ll get the idea. If not, swing by, and we\'ll teach you!\n* Multiple towns support a growing player economy, and majorly expanded trade and admin shops have many things to offer growin

In [27]:
# Alternative way to read the data: pull one large batch of raw records and
# index into it. This seems messy.
dataset = tf.data.TFRecordDataset(train_records_small)
batched = dataset.batch(30000)
output = next(iter(batched))
output[1]

<tf.Tensor: shape=(), dtype=string, numpy=b"\n\x8a \n\x11\n\x02id\x12\x0b\n\t\n\x07c3r23yx\n\xba\n\n\x0enormalizedBody\x12\xa7\n\n\xa4\n\n\xa1\nUnless you're investing in multiple top of the line gpu's I'd say buy now. Might just be me personally. but there is always a new line of something coming out that you could hold out to buy, but by then another part of your build is being replaced and its just not worth it. Enjoy your pc now, and if you really care that much about potential price drops, buy it over the course of a couple weeks hitting up every really good online deals you can. \n Granted, I am excited for the ivy bridge line of processors to be released this year, the accompanying but still useless/excessive speeds of pci-e 3.0, and nvidias probably over hyped but undoubtedly awesome gtx 600 series. \n Alright so I just bought my components over the last week and a half planning to have mine together by the weekend and may have convinced myself this wasn't the absolute right wa

In [28]:
# Gather the subreddit value of every record in the small subset.
dataset = tf.data.TFRecordDataset(train_records_small).map(parse_file)
subreddits = [record['subreddit'] for record in dataset]

In [29]:
# Report how many subreddit values were collected, then show the first ten.
count_message = 'Number of items {}'.format(len(subreddits))
print(count_message)
subreddits[:10]

Number of items 150325


[<tf.Tensor: shape=(), dtype=string, numpy=b'tekkitservers'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'buildapc'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'FancyFollicles'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'wow'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'ableton'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Fitness'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'AskReddit'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'AskReddit'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'AskReddit'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Entrepreneur'>]

In [36]:
# Tally how often each distinct subreddit occurs in the collected list.
x = tf.unique_with_counts(subreddits, out_idx=tf.dtypes.int32, name='subreddit_distinct_counts')
# Element 0 supplies the subreddit names and element 2 their frequencies.
unique_subreddits, occurrence_counts = x[0], x[2]

# Element-wise bytes -> str conversion for the subreddit names.
decode_string = np.vectorize(lambda raw: raw.decode('utf-8'))

subreddit_df = pd.DataFrame({
    "subreddit": decode_string(unique_subreddits.numpy()),
    "freq": occurrence_counts.numpy()
})
# Most frequent subreddits first.
subreddit_df.sort_values('freq', ascending=False)

Unnamed: 0,subreddit,freq
6,AskReddit,22822
18,relationships,13719
24,leagueoflegends,4300
17,tifu,2074
66,relationship_advice,1995
...,...,...
4556,ericprydz,1
4557,ios7,1
4558,justvent,1
4559,nobite,1
