In [1]:
import os

# When a sibling "notebooks" directory is visible from the parent directory,
# step up one level so that relative paths used below resolve from there.
parent_has_notebooks_dir = os.path.isdir("../notebooks/")
if parent_has_notebooks_dir:
    os.chdir("..")

In [2]:
import re
import json

import spacy
from tqdm import tqdm
import pandas as pd
import numpy as np
from collections import Counter

from badseeds import preprocess

In [3]:
# Path to the JSON config file that contains the paths to the datasets.
# It is relative to the current working directory; change it if your config
# file lives somewhere else.
CONFIG_PATH = "./config.json"

In [4]:
# Load the configuration. JSON is UTF-8 by specification, so the encoding is
# pinned explicitly instead of relying on the platform's default encoding.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

## Preprocessing

If you have already preprocessed the datasets and saved the results, preprocessing can be skipped and the preprocessed results are read from disk (default). Otherwise, change the `PREPROC_NOW` flag to `True` to preprocess the data now. Note that preprocessing will take a long time.


In [9]:
# Set to True to run the (slow) preprocessing now; it saves its results to disk.
PREPROC_NOW = False

# Directory with the preprocessed files. If your paths are different, change
# this (and the file names below) accordingly.
pproc_data_path = os.path.join(config["preprocessed"]["dir_path"], "processed")

# One path per corpus, in the order expected by read_pproc_datasets.
NYT_PATH, WIKI_PATH, GRR_PATH, GRHB_PATH = (
    os.path.join(pproc_data_path, entry)
    for entry in (
        "nytimes_news_articles.bin",
        "wiki.train.tokens.bin",
        "romance",
        "history_biography",
    )
)

# Preprocess only on request, then always read the preprocessed results from disk.
if PREPROC_NOW:
    preprocess.preprocess_datasets()
pproc_data = preprocess.read_pproc_datasets(NYT_PATH, WIKI_PATH, GRR_PATH, GRHB_PATH)

reading nyt
reading wikitext
reading goodreads romance
Directory detected, reading and concatenating all containing files


100%|█████████████████████████████████████████████| 8/8 [00:01<00:00,  6.06it/s]


reading goodreads history/biography
Directory detected, reading and concatenating all containing files


100%|█████████████████████████████████████████████| 5/5 [00:00<00:00,  5.05it/s]


## Table 2

### Prepare

In [10]:
# Row labels: one row per corpus.
dataset_names = pd.Series(
    ["NYT", "WikiText", "Goodreads (Romance)", "Goodreads (History/Biography)"],
    name="Dataset",
)

# Statistics already known for the "theirs" columns, keyed by metric name.
# The dict order also fixes the column order of the table.
reported_stats = {
    "Total Documents": [8888, 28472, 197000, 136000],
    "Total Words": [7244457, 99197146, 24856924, 14324947],
    "Vocabulary Size": [162998, 546828, 214572, 163171],
    "Mean Document Length": [815, 3484, 126, 105],
}

# Build flat "<metric>_<source>" labels first, then split them on "_" into a
# two-level column index (metric, source).
flat_columns = [
    f"{metric}_{source}" for metric in reported_stats for source in ("theirs", "ours")
]
table_2 = pd.DataFrame(index=dataset_names, columns=flat_columns)
table_2.columns = table_2.columns.str.split("_", expand=True)

# Fill in the "theirs" half; the "ours" half stays empty until it is computed.
for metric, values in reported_stats.items():
    table_2[(metric, "theirs")] = values

In [11]:
# Preview the table scaffold; the "ours" columns are filled in further below.
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,theirs,ours,theirs,ours,theirs,ours,theirs,ours
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,,7244457,,162998,,815,
WikiText,28472,,99197146,,546828,,3484,
Goodreads (Romance),197000,,24856924,,214572,,126,
Goodreads (History/Biography),136000,,14324947,,163171,,105,


### Helper Class

In [12]:
class StatGetter:
    """Compute corpus statistics for one preprocessed dataset.

    Documents are obtained from ``preprocess.docbin_to_docs(docbin)``; each
    document is iterated token by token and only ``token.text`` is used.

    Args:
        docbin: the preprocessed dataset handed to ``preprocess.docbin_to_docs``.
        min_count: minimum number of occurrences a word needs in order to be
            kept by :meth:`filter_counter` (default 10).

    Attributes:
        counter: ``Counter`` mapping normalised word -> number of occurrences.
        docbin: the dataset passed to the constructor.
        length: number of documents; 0 until :meth:`get_total_documents` ran.
        min_count: the frequency threshold described above.
    """

    # Matches every character that is neither a word character nor whitespace.
    _NON_WORD = re.compile(r"[^\w\s]")

    def __init__(self, docbin, min_count=10):
        self.counter = Counter()
        self.docbin = docbin
        self.length = 0
        self.min_count = min_count
        # Set once the tokens have been counted. An empty counter alone is not
        # a reliable signal: filtering can legitimately remove every word.
        self._counted = False

    def _normalize(self, text):
        """Return ``text`` with all non-word, non-whitespace characters removed."""
        return self._NON_WORD.sub("", text)

    def _docs(self):
        """Return a progress-bar-wrapped iterator over all documents."""
        # ``or None``: do not show a bogus total of 0 when the number of
        # documents has not been computed yet.
        return tqdm(preprocess.docbin_to_docs(self.docbin), total=self.length or None)

    def _ensure_counted(self):
        """Count and filter the words unless word counts are already available."""
        # A non-empty counter is also accepted so that a counter assigned from
        # the outside keeps being used as-is.
        if not self._counted and len(self.counter) == 0:
            self.count_and_filter()

    def get_total_documents(self):
        """Count the documents, store the result in ``self.length`` and return it."""
        print("Computing total docs")
        self.length = sum(1 for _ in preprocess.docbin_to_docs(self.docbin))
        return self.length

    def count_tokens(self):
        """Count how often every normalised token text occurs in the dataset.

        The counts are rebuilt from scratch on every call, so calling this
        method repeatedly does not double the counts.
        """
        print("Counting word occurrences")
        counts = Counter()
        for doc in self._docs():
            for token in doc:
                # strip punctuation/symbol characters; tokens that end up
                # empty (e.g. pure punctuation) are not counted
                word = self._normalize(token.text)
                if word != "":
                    counts[word] += 1
        self.counter = counts
        self._counted = True

    def filter_counter(self):
        """Drop every word that occurs fewer than ``self.min_count`` times."""
        print(f"Only keeping words that appear at least {self.min_count} times")
        # Keep a Counter (not a plain dict) so that the object stays usable for
        # counting afterwards.
        self.counter = Counter(
            {word: count for word, count in self.counter.items() if count >= self.min_count}
        )

    def count_and_filter(self):
        """Count all words, then apply the frequency filter."""
        self.count_tokens()
        self.filter_counter()

    def get_total_words(self):
        """Return the total number of occurrences of all words kept in the counter."""
        print("Computing total words")
        self._ensure_counted()
        return sum(self.counter.values())

    def get_vocab_size(self, filter_flag=False):
        """Return the number of distinct words.

        Args:
            filter_flag: if True, return the number of distinct normalised
                words kept in the counter. If False (default), return the
                number of distinct raw token texts, without normalisation or
                frequency filtering.
        """
        print("Computing vocab size")
        if filter_flag:
            self._ensure_counted()
            return len(self.counter)
        vocab = set()
        for doc in self._docs():
            vocab.update(token.text for token in doc)
        return len(vocab)

    def get_mean_document_length(self):
        """Return the mean number of tokens per document, counting only kept words.

        A token contributes to its document's length only if its normalised
        text is present in the counter. The result is ``np.mean`` of the
        per-document lengths (NaN if there are no documents).
        """
        print("Computing mean document length")
        self._ensure_counted()
        lengths = [
            sum(1 for token in doc if self.counter.get(self._normalize(token.text), 0) > 0)
            for doc in self._docs()
        ]
        return np.mean(lengths)

### Stats Computation

This will take a while (~15 mins)! Run the cells below and go get a coffee or something.

In [13]:
# One accumulator list per statistic; each gets one entry per dataset.
amount_documents, word_counts, vocab_counts, document_lengths = [], [], [], []

In [14]:
# Start from empty accumulators so that re-running this cell does not append a
# second set of results (the lists would then no longer match the table rows).
amount_documents, word_counts, vocab_counts, document_lengths = [], [], [], []

# iterate dataset by dataset
for key, docbin in pproc_data.items():
    print(f"Getting statistics for {key}")
    # init StatGetter object
    stats = StatGetter(docbin)
    # record and compute stats for this particular dataset; the document count
    # comes first because it also sets the total used by the progress bars
    amount_documents.append(stats.get_total_documents())
    word_counts.append(stats.get_total_words())
    # NOTE(review): get_vocab_size() is called with its default
    # filter_flag=False, so the vocabulary size counts raw token texts, while
    # the total word count only covers words kept by the frequency filter.
    # Confirm that this asymmetry is intended.
    vocab_counts.append(stats.get_vocab_size())
    document_lengths.append(stats.get_mean_document_length())
    print("\n")

Getting statistics for NYT
computing total docs
computing total words
counting word occurrences


100%|██████████████████████████████████████| 8888/8888 [00:11<00:00, 801.12it/s]


only keeping words that appear at least 10 times
computing vocab size


100%|█████████████████████████████████████| 8888/8888 [00:05<00:00, 1732.18it/s]


computing mean document length


100%|██████████████████████████████████████| 8888/8888 [00:10<00:00, 877.93it/s]




Getting statistics for WikiText
computing total docs
computing total words
counting word occurrences


100%|████████████████████████████████████| 28472/28472 [01:56<00:00, 244.63it/s]


only keeping words that appear at least 10 times
computing vocab size


100%|████████████████████████████████████| 28472/28472 [00:50<00:00, 562.49it/s]


computing mean document length


100%|████████████████████████████████████| 28472/28472 [01:45<00:00, 270.92it/s]




Getting statistics for Goodreads (Romance)
computing total docs
computing total words
counting word occurrences


100%|█████████████████████████████████| 194500/194500 [00:55<00:00, 3512.70it/s]


only keeping words that appear at least 10 times
computing vocab size


100%|█████████████████████████████████| 194500/194500 [00:37<00:00, 5237.39it/s]


computing mean document length


100%|█████████████████████████████████| 194500/194500 [01:00<00:00, 3214.15it/s]




Getting statistics for Goodreads (History/Biography)
computing total docs
computing total words
counting word occurrences


100%|█████████████████████████████████| 135000/135000 [00:38<00:00, 3495.87it/s]


only keeping words that appear at least 10 times
computing vocab size


100%|█████████████████████████████████| 135000/135000 [00:25<00:00, 5348.57it/s]


computing mean document length


100%|█████████████████████████████████| 135000/135000 [00:36<00:00, 3719.03it/s]








### Insert data in table 2

In [15]:
# Fill the "ours" column of every metric with the values computed above.
for metric, values in {
    "Total Documents": amount_documents,
    "Total Words": word_counts,
    "Vocabulary Size": vocab_counts,
    "Mean Document Length": document_lengths,
}.items():
    table_2[(metric, "ours")] = values

### Format and show table 2

In [16]:
# Round every statistic to a whole number and render it with thousands separators.
# to get into latex, copy into https://www.tablesgenerator.com/latex_tables
table_2_rounded = table_2.round().astype(int)
table_2_rounded.style.format("{:,d}")

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,theirs,ours,theirs,ours,theirs,ours,theirs,ours
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,8888,7244457,7217851,162998,109713,815,812
WikiText,28472,28472,99197146,87077718,546828,228318,3484,3058
Goodreads (Romance),197000,194500,24856924,24695141,214572,249114,126,127
Goodreads (History/Biography),136000,135000,14324947,14168742,163171,193012,105,105
