In [1]:
import os
# If a sibling "notebooks" directory is visible one level up, step up one
# directory (presumably the kernel was started inside notebooks/ and the
# relative paths used in this notebook are meant to resolve from its parent).
if os.path.isdir('../notebooks/'):
    os.chdir('..')

In [2]:
import re

import spacy
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

from badseeds import preprocess

## Preprocessing

If you have already preprocessed and saved the datasets, preprocessing can be skipped and the preprocessed results are read from disk (the default). Otherwise, set the `PREPROC_NOW` flag to `True` to preprocess the data now. This will take a long time.


In [6]:
# Locations of the preprocessed corpora, passed to the reader below in this
# order. Adjust them if your preprocessed files live somewhere else.
NYT_PATH = "data/processed/nytimes_news_articles.bin"
WIKI_PATH = "data/processed/wiki.train.tokens.bin"
GRR_PATH = "data/processed/romance"
GRHB_PATH = "data/processed/history_biography"

# Flip to True to run preprocessing before loading.
PREPROC_NOW = False

if PREPROC_NOW:
    # NOTE(review): the original comment says this step saves its results to
    # disk; it is called without arguments, so confirm it writes to the
    # paths configured above.
    preprocess.preprocess_datasets()

# Load the preprocessed corpora from disk.
pproc_data = preprocess.read_preprocessed_datasets(NYT_PATH, WIKI_PATH, GRR_PATH, GRHB_PATH)

reading nyt
reading wikitext
reading goodreads romance


100%|██████████████████████| 8/8 [00:50<00:00,  6.30s/it]


reading goodreads history/biography


100%|██████████████████████| 5/5 [00:53<00:00, 10.79s/it]


## Table 2

### Prepare

In [3]:
# Scaffold for Table 2: one row per corpus and, for every statistic, a pair of
# columns -- "ours" (left empty here, filled by later cells) and "theirs"
# (reference figures entered by hand; presumably the values reported in the
# paper being reproduced).
dataset_names = pd.Index(
    ["NYT", "WikiText", "Goodreads (Romance)", "Goodreads (History/Biography)"],
    name="Dataset",
)

# statistic -> reference value per corpus, in the same order as dataset_names
reference_stats = {
    "Total Documents": [8888, 28472, 197000, 136000],
    "Total Words": [7244457, 99197146, 24856924, 14324947],
    "Vocabulary Size": [162998, 546828, 214572, 163171],
    "Mean Document Length": [815, 3484, 126, 105],
}

table_2 = pd.DataFrame(
    index=dataset_names,
    # two-level header: (statistic, "ours" | "theirs")
    columns=pd.MultiIndex.from_product([list(reference_stats), ["ours", "theirs"]]),
)
for stat_name, reference_values in reference_stats.items():
    table_2[(stat_name, "theirs")] = reference_values

In [4]:
# Inspect the scaffold: only the "theirs" columns hold values at this point.
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,ours,theirs,ours,theirs,ours,theirs,ours,theirs
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,,8888,,7244457,,162998,,815
WikiText,,28472,,99197146,,546828,,3484
Goodreads (Romance),,197000,,24856924,,214572,,126
Goodreads (History/Biography),,136000,,14324947,,163171,,105


### Total Documents

In [18]:
# Number of documents in each corpus. A plain list is assigned positionally,
# so this relies on pproc_data iterating in the same order as table_2's rows.
table_2[("Total Documents", "ours")] = [len(corpus) for corpus in pproc_data.values()]

In [19]:
# Show the table with the ("Total Documents", "ours") column filled in.
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,ours,theirs,ours,theirs,ours,theirs,ours,theirs
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,8888,,7244457,,162998,,815
WikiText,28472,28472,,99197146,,546828,,3484
Goodreads (Romance),194500,197000,,24856924,,214572,,126
Goodreads (History/Biography),135000,136000,,14324947,,163171,,105


### Total Words

In [20]:
# Total length of each corpus: the sum of len(doc) over all of its documents,
# with no filtering of what counts as a word.
total_words = []
for corpus in pproc_data.values():
    total_words.append(np.sum([len(doc) for doc in tqdm(corpus)]))
table_2[("Total Words", "ours")] = total_words

100%|███████████| 8888/8888 [00:00<00:00, 1076431.45it/s]
100%|█████████| 28472/28472 [00:00<00:00, 1168415.31it/s]
100%|███████| 194500/194500 [00:00<00:00, 1590935.85it/s]
100%|███████| 135000/135000 [00:00<00:00, 2343749.13it/s]


In [21]:
# Show the table with the ("Total Words", "ours") column filled in.
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,ours,theirs,ours,theirs,ours,theirs,ours,theirs
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,8888,8682929,7244457,,162998,,815
WikiText,28472,28472,103627993,99197146,,546828,,3484
Goodreads (Romance),194500,197000,28537205,24856924,,214572,,126
Goodreads (History/Biography),135000,136000,16381558,14324947,,163171,,105


In [None]:
# NOTE(review): disabled experiment, kept for reference. `wiki_docs` is not
# defined in any cell visible in this notebook, so re-enabling this cell
# as-is would presumably fail -- confirm before uncommenting.
# # how many alphanumeric tokens
# np.sum(
#     [
#         np.sum(doc.to_array(["IS_ALPHA", "IS_DIGIT"]).any(axis=1))
#         for doc in tqdm(wiki_docs)
#     ]
# )

### Vocabulary Size

In [75]:
# Vocabulary size per corpus = number of distinct `token.text` values seen
# across all of its documents (values are added as-is, without normalization).
vocab_sizes = []
for key, docs in pproc_data.items():
    print(key)
    vocab = set()
    for doc in tqdm(docs):
        vocab.update(token.text for token in doc)
    vocab_sizes.append(len(vocab))

NYT


100%|██████████████| 8888/8888 [00:05<00:00, 1739.28it/s]


WikiText


100%|█████████████| 28472/28472 [02:01<00:00, 235.17it/s]


Goodreads (Romance)


100%|██████████| 194500/194500 [00:40<00:00, 4791.37it/s]


Goodreads (History/Biography)


100%|██████████| 135000/135000 [00:21<00:00, 6175.09it/s]


In [76]:
# vocab_sizes is a plain list, so it is assigned to the rows positionally;
# its order (pproc_data iteration order) must match table_2's row order.
table_2[("Vocabulary Size", "ours")] = vocab_sizes

In [77]:
# Show the table with the ("Vocabulary Size", "ours") column filled in.
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,ours,theirs,ours,theirs,ours,theirs,ours,theirs
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,8888,8682929,7244457,109713,162998,,815
WikiText,28472,28472,103627993,99197146,228318,546828,,3484
Goodreads (Romance),194500,197000,28537205,24856924,249114,214572,,126
Goodreads (History/Biography),135000,136000,16381558,14324947,193012,163171,,105


### Mean Document Length

In [78]:
# Mean of len(doc) over the documents of each corpus.
mean_doc_lengths = []
for corpus in pproc_data.values():
    doc_lengths = [len(doc) for doc in tqdm(corpus)]
    mean_doc_lengths.append(np.mean(doc_lengths))
table_2[("Mean Document Length", "ours")] = mean_doc_lengths

100%|████████████| 8888/8888 [00:00<00:00, 276677.51it/s]
100%|████████████| 28472/28472 [00:03<00:00, 7641.48it/s]
100%|█████████| 194500/194500 [00:02<00:00, 79583.73it/s]
100%|███████| 135000/135000 [00:00<00:00, 1683187.60it/s]


In [79]:
# Show the table with the ("Mean Document Length", "ours") column filled in.
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,ours,theirs,ours,theirs,ours,theirs,ours,theirs
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,8888,8682929,7244457,109713,162998,976.927205,815
WikiText,28472,28472,103627993,99197146,228318,546828,3639.645722,3484
Goodreads (Romance),194500,197000,28537205,24856924,249114,214572,146.720848,126
Goodreads (History/Biography),135000,136000,16381558,14324947,193012,163171,121.344874,105


In [80]:
# Export Table 2 as LaTeX source. print() is used so the cell shows the text
# verbatim; leaving the string as the cell's last expression displays its
# Python repr instead (escaped backslashes and literal "\n"), which cannot be
# pasted into a .tex file.
print(table_2.to_latex())

'\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} & \\multicolumn{2}{l}{Total Documents} & \\multicolumn{2}{l}{Total Words} & \\multicolumn{2}{l}{Vocabulary Size} & \\multicolumn{2}{l}{Mean Document Length} \\\\\n{} &            ours &  theirs &        ours &    theirs &            ours &  theirs &                 ours & theirs \\\\\nDataset                       &                 &         &             &           &                 &         &                      &        \\\\\n\\midrule\nNYT                           &            8888 &    8888 &     8682929 &   7244457 &          109713 &  162998 &           976.927205 &    815 \\\\\nWikiText                      &           28472 &   28472 &   103627993 &  99197146 &          228318 &  546828 &          3639.645722 &   3484 \\\\\nGoodreads (Romance)           &          194500 &  197000 &    28537205 &  24856924 &          249114 &  214572 &           146.720848 &    126 \\\\\nGoodreads (History/Biography) &          135000 &  136000 &