In [1]:
import os

# If the parent directory contains a notebooks/ folder (e.g. the kernel was
# started from inside notebooks/), switch to the parent so that the relative
# "data/..." paths used further down resolve from there.
parent_has_notebooks = os.path.isdir("../notebooks/")
if parent_has_notebooks:
    os.chdir("..")

In [2]:
import re

import spacy
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

from badseeds import preprocess

ModuleNotFoundError: No module named 'numpy.typing'

## Preprocessing

If you have already preprocessed the datasets and saved the results, preprocessing can be skipped and the preprocessed results are read from disk (the default). Otherwise, change the `PREPROC_NOW` flag to `True` to preprocess the data now. This will take a long time.


In [None]:
# Set to True to run preprocessing before loading (slow); when False, only the
# already-saved results are read.
PREPROC_NOW = False

# Locations of the preprocessed files; adjust these if your layout differs.
NYT_PATH = "data/processed/nytimes_news_articles.bin"
WIKI_PATH = "data/processed/wiki.train.tokens.bin"
GRR_PATH = "data/processed/romance"
GRHB_PATH = "data/processed/history_biography"

if PREPROC_NOW:
    # Preprocess on request; the results are saved to disk.
    preprocess.preprocess_datasets()

# Load the preprocessed results from disk, passing the paths in the order
# NYT, WikiText, Goodreads romance, Goodreads history/biography.
preprocessed_paths = (NYT_PATH, WIKI_PATH, GRR_PATH, GRHB_PATH)
pproc_data = preprocess.read_preprocessed_datasets(*preprocessed_paths)

## Table 2

### Prepare

In [None]:
# Table 2 skeleton: one row per dataset and a two-level (metric, source) column
# grid. "theirs" holds the reference figures entered here; "ours" starts empty
# and is filled in by the cells below.
dataset_labels = pd.Index(
    ["NYT", "WikiText", "Goodreads (Romance)", "Goodreads (History/Biography)"],
    name="Dataset",
)
metric_names = [
    "Total Documents",
    "Total Words",
    "Vocabulary Size",
    "Mean Document Length",
]
table_2 = pd.DataFrame(
    index=dataset_labels,
    # Yields (metric, "theirs"), (metric, "ours") for each metric, in order.
    columns=pd.MultiIndex.from_product([metric_names, ["theirs", "ours"]]),
)

# Reference values, listed in the same order as the rows of the table.
reference_values = {
    "Total Documents": [8888, 28472, 197000, 136000],
    "Total Words": [7244457, 99197146, 24856924, 14324947],
    "Vocabulary Size": [162998, 546828, 214572, 163171],
    "Mean Document Length": [815, 3484, 126, 105],
}
for metric, theirs in reference_values.items():
    table_2[(metric, "theirs")] = theirs

In [None]:
# Show the initial table: the "theirs" columns are filled, the "ours" columns are still empty.
table_2

### Total Documents

In [None]:
# Number of documents in each preprocessed dataset.
# NOTE(review): relies on pproc_data iterating in the same order as the rows of
# table_2 (NYT, WikiText, Goodreads Romance, Goodreads History/Biography) — confirm.
doc_counts = []
for _name, docs in pproc_data.items():
    doc_counts.append(len(docs))
table_2[("Total Documents", "ours")] = doc_counts

In [None]:
# Show the table now that ("Total Documents", "ours") has been filled in.
table_2

### Total Words

In [None]:
# Words per dataset: the number of tokens flagged IS_ALPHA or IS_DIGIT, summed
# over every document (tokens with neither flag are not counted).
# NOTE(review): assumes each doc is a spaCy Doc — confirm against
# preprocess.read_preprocessed_datasets.
word_totals = []
for _name, docs in pproc_data.items():
    per_doc_counts = []
    for doc in tqdm(docs):
        flags = doc.to_array(["IS_ALPHA", "IS_DIGIT"])
        # A token counts when at least one of the two flags is set.
        per_doc_counts.append(np.sum(flags.any(axis=1)))
    word_totals.append(np.sum(per_doc_counts))
table_2[("Total Words", "ours")] = word_totals

In [None]:
# Show the table now that ("Total Words", "ours") has been filled in.
table_2

In [None]:
# how many tokens in general
# [np.sum([len(doc) for doc in tqdm(docs)]) for _k, docs in pproc_data.items()]

### Vocabulary Size

In [None]:
# Vocabulary size per dataset: the number of distinct token texts across all of
# its documents. Every token is included and texts are compared exactly as
# stored (no case folding or filtering).
vocab_sizes = []
for key, docs in pproc_data.items():
    print(key)
    distinct_texts = {token.text for doc in tqdm(docs) for token in doc}
    vocab_sizes.append(len(distinct_texts))

In [None]:
# Store the per-dataset vocabulary sizes collected in the previous cell.
# NOTE(review): assumes vocab_sizes is in the same order as the rows of table_2 — confirm.
table_2[("Vocabulary Size", "ours")] = vocab_sizes

In [None]:
# Show the table now that ("Vocabulary Size", "ours") has been filled in.
table_2

### Mean Document Length

In [None]:
# Mean document length per dataset, measured in the same unit as
# ("Total Words", "ours"): tokens flagged IS_ALPHA or IS_DIGIT.
#
# Averaging len(doc) instead would count every token, including those that are
# neither alphabetic nor digits, so the result would not equal
# Total Words / Total Documents. The reference ("theirs") figures do satisfy
# that relation (e.g. 7244457 / 8888 ≈ 815), so the same token filter is
# applied here to keep the "ours" columns mutually consistent.
# NOTE(review): assumes each doc is a spaCy Doc — confirm.
table_2[("Mean Document Length", "ours")] = [
    np.mean(
        [
            np.sum(doc.to_array(["IS_ALPHA", "IS_DIGIT"]).any(axis=1))
            for doc in tqdm(docs)
        ]
    )
    for _k, docs in pproc_data.items()
]

In [None]:
# Show the table now that ("Mean Document Length", "ours") has been filled in.
table_2

In [None]:
# Render the finished table as LaTeX source. The index is kept so that each row
# is labelled with its dataset name; with index=False the four rows would be
# emitted without any dataset labels.
# NOTE(review): the caption 'This' looks like a placeholder — confirm the intended text.
table_2.to_latex(caption='This')