In [9]:
import re
import spacy
from tqdm import tqdm
import pandas as pd
import numpy as np

## Table 2

### Format

In [2]:
# Empty summary table: one row per corpus, one column per statistic.
# Every cell starts out missing and is filled in by the sections below.
dataset_names = [
    "NYT",
    "WikiText",
    "Goodreads (Romance)",
    "Goodreads (History/Biography)",
]
statistic_names = [
    "Total Documents",
    "Total Words",
    "Vocabulary Size",
    "Mean Document Length",
]
table_2 = pd.DataFrame(
    index=pd.Index(dataset_names, name="Dataset"),
    columns=statistic_names,
)

In [3]:
# Show the still-empty table to confirm its row and column layout.
table_2

Unnamed: 0_level_0,Total Documents,Total Words,Vocabulary Size,Mean Document Length
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NYT,,,,
WikiText,,,,
Goodreads (Romance),,,,
Goodreads (History/Biography),,,,


### Preprocessing

If you already have saved preprocessed files, you can skip preprocessing and just load them from disk.


In [4]:
# Only tokenization is needed here, so the listed pipeline components are
# disabled when the small English model is loaded.
disabled_components = [
    "tok2vec",
    "tagger",
    "ner",
    "lemmatizer",
    "parser",
    "attribute_ruler",
]
nlp = spacy.load("en_core_web_sm", disable=disabled_components)

In [5]:
# Read the raw WikiText-103 training split and cut it into one string per article.
path_to_file = "../data/WikiText103/wikitext-103/wiki.train.tokens"
# Pass the encoding explicitly: without it open() uses the platform default,
# which can fail on or mis-decode any non-ASCII text (the corpus file is
# expected to be UTF-8).
with open(path_to_file, "r", encoding="utf-8") as f:
    lines = f.readlines()
# remove lines with formulas
lines = [line for line in lines if line != " <formula> \n"]
# determine document start indexes using combination of regex patterns:
# an article title is a " = ... = " heading that is not a deeper
# " = = ... = = " heading and has a blank (" \n") line directly before and after
title_regex = re.compile(" = .* = \n")
subtitle_regex = re.compile(" = = .* = = \n")
new_line_regex = re.compile(" \n")
doc_idxs = [
    idx  # enumerate starts at 1, so idx is the position of cur_line in `lines`
    for idx, (prev_line, cur_line, next_line) in enumerate(
        zip(lines[:-1], lines[1:], lines[2:]), 1
    )
    if new_line_regex.match(prev_line)
    and new_line_regex.match(next_line)
    and title_regex.match(cur_line)
    and not subtitle_regex.match(cur_line)
]
# End sentinel is one past the last line, so the slice for the final article
# includes the file's last line (len(lines) - 1 would silently drop it).
doc_idxs.append(len(lines))
# split/concat lines into docs; strip surrounding whitespace; lowercase
# (newlines inside a document are kept)
documents = [
    "".join(lines[start_idx:end_idx]).strip().lower()
    for start_idx, end_idx in zip(doc_idxs, doc_idxs[1:])
]

In [6]:
# Tokenize every document. nlp.pipe streams the texts through the pipeline in
# batches (spaCy's recommended way to process many texts) and yields one Doc
# per text in input order; tqdm advances as the texts are consumed.
wiki_docs = list(nlp.pipe(tqdm(documents)))

100%|██████████████████████████████████████| 28472/28472 [03:20<00:00, 141.84it/s]


### WikiText

#### Total Documents

In [7]:
# Assign with a single .loc[row, column] call. The chained form
# table_2.loc["WikiText"]["Total Documents"] = ... writes into an intermediate
# Series that may be a copy, so the value can silently fail to reach table_2.
table_2.loc["WikiText", "Total Documents"] = len(wiki_docs)

#### Total Words

In [20]:
# Count the tokens flagged IS_ALPHA or IS_DIGIT: one subtotal per document,
# then the sum of the subtotals.
alnum_per_doc = [
    doc.to_array(["IS_ALPHA", "IS_DIGIT"]).any(axis=1).sum()
    for doc in tqdm(wiki_docs)
]
np.sum(alnum_per_doc)

100%|█████████████████████████████████████| 28472/28472 [00:22<00:00, 1248.82it/s]


85327167

In [None]:
# NOTE(review): bare literal in an unexecuted cell; presumably an earlier
# result of the alphanumeric-token count (the executed cell above shows
# 85327167) — confirm which value is current.
85325590

In [12]:
# Total number of tokens across all documents, with no filtering.
np.sum(list(map(len, tqdm(wiki_docs))))

100%|███████████████████████████████████| 28472/28472 [00:00<00:00, 748575.34it/s]


103620897

In [None]:
# TODO: unexpected - neither count above matches the ~99 million total words expected here; investigate.