In [1]:
import os

# If the parent directory contains a "notebooks" folder, step up one level so
# that relative paths used below resolve from there.
# (Presumably the notebook is started from inside that folder -- confirm.)
has_sibling_notebooks_dir = os.path.isdir("../notebooks/")
if has_sibling_notebooks_dir:
    os.chdir("..")

In [2]:
import re
import json

import spacy
from tqdm import tqdm
import pandas as pd
import numpy as np

from badseeds import preprocess

In [3]:
# Path to the JSON config file that lists the dataset locations.
# It is relative, so it is resolved against the current working directory.
# Change it if your config lives elsewhere.
CONFIG_PATH = "./config.json"

In [4]:
# Load the project configuration from CONFIG_PATH into the `config` dict.
# The encoding is pinned to UTF-8: without it, open() falls back to a
# platform-dependent default, which can mis-decode non-ASCII paths in the file.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

## Preprocessing

If you have already preprocessed the datasets and saved the results, preprocessing can be skipped and the preprocessed results are read from disk (default). Otherwise, change the `PREPROC_NOW` flag to `True` to preprocess the data now. This will take a long time.


In [8]:
# Set to True to run preprocessing now (this saves its results to disk);
# leave False to only read previously saved results.
PREPROC_NOW = False

# Directory holding the preprocessed files. Adjust if your layout differs.
pproc_data_path = os.path.join(config["preprocessed"]["dir_path"], "processed")

# Per-dataset locations inside the preprocessed directory.
NYT_PATH = os.path.join(pproc_data_path, "nytimes_news_articles.bin")
# WikiText is switched off here by passing False instead of a path.
# The path it would otherwise use:
#   os.path.join(pproc_data_path, "wiki.train.tokens.bin")
WIKI_PATH = False
GRR_PATH = os.path.join(pproc_data_path, "romance")
GRHB_PATH = os.path.join(pproc_data_path, "history_biography")

if PREPROC_NOW:
    preprocess.preprocess_datasets()

# Read the preprocessed datasets from disk.
pproc_data = preprocess.read_pproc_datasets(
    NYT_PATH,
    WIKI_PATH,
    GRR_PATH,
    GRHB_PATH,
)

reading nyt
reading goodreads romance
Directory detected, reading and concatenating all containing files


100%|█████████████████████████████████████████████| 8/8 [00:33<00:00,  4.13s/it]


reading goodreads history/biography
Directory detected, reading and concatenating all containing files


100%|█████████████████████████████████████████████| 5/5 [00:21<00:00,  4.33s/it]


## Table 2

### Prepare

In [9]:
# Skeleton of Table 2: one row per dataset, and for every metric a pair of
# columns -- "theirs" (reference values) and "ours" (filled in by later cells).
metrics = [
    "Total Documents",
    "Total Words",
    "Vocabulary Size",
    "Mean Document Length",
]
datasets = pd.Series(
    ["NYT", "WikiText", "Goodreads (Romance)", "Goodreads (History/Biography)"],
    name="Dataset",
)
table_2 = pd.DataFrame(
    index=datasets,
    columns=pd.MultiIndex.from_product([metrics, ["theirs", "ours"]]),
)

# Reference numbers, listed in the same row order as `datasets`.
reference_values = {
    "Total Documents": [8888, 28472, 197000, 136000],
    "Total Words": [7244457, 99197146, 24856924, 14324947],
    "Vocabulary Size": [162998, 546828, 214572, 163171],
    "Mean Document Length": [815, 3484, 126, 105],
}
for metric, values in reference_values.items():
    table_2[(metric, "theirs")] = values

In [10]:
# Display the table with only the reference ("theirs") columns filled in.
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,theirs,ours,theirs,ours,theirs,ours,theirs,ours
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,,7244457,,162998,,815,
WikiText,28472,,99197146,,546828,,3484,
Goodreads (Romance),197000,,24856924,,214572,,126,
Goodreads (History/Biography),136000,,14324947,,163171,,105,


### Total Documents

In [11]:
# Number of documents per dataset.
# NOTE(review): relies on pproc_data iterating in the same order as the rows
# of table_2 -- confirm against preprocess.read_pproc_datasets.
table_2[("Total Documents", "ours")] = list(map(len, pproc_data.values()))

In [12]:
# Display the table after filling in ("Total Documents", "ours").
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,theirs,ours,theirs,ours,theirs,ours,theirs,ours
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,8888,7244457,,162998,,815,
WikiText,28472,0,99197146,,546828,,3484,
Goodreads (Romance),197000,194500,24856924,,214572,,126,
Goodreads (History/Biography),136000,135000,14324947,,163171,,105,


### Total Words

In [75]:
# Collect normalised words for every dataset in pproc_data.
#   all_words[i] -- flat list of every kept word in dataset i
#   all_docs[i]  -- the same words grouped per document (a list of word lists)
# A token is normalised by removing every character that is neither a word
# character nor whitespace, lower-casing, and stripping trailing whitespace.
# Tokens that end up empty are dropped.
# (Each token is expected to expose `.text`; presumably these are spaCy
# tokens -- confirm against preprocess.read_pproc_datasets.)

# Compiled once instead of being looked up on every token.
_NON_WORD_RE = re.compile(r'[^\w\s]')

# Size the accumulators from pproc_data rather than hard-coding four datasets,
# so the cell does not raise IndexError for a different number of datasets.
all_words = [[] for _ in pproc_data]
all_docs = [[] for _ in pproc_data]
for index, docs in enumerate(pproc_data.values()):
    for doc in tqdm(docs):
        current_doc = []
        for token in doc:
            word = _NON_WORD_RE.sub('', token.text).lower().rstrip()
            if word:
                current_doc.append(word)
        all_words[index].extend(current_doc)
        all_docs[index].append(current_doc)
            

100%|██████████████████████████████████████| 8888/8888 [00:10<00:00, 879.73it/s]
0it [00:00, ?it/s]
100%|█████████████████████████████████| 194500/194500 [00:33<00:00, 5881.30it/s]
100%|█████████████████████████████████| 135000/135000 [00:19<00:00, 6889.30it/s]


In [61]:
from collections import Counter

In [77]:
# One frequency table per dataset: word -> number of occurrences in all_words.
counters = [Counter(dataset_words) for dataset_words in all_words]

In [78]:
# Total words per dataset: the summed occurrences of every word that appears
# at least 10 times in that dataset and consists solely of alphanumeric
# characters (str.isalnum).
counts = [
    sum(
        occurrences
        for word, occurrences in counter.items()
        if occurrences >= 10 and word.isalnum()
    )
    for counter in counters
]

table_2[("Total Words", "ours")] = counts

In [79]:
# Display the table after filling in ("Total Words", "ours").
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,theirs,ours,theirs,ours,theirs,ours,theirs,ours
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,8888,7244457,7204806,162998,106912,815,954.72896
WikiText,28472,0,99197146,0,546828,0,3484,
Goodreads (Romance),197000,194500,24856924,24286822,214572,230913,126,144.757728
Goodreads (History/Biography),136000,135000,14324947,14010924,163171,177532,105,119.166896


In [80]:
# how many tokens in general
# [np.sum([len(doc) for doc in tqdm(docs)]) for _k, docs in pproc_data.items()]

### Vocabulary Size

In [81]:
# Vocabulary size per dataset: the number of distinct normalised words.
# NOTE(review): unlike the Total Words cell, no minimum-frequency (>= 10) or
# isalnum() filter is applied here -- confirm that this is intended.
vocab_sizes = []
for dataset_words in all_words:
    distinct_words = set(dataset_words)
    vocab_sizes.append(len(distinct_words))

In [82]:
# Store the per-dataset vocabulary sizes; vocab_sizes is in all_words order.
table_2[("Vocabulary Size", "ours")] = vocab_sizes

In [83]:
# Display the table after filling in ("Vocabulary Size", "ours").
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,theirs,ours,theirs,ours,theirs,ours,theirs,ours
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,8888,7244457,7204806,162998,106911,815,954.72896
WikiText,28472,0,99197146,0,546828,0,3484,
Goodreads (Romance),197000,194500,24856924,24286822,214572,230912,126,144.757728
Goodreads (History/Biography),136000,135000,14324947,14010924,163171,177531,105,119.166896


### Mean Document Length

In [87]:
# Mean document length per dataset, where a document's length is the number
# of its words whose dataset-wide frequency is at least 10.
# NOTE(review): unlike the Total Words cell, no isalnum() filter is applied
# here -- confirm that this is intended.
#
# lengths[i] holds one length per document of dataset i. Nothing is
# pre-seeded: a placeholder 0 would be averaged in as a fake document and
# bias the mean as soon as that dataset is actually loaded.
lengths = [
    [sum(1 for word in doc if counter[word] >= 10) for doc in docs]
    for counter, docs in zip(counters, all_docs)
]

# np.mean([]) yields nan (with a RuntimeWarning), which would later break the
# integer conversion of the table, so a dataset with no documents gets 0.0.
table_2[("Mean Document Length", "ours")] = [
    np.mean(doc_lengths) if doc_lengths else 0.0 for doc_lengths in lengths
]

In [88]:
# print(all_docs[0][0])

In [89]:
# Display the table after filling in ("Mean Document Length", "ours").
table_2

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,theirs,ours,theirs,ours,theirs,ours,theirs,ours
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,8888,7244457,7204806,162998,106911,815,811.060644
WikiText,28472,0,99197146,0,546828,0,3484,0.0
Goodreads (Romance),197000,194500,24856924,24286822,214572,230912,126,124.9069
Goodreads (History/Biography),136000,135000,14324947,14010924,163171,177531,105,103.806593


### Output

In [90]:
# Render the table for LaTeX export: round every value, cast to integers and
# format with thousands separators. Copy the rendered output into
# https://www.tablesgenerator.com/latex_tables
(
    table_2
    .round()
    .astype(int)
    .style
    .format("{:,d}")
)

Unnamed: 0_level_0,Total Documents,Total Documents,Total Words,Total Words,Vocabulary Size,Vocabulary Size,Mean Document Length,Mean Document Length
Unnamed: 0_level_1,theirs,ours,theirs,ours,theirs,ours,theirs,ours
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NYT,8888,8888,7244457,7204806,162998,106911,815,811
WikiText,28472,0,99197146,0,546828,0,3484,0
Goodreads (Romance),197000,194500,24856924,24286822,214572,230912,126,125
Goodreads (History/Biography),136000,135000,14324947,14010924,163171,177531,105,104
