In [1]:
import pickle

In [2]:
import re

In [3]:
import nltk

# Make sure the NLTK "punkt" tokenizer data is available: look it up first and
# download it only when the lookup raises LookupError.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

In [4]:
import spacy

# Load the `en_core_web_sm` pipeline with every component named in `disable`
# switched off; the following cell prints the components that remain.
nlp = spacy.load(
    "en_core_web_sm", disable=["ner", "lemmatizer", "parser", "attribute_ruler"]
)

In [5]:
# Show which pipeline components are still present after loading with `disable`.
print(nlp.pipe_names)

['tok2vec', 'tagger']


In [6]:
from tqdm import tqdm

In [7]:
# Read the raw WikiText-103 training split into a list with one entry per line
# (line terminators are kept by readlines()).
# The encoding is given explicitly: the corpus contains non-ASCII text (e.g. the
# Japanese characters in the first article), so relying on the platform default
# encoding can fail or mis-decode on systems where that default is not UTF-8.
with open(
    "../data/WikiText103/wikitext-103/wiki.train.tokens", "r", encoding="utf-8"
) as f:
    lines = f.readlines()

In [8]:
# Line-level patterns used to segment the corpus into articles:
#   title_regex     - a heading line of the form " = ... = " (also matches deeper headings)
#   subtitle_regex  - a heading line of the form " = = ... = = " (second level or deeper)
#   new_line_regex  - a line consisting of a single space followed by a newline
title_regex, subtitle_regex, new_line_regex = (
    re.compile(pattern) for pattern in (" = .* = \n", " = = .* = = \n", " \n")
)

In [9]:
# Find the line index of every top-level article title.
# Each window is (previous, current, next) line; enumerate starts at 1 so that
# `idx` is the index of the *current* line in `lines`.
# A line counts as an article title when it is surrounded by blank lines,
# matches the title pattern, and is not a deeper-level (sub)heading.
#
# The final entry is an end sentinel for the last article. It is consumed as the
# exclusive end of a slice (`lines[start_idx:end_idx]`), so it must be
# `len(lines)`; using `len(lines) - 1` would drop the last line of the file
# from the final document.
doc_idxs = [
    idx
    for idx, (prev_line, cur_line, next_line) in enumerate(
        zip(lines[:-1], lines[1:], lines[2:]), 1
    )
    if new_line_regex.match(prev_line)
    and new_line_regex.match(next_line)
    and title_regex.match(cur_line)
    and not subtitle_regex.match(cur_line)
] + [len(lines)]

In [10]:
# Turn each [title index, next title index) span of lines into one document string:
# concatenate the lines, remove newline characters, trim surrounding whitespace,
# and lower-case the result.
documents = [
    "".join(lines[first_line:stop_line]).replace("\n", "").strip().lower()
    for first_line, stop_line in zip(doc_idxs[:-1], doc_idxs[1:])
]

In [21]:
# Sanity check: how many documents the segmentation produced.
len(documents)

28472

## spaCy processing starts here

In [11]:
from spacy.tokens import DocBin

In [12]:
# Container that collects processed docs for serialization;
# attrs=["TAG"] selects TAG as the attribute to store.
doc_bin = DocBin(attrs=["TAG"])

In [13]:
# For demonstration purposes, only the first 100 documents are processed, to save time.
demo_documents = documents[:100]
tagged_docs = nlp.pipe(
    demo_documents, disable=["ner", "lemmatizer", "parser", "attribute_ruler"]
)
# Stream the processed docs into the DocBin, with a progress bar sized to the sample.
for doc in tqdm(tagged_docs, total=len(demo_documents)):
    doc_bin.add(doc)

100%|████████████████████| 100/100 [00:30<00:00,  3.24it/s]


In [14]:
# Serialize everything collected in doc_bin into a single bytes object.
bytes_data = doc_bin.to_bytes()

### This is how you would save to disk

In [15]:
# Persist the serialized DocBin bytes to a local file (binary mode, overwriting any existing file).
with open("test.bin", "wb") as bin_out:
    bin_out.write(bytes_data)

### Pretend this is a new process, where you are reading from disk

The goal here is to restore your docs from the saved bytes.

In [16]:
# Load the serialized bytes back from the file written earlier (binary mode).
with open("test.bin", "rb") as bin_in:
    disk_bytes_data = bin_in.read()

In [17]:
# Rebuild a DocBin from the raw bytes that were read back from disk.
new_doc_bin = DocBin().from_bytes(disk_bytes_data)

In [18]:
# A fresh process has no pipeline in memory. Restoring the docs (next cell) passes
# a vocab to get_docs, so a pipeline is loaded here to supply `new_nlp.vocab`.
# NOTE(review): a blank pipeline's vocab may be sufficient instead of loading the
# full model again — confirm against the spaCy DocBin documentation.
new_nlp = spacy.load(
    "en_core_web_sm", disable=["ner", "lemmatizer", "parser", "attribute_ruler"]
)

In [19]:
# Recover the docs from the DocBin, using the vocab of the freshly loaded pipeline.
docs = list(new_doc_bin.get_docs(new_nlp.vocab))

In [20]:
# Display the first restored doc.
docs[0]

= valkyria chronicles iii =   senjō no valkyria 3 : <unk> chronicles ( japanese : 戦場のヴァルキュリア3 , lit . valkyria of the battlefield 3 ) , commonly referred to as valkyria chronicles iii outside japan , is a tactical role @-@ playing video game developed by sega and media.vision for the playstation portable . released in january 2011 in japan , it is the third game in the valkyria series . employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " nameless " , a penal military unit serving the nation of gallia during the second europan war who perform secret black operations and are pitted against the imperial unit " <unk> raven " .  the game began development in 2010 , carrying over a large portion of the work done on valkyria chronicles ii . while it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers . ch