# Benchmarking spaCy with optimizations

In [1]:
import spacy
from tqdm import tqdm

In [2]:
# Reference pipeline for the benchmark: load the model with spaCy's default
# component settings (no enable/disable/exclude overrides are passed).
nlp = spacy.load('en_core_web_sm')

In [3]:
from datasets import load_dataset

# Fetch the "train" split of the Dolly-15k dataset and convert it to a DataFrame.
dolly_train = load_dataset("databricks/databricks-dolly-15k")["train"]
dolly = dolly_train.to_pandas()

# The benchmark corpus is the `instruction` column as a plain Python list.
sents = dolly["instruction"].values.tolist()

# Display the number of texts that will be processed.
len(sents)

15011

In [4]:
%%time
joined_pos = []
for sent in sents:
    doc = nlp(sent)
    joined_pos.append(' '.join([token.tag_ for token in doc]))

CPU times: user 58.5 s, sys: 347 ms, total: 58.9 s
Wall time: 58.9 s


In [5]:
# Trimmed pipeline: same model, but only "tok2vec" and "tagger" are enabled.
# The timing cells in this notebook read nothing but `token.tag_`; presumably
# these two components are all that is needed to populate it -- confirm against
# the model's pipeline if other token attributes are ever used.
nlp2 = spacy.load('en_core_web_sm', enable=["tok2vec", "tagger"])

In [6]:
%%time
joined_pos = []
for sent in sents:
    doc = nlp2(sent)
    joined_pos.append(' '.join([token.tag_ for token in doc]))

CPU times: user 22.6 s, sys: 88.8 ms, total: 22.7 s
Wall time: 22.7 s


In [7]:
%%time
docs = nlp2.pipe(sents, n_process=4, batch_size=1000)
joined_pos = []
for doc in docs:
    joined_pos.append(' '.join([token.tag_ for token in doc]))

CPU times: user 5.04 s, sys: 321 ms, total: 5.36 s
Wall time: 11.4 s
