Это пример того, как `lru_cache` можно применить для существенного ускорения обработки текста.

In [1]:
from typing import List
from functools import lru_cache

import pymorphy2
import nltk

In [2]:
# Fetch the 'brown' package into the local nltk_data directory; when the
# package is already present NLTK only reports that it is up to date.
nltk.download('brown')

[nltk_data] Downloading package brown to /Users/greenwolf/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
# Handle to the Brown corpus fetched by nltk.download('brown').
# NOTE(review): Brown is an English-language corpus while the word timed in
# this notebook is Russian -- presumably the corpus serves only as a large
# stream of repeated tokens for the benchmark; confirm this is intended.
corpus = nltk.corpus.brown

In [4]:
# Token sequence of the whole corpus; the repeated words in it are what the
# cached normalizer exploits.
words = corpus.words()

In [5]:
# Build the analyzer once and pass the same instance to every call below.
morph = pymorphy2.MorphAnalyzer()

In [6]:
%%timeit
# Baseline cost of a single uncached parse + normal_form lookup for one word.
morph.parse('хёндай')[0].normal_form

147 µs ± 2.27 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [7]:
def normalize(word: str, morph: pymorphy2.MorphAnalyzer):
    """Return the normal form of ``word`` without any caching.

    Args:
        word: The token to normalize.
        morph: Analyzer whose ``parse`` method is called on ``word``.

    Returns:
        The ``normal_form`` attribute of the first parse in the sequence
        returned by ``morph.parse(word)``.
    """
    candidates = morph.parse(word)
    # Only the first parse in the returned sequence is used.
    first_parse = candidates[0]
    return first_parse.normal_form

# maxsize=None keeps every distinct (word, morph) pair, i.e. nothing is ever
# evicted. Unlike sizing the cache from the corpus, this does not require a
# global ``words`` to exist when the function is defined and does not iterate
# over the whole corpus just to pick a cache size.
@lru_cache(maxsize=None)
def normalize_lru(word: str, morph: pymorphy2.MorphAnalyzer):
    """Return the normal form of ``word``, memoizing the result.

    The computation is the same as the uncached variant: the ``normal_form``
    of the first parse returned by ``morph.parse(word)``. Results are cached
    per ``(word, morph)`` argument pair, so a repeated word is analyzed only
    once per analyzer instance.

    Args:
        word: The token to normalize.
        morph: Analyzer whose ``parse`` method is called on ``word``. It is
            part of the cache key, so it must be hashable, and the cache
            keeps a reference to it.

    Returns:
        The ``normal_form`` attribute of the first parse of ``word``.
    """
    return morph.parse(word)[0].normal_form
 

def normalize_corpus(words: List[str], morph: pymorphy2.MorphAnalyzer):
    """Normalize every word with the uncached ``normalize`` function.

    Args:
        words: Tokens to normalize, processed in iteration order.
        morph: Analyzer forwarded unchanged to ``normalize``.

    Returns:
        A new list with one normalized form per input word, in input order.
    """
    lemmas = []
    for token in words:
        lemmas.append(normalize(token, morph))
    return lemmas


def normalize_corpus_lru(words: List[str], morph: pymorphy2.MorphAnalyzer):
    """Normalize every word with the cached ``normalize_lru`` function.

    Args:
        words: Tokens to normalize, processed in iteration order.
        morph: Analyzer forwarded unchanged to ``normalize_lru``.

    Returns:
        A new list with one normalized form per input word, in input order.
    """
    lemmas = []
    for token in words:
        lemmas.append(normalize_lru(token, morph))
    return lemmas

In [8]:
%%time
# Uncached baseline: every token goes through normalize(), repeats included.
normalized = normalize_corpus(words, morph)

CPU times: user 11.3 s, sys: 104 ms, total: 11.4 s
Wall time: 11.4 s


In [9]:
%%time
# Cached run: repeated tokens are answered from normalize_lru's cache.
# The cache lives on the function object, so re-running this cell without
# redefining normalize_lru measures a warm cache rather than a cold one.
normalized_lru = normalize_corpus_lru(words, morph)

CPU times: user 1.82 s, sys: 35.2 ms, total: 1.85 s
Wall time: 1.85 s


In [10]:
# Sanity check: the cached and uncached runs must produce identical lists.
normalized == normalized_lru

True

Получили ускорение примерно в 6 раз (11.4 с → 1.85 с), добавив одну строку кода!