In [1]:
import spacy
import nltk

In [5]:
# load english language model
en_nlp = spacy.load('en')

In [6]:
# stemmer Porter
stemmer = nltk.stem.PorterStemmer()

In [7]:
def compare_normalization(doc):
    doc_spacy = en_nlp(doc)
    
    # lemmatization
    print("Lemmatization")
    print([token.lemma_ for token in doc_spacy])
    
    # Stemming
    print("Stemming")
    print([stemmer.stem(token.norm_.lower()) for token in doc_spacy])

In [14]:
compare_normalization("Our meeting today was worse than yesterday, I'm scared of meeting the clients tomorrow.")

Lemmatization
['-PRON-', 'meeting', 'today', 'be', 'bad', 'than', 'yesterday', ',', '-PRON-', 'be', 'scared', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
Stemming
['our', 'meet', 'today', 'wa', 'wors', 'than', 'yesterday', ',', 'i', 'am', 'scare', 'of', 'meet', 'the', 'client', 'tomorrow', '.']


In [15]:
# advance tokenization
import re

regexp = re.compile('(?u)\\b\\w\\w+\\b')

# load spacy
en_nlp = spacy.load('en')
old_tokenizer = en_nlp.tokenizer

# replace tokenization dengan campuran regexp]
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))

In [34]:
from spacy.tokens import Doc

def custom_tokenizer(document):
    doc_spacy = en_nlp(document)
    doc_spacy = Doc(doc_spacy.vocab, words=[t.text for t in doc_spacy])
    return [token.lemma_ for token in doc_spacy]

In [35]:
custom_tokenizer("Our meeting today was worse than yesterday, I'm scared of meeting the clients tomorrow.")

['Our',
 'meet',
 'today',
 'be',
 'wrong',
 'than',
 'yesterday',
 'scare',
 'of',
 'meet',
 'the',
 'client',
 'tomorrow']