In [1]:
import spacy

### Stemming vs Lemmatization

### Stemming:
1. Reduces a word to its base form by stripping suffixes -> "talking": "talk", "eating": "eat", "adjustable": "adjust"
2. Uses crude heuristics rather than linguistic knowledge, so the result may not be a real word -> "ability": "abil"
3. spaCy does not provide built-in support for stemming
### Lemmatization
1. Uses linguistic knowledge to derive the base word (lemma) -> "ate": "eat"



In [23]:
import nltk
import spacy

In [24]:
from nltk.stem import PorterStemmer
# Single NLTK PorterStemmer instance; the stemming cell calls stemmer.stem() on it.
stemmer = PorterStemmer()

In [25]:
## Stemming in NLTK
# Sample words covering plain inflections ("eats") and derivational
# suffixes ("ability") so the stemmer's suffix stripping is visible.
words = [
    "eating",
    "eats",
    "reads",
    "remarkable",
    "causality",
    "rendering",
    "ability",
]

# Print each word next to the stem produced by the Porter stemmer.
for word in words:
    stem = stemmer.stem(word)
    print(word, "|", stem)

eating | eat
eats | eat
reads | read
remarkable | remark
causality | causal
rendering | render
ability | abil


In [26]:
## Lemmatization
# Load the small English spaCy pipeline and run it over a sentence that mixes
# regular inflections ("eats"), irregular forms ("ate", "gone") and
# derived words ("adjustable").
nlp = spacy.load("en_core_web_sm")
doc = nlp("eating eats eat ate adjustable remarkable rafting meeting better gone")

# Show each token's surface text next to its lemma string.
for token in doc:
    print(token.text, "|", token.lemma_)

eating | eat
eats | eat
eat | eat
ate | eat
adjustable | adjustable
remarkable | remarkable
rafting | raft
meeting | meeting
better | well
gone | go


In [27]:
# Lemmatize a full sentence, including punctuation and a contraction ("doesn't").
doc = nlp("Resting must you will, for loss shall you suffer - Baby Yoda and other quotes he doesn't know about")

# token.lemma_ is the lemma as a string; token.lemma is the integer ID of
# that same lemma, so equal lemmas print equal IDs.
for token in doc:
    print(token.text, "|", token.lemma_, "|", token.lemma)

Resting | rest | 10960894369163974213
must | must | 7290638946010101875
you | you | 7624161793554793053
will | will | 18307573501153647118
, | , | 2593208677638477497
for | for | 16037325823156266367
loss | loss | 80859674766354010
shall | shall | 7091588059074233151
you | you | 7624161793554793053
suffer | suffer | 12627334617871297789
- | - | 9153284864653046197
Baby | Baby | 16275649885857980966
Yoda | Yoda | 52316164129200410
and | and | 2283656566040971221
other | other | 1176656782636220709
quotes | quote | 5727435701664101727
he | he | 1655312771067108281
does | do | 2158845516055552166
n't | not | 447765159362469301
know | know | 7743033266031195906
about | about | 942632335873952620


In [28]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [29]:
## How do you interpret slang? -> attribute_ruler
ar = nlp.get_pipe("attribute_ruler")
## Add an exception mapping the slang spellings to the lemma "Brother".
## The patterns match on TEXT, so they are case-sensitive.
# NOTE(review): every run of this cell calls ar.add again on the same pipeline
# object -- presumably registering the rule once more; confirm before re-running.
slang_patterns = [[{"TEXT": spelling}] for spelling in ("Brotha", "bruh")]
ar.add(slang_patterns, {"LEMMA": "Brother"})

doc = nlp("Brotha that shit slaps, bruh; Skibidi this, skibidi that, YOLO!!")

# Show each token's surface text next to its (possibly overridden) lemma.
for token in doc:
    print(token.text, "|", token.lemma_)


Brotha | Brother
that | that
shit | shit
slaps | slap
, | ,
bruh | Brother
; | ;
Skibidi | skibidi
this | this
, | ,
skibidi | skibidi
that | that
, | ,
YOLO | yolo
! | !
! | !
