# Chapter 2 spaCy Features

## Lemmatization

#### A lemma is the base form of a word

In [3]:
import spacy


In [4]:
# Load the `en_core_web_sm` pipeline; `nlp` is called on raw text in the cells below.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Run the pipeline on a sample sentence; the resulting doc is iterated token by token below.
doc = nlp("I have enjoyed working there for 3 years")

In [10]:
# Pair every token with its lemma first, then print one pair per line.
lemma_lines = [f"{token.text}, lemma: {token.lemma_}" for token in doc]
for line in lemma_lines:
    print(line)

I, lemma: I
have, lemma: have
enjoyed, lemma: enjoy
working, lemma: work
there, lemma: there
for, lemma: for
3, lemma: 3
years, lemma: year


## Tokenization
#### Altinok defines a token as the smallest meaningful unit of a text. Tokenization is the process of splitting text into tokens and saving each one as a segment within a doc object.

In [1]:
import spacy

In [2]:
from spacy.attrs import ORTH, NORM
from spacy.tokenizer import Tokenizer

In [3]:
# Load the `en_core_web_sm` pipeline again (same model name as in the Lemmatization section).
nlp = spacy.load('en_core_web_sm')

In [4]:
# Build a separate Tokenizer on top of the pipeline's vocab.
# NOTE(review): no later cell tokenizes text with this object; the docs below are all created via `nlp(...)`.
tokenizer = Tokenizer(nlp.vocab)

In [5]:
# Special case: split "don't" into "do" + "n't" and normalise "n't" to "not".
# The ORTH pieces concatenate back to the original string ("do" + "n't" == "don't").
case = [{ORTH: "do"}, {ORTH: "n't", NORM: "not"}]
tokenizer.add_special_case("don't", case)
# Also register the rule on the pipeline's own tokenizer: the cells below build
# their docs with `nlp(...)`, which does not go through the standalone
# `tokenizer` object, so the rule would otherwise never be exercised.
nlp.tokenizer.add_special_case("don't", case)

### retrievable doc properties

In [33]:
# Sample sentence containing the contraction "don't".
doc = nlp("I don't like to watch football")

In [34]:
# Print each token's text next to its lemma, one token per line.
# The loop variable `token` stays bound to the last token after this cell finishes.
for token in doc:
    print(token.text, token.lemma_)

I I
do do
n't not
like like
to to
watch watch
football football


In [49]:
# Text of the first token in the doc.
doc[0].text

'I'

In [38]:
# doc.sents yields the doc's sentences (each one a span); gather them into a list.
sentences = [sentence for sentence in doc.sents]
print(sentences)

[I don't like to watch football]


In [40]:
# Named entities found in the doc (an empty tuple for this sentence).
doc.ents

()

In [42]:
# Materialise the doc's noun chunks into a list so they are displayed.
list(doc.noun_chunks)

[I, football]

In [44]:
# Serialise the doc to a dict holding its text, entities, sentence offsets and per-token annotations.
json_doc = doc.to_json()
print(json_doc)

{'text': "I don't like to watch football", 'ents': [], 'sents': [{'start': 0, 'end': 30}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'tag': 'PRP', 'pos': 'PRON', 'morph': 'Case=Nom|Number=Sing|Person=1|PronType=Prs', 'lemma': 'I', 'dep': 'nsubj', 'head': 3}, {'id': 1, 'start': 2, 'end': 4, 'tag': 'VBP', 'pos': 'AUX', 'morph': 'Mood=Ind|Tense=Pres|VerbForm=Fin', 'lemma': 'do', 'dep': 'aux', 'head': 3}, {'id': 2, 'start': 4, 'end': 7, 'tag': 'RB', 'pos': 'PART', 'morph': 'Polarity=Neg', 'lemma': 'not', 'dep': 'neg', 'head': 3}, {'id': 3, 'start': 8, 'end': 12, 'tag': 'VB', 'pos': 'VERB', 'morph': 'VerbForm=Inf', 'lemma': 'like', 'dep': 'ROOT', 'head': 3}, {'id': 4, 'start': 13, 'end': 15, 'tag': 'TO', 'pos': 'PART', 'morph': '', 'lemma': 'to', 'dep': 'aux', 'head': 5}, {'id': 5, 'start': 16, 'end': 21, 'tag': 'VB', 'pos': 'VERB', 'morph': 'VerbForm=Inf', 'lemma': 'watch', 'dep': 'xcomp', 'head': 3}, {'id': 6, 'start': 22, 'end': 30, 'tag': 'NN', 'pos': 'NOUN', 'morph': 'Number=Sing', 'l

### token properties

In [68]:
#each word is a token object
# Bind `token` explicitly so this cell does not rely on a variable left over
# from a cell that was run out of order; doc[5] is "watch".
token = doc[5]
print(f"token.text: {token.text}")

token.text: watch


In [69]:
#token text including the whitespace that follows it, if any
print(f"token.text_with_ws: {token.text_with_ws}") 


token.text_with_ws: watch 


In [70]:
#retrieve the index of the token in doc
# doc[5] is the sixth token ("watch"), so token.i is 5.
token = doc[5]
print(f"token.i: {token.i}")


token.i: 5


In [71]:
#character offset at which the token starts within the doc's text
token.idx 
 

16

In [72]:
#retrieve the document that the token belongs to
token.doc 


I don't like to watch football

In [73]:
#the sentence span that contains the token
token.sent 


I don't like to watch football

In [74]:
#whether the token starts a sentence (False here: "watch" sits mid-sentence)
token.is_sent_start 


False

In [75]:
#retrieve the entity type label for the token (empty string here: the doc has no entities)
token.ent_type_

''

In [67]:
# A cell only displays its final expression, so the three checks are bundled
# into one tuple to show all of them: (like_url, like_num, like_email).
token.like_url, token.like_num, token.like_email

False

In [77]:
#if token is out of vocabulary
# NOTE(review): this is True even for a common word like "watch" — presumably
# because the loaded model has no word vectors; confirm against the spaCy
# documentation for Token.is_oov.
token.is_oov

True

In [78]:
# Show the out-of-vocabulary flag next to every token in the doc.
for token in doc:
    oov_flag = token.is_oov
    print(token, oov_flag)

I True
do True
n't True
like True
to True
watch True
football True


In [79]:
# Flag stop words: very common words that carry little meaning on their own.
for token in doc:
    stop_flag = token.is_stop
    print(token, stop_flag)

I True
do True
n't True
like False
to True
watch False
football False


### Span objects contain segments of text, as a sequence of tokens
- Slice a doc object


In [41]:
# New sample sentence (7 tokens) for the span and token-flag examples below.
doc = nlp("I lost myself after I saw you")

In [None]:
#represent phrases or segments of text - contiguous sequence of tokens
# Keep the slice in `span` (the next cell iterates over it) and stay inside the
# 7-token doc: doc[1:4] covers "lost myself after".
span = doc[1:4]
span

In [25]:
# Print every token of `span`, one per line.
for token in span:
    print(token)

### Other features

In [48]:
# Is the first token ("I") upper-case?
doc[0].is_upper

True

In [49]:
# Is the first token ("I") lower-case?
doc[0].is_lower

False

In [50]:
# Is the first token ("I") alphabetic?
doc[0].is_alpha

True

In [None]:
# Digit check on the first token (cell has no recorded output).
doc[0].is_digit

In [None]:
# ASCII check on the first token (cell has no recorded output).
doc[0].is_ascii

In [None]:
# Punctuation check on the first token (cell has no recorded output).
doc[0].is_punct

In [None]:
# Whitespace check on the first token (cell has no recorded output).
doc[0].is_space

In [None]:
# Bracket check on the first token (cell has no recorded output).
doc[0].is_bracket

In [None]:
# Quotation-mark check on the first token (cell has no recorded output).
doc[0].is_quote

In [None]:
# Currency-symbol check on the first token (cell has no recorded output).
doc[0].is_currency

In [None]:
# Does the first token look like a URL? (cell has no recorded output)
doc[0].like_url

In [None]:
# Does the first token look like a number? (cell has no recorded output)
doc[0].like_num

In [None]:
# Does the first token look like an e-mail address? (cell has no recorded output)
doc[0].like_email

In [52]:
# Show each token with its shape pattern (e.g. "X" for "I", "xxx" for "saw").
for token in doc:
    shape_line = f"{token.text}, Shape: {token.shape_}"
    print(shape_line)

I, Shape: X
lost, Shape: xxxx
myself, Shape: xxxx
after, Shape: xxxx
I, Shape: X
saw, Shape: xxx
you, Shape: xxx


In [53]:
#out-of-vocabulary flag for the first token
# NOTE(review): True even for the common word "I" — presumably because the
# loaded model has no word vectors; confirm against the spaCy documentation
# for Token.is_oov.
doc[0].is_oov

True

In [54]:
#whether the first token ("I") is a stop word
doc[0].is_stop

True