In [52]:
import spacy

### Tokenizer Implementation
1. Create a blank spaCy pipeline object with `spacy.blank("en")`
2. A blank pipeline contains only a tokenizer by default
3. Calling the pipeline on a text splits it into tokens, which are stored in a `Doc`

In [53]:
## Start from an empty English pipeline and tokenize one sentence
nlp = spacy.blank("en")
doc = nlp("Dr. Strange loves pav bhaji as it just costs $5 per plate!")
## Iterating a doc yields its tokens; print them one per line
print(*doc, sep="\n")

Dr.
Strange
loves
pav
bhaji
as
it
just
costs
$
5
per
plate
!


### Basic Tokenization 
1. Tokenization is rule-based: the text is split using language-specific rules
2. The rules split prefixes and suffixes off each chunk and handle exceptions (special cases)

In [54]:
## Inspect the class of the pipeline object returned by spacy.blank("en")
type(nlp)

spacy.lang.en.English

In [55]:
## Tokenize a short sentence; the following cells inspect its tokens by index
doc = nlp("Tony gave two $ to Peter")

In [56]:
## A doc can be indexed like a sequence; position 0 is the first token
token0 = doc[0]
token0

Tony

In [57]:
## Keep the third token (index 2) around; the next cell checks its like_num flag
token2 = doc[2]
token2

two

In [58]:
## like_num is True for number-like tokens, including numbers spelled out
## as words (here the token is "two")
token2.like_num

True

In [59]:
## is_currency flags currency symbols (here the token at index 3 is "$")
token3 = doc[3]
token3.is_currency

True

In [60]:
## One line per token: its position in the doc followed by a few lexical flags
for token in doc:
    print(
        f"{token!s} index: {token.i} "
        f"is_alpha: {token.is_alpha} "
        f"is_punct: {token.is_punct} "
        f"like_num: {token.like_num} "
        f"is_currency: {token.is_currency}"
    )

Tony index: 0 is_alpha: True is_punct: False like_num: False is_currency: False
gave index: 1 is_alpha: True is_punct: False like_num: False is_currency: False
two index: 2 is_alpha: True is_punct: False like_num: True is_currency: False
$ index: 3 is_alpha: False is_punct: False like_num: False is_currency: True
to index: 4 is_alpha: True is_punct: False like_num: False is_currency: False
Peter index: 5 is_alpha: True is_punct: False like_num: False is_currency: False


In [61]:
## Read the sample file as a list of lines (readlines keeps each trailing "\n").
## The encoding is passed explicitly so the result does not depend on the
## platform's default locale encoding.
with open("dummy_data.txt", encoding="utf-8") as f:
    text = f.readlines()
text

['High School Match Details\n',
 '------------------------------------------------------------------------------------------\n',
 'S.No \t\t\t\tName \t\t\t\t\tEmail\n',
 '------------------------------------------------------------------------------------------\n',
 '1. \t\t\t\tMarie\t\t\t\t\tmarie@gmail.com\n',
 '2. \t\t\t\tChris\t\t\t\t\tchris12@gmail.com\n',
 '3.\t\t\t\tPam\t\t\t\t\tpam_1@gmail.com\n',
 '4. \t\t\t\tDev\t\t\t\t\tdev.indie@gmail.com']

In [62]:
## Collapse the list of lines into one string so the whole file can be
## tokenized with a single nlp() call; the lines are joined with a space.
## The isinstance guard keeps the cell safe to re-run: joining an already
## joined string would insert a space between every character.
if not isinstance(text, str):
    text = " ".join(text)

In [63]:
## Tokenize the joined text and keep the tokens flagged as email-like
doc = nlp(text)
emails = [token.text for token in doc if token.like_email]
emails

['marie@gmail.com',
 'chris12@gmail.com',
 'pam_1@gmail.com',
 'dev.indie@gmail.com']

### Tokenization can only split the text into pieces; it cannot modify the text

In [64]:
from spacy.symbols import ORTH

## A special case may only re-split the original string: the ORTH pieces have
## to spell "gimme" when put back together, so "give" + "me" cannot be used here
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

## "gimme" now comes out as the two tokens "gim" and "me"
doc = nlp("yo gimme two cheese bursts with extra topping mushrooms peppers and olives")
tokens = [tok.text for tok in doc]
tokens

['yo',
 'gim',
 'me',
 'two',
 'cheese',
 'bursts',
 'with',
 'extra',
 'topping',
 'mushrooms',
 'peppers',
 'and',
 'olives']

In [65]:
## Add a sentence-splitting component so that doc.sents can be iterated.
## The membership check makes the cell safe to re-run: adding a second
## component under the name "sentencizer" to the same pipeline raises an error.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
doc = nlp("Dr. Strange loves pav bhaji as it just costs $5 per plate!")
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji as it just costs $5 per plate!
