In [1]:
import spacy

In [2]:
spacy.cli.download('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
nlp = spacy.load('en_core_web_sm')

# Tokenization

In [4]:
doc = nlp("Apple is looking to buy UK startup for $1 billion")

In [5]:
doc

Apple is looking to buy UK startup for $1 billion

In [6]:
for i in doc:
    print(i.text)   

Apple
is
looking
to
buy
UK
startup
for
$
1
billion


# Adding special case tokenization rule

In [4]:
from spacy.symbols import ORTH

In [8]:
doc2 = nlp("gimme that")
doc2

gimme that

In [9]:
for token in doc2:
    print(token.text)

gimme
that


In [12]:
special_case = [{ORTH: "gim"}, {ORTH: "me"}]

In [13]:
special_case

[{65: 'gim'}, {65: 'me'}]

In [14]:
nlp.tokenizer.add_special_case("gimme", special_case)

In [15]:
for token in nlp("gimme that"):
    print(token.text)

gim
me
that


# Parts Of Speech(POS)

In [5]:
from spacy import displacy

In [17]:
doc3 = nlp("Python is a programming language. Current year is 2024. Dollar symbol is $")
doc3

Python is a programming language. Current year is 2024. Dollar symbol is $

In [18]:
for token in doc3:
    print(token.text)

Python
is
a
programming
language
.
Current
year
is
2024
.
Dollar
symbol
is
$


In [19]:
for token in doc3:
    print(token, "->", token.pos_)

Python -> PROPN
is -> AUX
a -> DET
programming -> NOUN
language -> NOUN
. -> PUNCT
Current -> ADJ
year -> NOUN
is -> AUX
2024 -> NUM
. -> PUNCT
Dollar -> NOUN
symbol -> NOUN
is -> AUX
$ -> SYM


In [20]:
for token in doc3:
    print(token, "->", token.pos)

Python -> 96
is -> 87
a -> 90
programming -> 92
language -> 92
. -> 97
Current -> 84
year -> 92
is -> 87
2024 -> 93
. -> 97
Dollar -> 92
symbol -> 92
is -> 87
$ -> 99


We see that dollar symbol both have number 92 -> ie they both are noun(same pos) and noun is denoted by number 92

In [21]:
displacy.serve(doc3, style="dep") #to display in form of chart




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



# Stopwords