In [1]:
import spacy

In [2]:
spacy.cli.download('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
nlp = spacy.load('en_core_web_sm')

# Tokenization

In [4]:
doc = nlp("Apple is looking to buy UK startup for $1 billion")

In [5]:
doc

Apple is looking to buy UK startup for $1 billion

In [6]:
for i in doc:
    print(i.text)   

Apple
is
looking
to
buy
UK
startup
for
$
1
billion


# Adding special case tokenization rule

In [4]:
from spacy.symbols import ORTH

In [8]:
doc2 = nlp("gimme that")
doc2

gimme that

In [9]:
for token in doc2:
    print(token.text)

gimme
that


In [12]:
special_case = [{ORTH: "gim"}, {ORTH: "me"}]

In [13]:
special_case

[{65: 'gim'}, {65: 'me'}]

In [14]:
nlp.tokenizer.add_special_case("gimme", special_case)

In [15]:
for token in nlp("gimme that"):
    print(token.text)

gim
me
that


# Parts Of Speech(POS)

In [5]:
from spacy import displacy

In [17]:
doc3 = nlp("Python is a programming language. Current year is 2024. Dollar symbol is $")
doc3

Python is a programming language. Current year is 2024. Dollar symbol is $

In [18]:
for token in doc3:
    print(token.text)

Python
is
a
programming
language
.
Current
year
is
2024
.
Dollar
symbol
is
$


In [19]:
for token in doc3:
    print(token, "->", token.pos_)

Python -> PROPN
is -> AUX
a -> DET
programming -> NOUN
language -> NOUN
. -> PUNCT
Current -> ADJ
year -> NOUN
is -> AUX
2024 -> NUM
. -> PUNCT
Dollar -> NOUN
symbol -> NOUN
is -> AUX
$ -> SYM


In [20]:
for token in doc3:
    print(token, "->", token.pos)

Python -> 96
is -> 87
a -> 90
programming -> 92
language -> 92
. -> 97
Current -> 84
year -> 92
is -> 87
2024 -> 93
. -> 97
Dollar -> 92
symbol -> 92
is -> 87
$ -> 99


We see that dollar symbol both have number 92 -> ie they both are noun(same pos) and noun is denoted by number 92

In [21]:
displacy.serve(doc3, style="dep") #to display in form of chart




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



# Stopwords

In [6]:
from spacy.lang.en.stop_words import STOP_WORDS

In [7]:
print(STOP_WORDS)

{'side', 'twenty', 'top', 'forty', 'put', 'which', 'any', 'his', 'nor', 'nevertheless', 'due', 'hereafter', 'ca', 'of', 'seem', 'in', 'other', 'whenever', 'the', 'would', 'out', '‘ve', 'always', 'various', 'though', 'still', 'there', 'within', 'beforehand', 'to', 'with', 'if', 'nobody', 'see', 'first', 'from', 'he', "'m", 'much', 'fifteen', 'nothing', 'say', 'sixty', "'d", 'else', 'everyone', 'really', 'anything', 'under', 'does', 'make', 'after', "n't", 'full', 'only', '’ll', 'yet', 'none', 're', 'three', '‘re', 'wherein', 'hence', 'beyond', 'get', 'even', 'anyway', 'up', 'namely', 'becoming', 'off', 'hereby', '’re', 'further', 'since', 'without', 'your', 'both', 'either', 'so', 'them', 'others', 'i', 'against', 'neither', 'between', 'six', 'alone', 'whither', 'nowhere', 'for', 'thereupon', 'behind', 'below', 'part', 'herein', '‘ll', 'take', 'be', 'has', 'often', 'unless', 'became', 'here', 'thence', 'were', '‘s', 'indeed', 'hereupon', 'might', 'anywhere', 'an', '‘m', "'ve", 'over', "

In [9]:
"in" in STOP_WORDS #as "in" is a stop word it prints True

True

In [10]:
"apple" in STOP_WORDS #as "apple" is a not a stop word it prints False

False

In [11]:
nlp.vocab['apple'].is_stop #other method to check if a word is a stop word

False

In [12]:
doc4 = nlp("Python is a programming language. I am learning Natural Language Processing")

In [13]:
doc4

Python is a programming language. I am learning Natural Language Processing

In [14]:
for token in doc4:
    print(token.text)

Python
is
a
programming
language
.
I
am
learning
Natural
Language
Processing


In [16]:
for token in doc4:
    if token.is_stop == True: #if token is a stop word
        print(token.text)

is
a
I
am


In [17]:
for token in doc4:
    if token.is_stop == False: #if token is not a stop word
        print(token.text)

Python
programming
language
.
learning
Natural
Language
Processing


# Name Entity Recognition

In [18]:
doc5 = nlp("Apple is looking at buying U.K. startup for $1 billion")
doc5

Apple is looking at buying U.K. startup for $1 billion

In [19]:
for token in doc5:
    print(token.text, "->", token.pos_)

Apple -> PROPN
is -> AUX
looking -> VERB
at -> ADP
buying -> VERB
U.K. -> PROPN
startup -> NOUN
for -> ADP
$ -> SYM
1 -> NUM
billion -> NUM


In [21]:
from spacy import displacy

In [22]:
displacy.render(doc5,style="ent",jupyter=True) #ent means entity

We see that entities are highlighted in different colors

In [24]:
for entity in doc5.ents:
    if entity.label_=='ORG':
        print(entity.text)

Apple


# Lemmatization


Lemmatization is a linguistic process that involves reducing words to their base or root form, known as the lemma. The goal is to group together different forms of a word so they can be analyzed as a single item. This helps in tasks like text analysis, information retrieval, and natural language processing.

For example, consider the words "running," "ran," and "runs." The lemma for all these words is "run." Lemmatization would convert all these variations to the base form "run."