In [1]:
!pip install spacy
!python -m spacy download en

Collecting spacy
  Downloading spacy-3.1.3-cp39-cp39-win_amd64.whl (11.6 MB)
Collecting pathy>=0.3.5
  Using cached pathy-0.6.0-py3-none-any.whl (42 kB)
Collecting wasabi<1.1.0,>=0.8.1
  Using cached wasabi-0.8.2-py3-none-any.whl (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.5-cp39-cp39-win_amd64.whl (21 kB)
Collecting thinc<8.1.0,>=8.0.9
  Downloading thinc-8.0.10-cp39-cp39-win_amd64.whl (1.0 MB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.4-cp39-cp39-win_amd64.whl (6.5 MB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.5-cp39-cp39-win_amd64.whl (36 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.5-cp39-cp39-win_amd64.whl (112 kB)
Collecting spacy-legacy<3.1.0,>=3.0.8
  Using cached spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.1-cp39-cp39-win_amd64.whl (451 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Using cached catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting typer<0

2021-10-20 09:16:41.300175: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-10-20 09:16:41.300215: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Tokenization

In [2]:
# Word tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

In [3]:
my_doc

When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!

In [5]:
# Create list of word tokens
token_list = []

for token in my_doc:
    token_list.append(token.text)

print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


## Sentence Tokenization

In [16]:
# sentence tokenization

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# Add the 'sentencizer' component to the pipeline
nlp.add_pipe('sentencizer')

text = """When learning data science, you shouldn't get discouraged! Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []

for sent in doc.sents:
    sents_list.append(sent.text)
    
print(sents_list)

["When learning data science, you shouldn't get discouraged!", "Challenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


## StopWords

In [17]:
#importing stop words from English language.
from spacy.lang.en.stop_words import STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(STOP_WORDS))

Number of stop words: 326


In [21]:
print(STOP_WORDS)

{'they', 'every', 'over', 'regarding', 'becomes', "'ve", 'with', 'meanwhile', 'yourselves', 'under', 'has', 'itself', 'hundred', 'than', 'also', 'nothing', 'seemed', 'neither', 'here', 'due', 'side', 'are', 'that', 'us', 'had', 'his', 'anyway', 'n‘t', 'without', 'by', 'moreover', 'indeed', 'hereby', 'be', 'third', 'and', 'two', 'whole', 're', 'is', 'would', '‘re', 'much', 'hereafter', 'whither', "'s", 'move', "'d", "'m", 'so', 'other', 'down', 'can', 'himself', 'its', 'thence', 'via', '‘d', 'n’t', 'latter', 'get', 'next', 'off', 'in', 'nevertheless', 'although', 'just', 'thereafter', 'whose', 'ten', 'full', 'since', 'ever', "'ll", 'upon', 'thereupon', 'themselves', 'using', 'among', 'thru', 'somehow', 'empty', 'go', 'whereas', 'from', 'more', 'someone', 'ours', 'both', 'always', 'already', 'into', 'were', 'anyhow', 'must', 'below', 'forty', 'along', 'am', 'others', 'hence', 'her', 'therefore', 'on', '’d', 'hers', 'either', 'several', 'there', 'per', 'any', 'whether', "'re", 'why', 'six

## Removing Stopwords

In [24]:
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

filtered_tokens=[]

# filtering stop words and punctuations
for word in doc:
    if word.is_stop==False:
        if word.is_punct==False:
            filtered_tokens.append(word)

print("Filtered Sentence:",filtered_tokens)

Filtered Sentence: [learning, data, science, discouraged, Challenges, setbacks, failures, journey, got]


## Lemmatization

In [26]:
# Implementing lemmatization
lem = nlp("run runs running runner")

# finding lemma for each word
for word in lem:
    print(word.text,"==>" ,word.lemma_)

run ==> 
runs ==> 
running ==> 
runner ==> 


In [27]:
# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

# Implementing lemmatization
lem = nlp("run runs running runner")

# finding lemma for each word
for word in lem:
    print(word.text,"==>" ,word.lemma_)

run ==> run
runs ==> run
running ==> run
runner ==> runner


In [31]:
# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

text = """When learning data science, you shouldn't get discouraged! Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

filtered_tokens=[]

# filtering stop words and punctuations
for word in doc:
    if word.is_stop==False:
        if word.is_punct==False:
            filtered_tokens.append(word)

print("Filtered Tokens:",filtered_tokens)

normalized_tokens=[]
for token in filtered_tokens:
    normalized_tokens.append(token.lemma_)
    
print("Lemmatized Tokens:",normalized_tokens)

Filtered Tokens: [learning, data, science, discouraged, Challenges, setbacks, failures, journey, got]
Lemmatized Tokens: ['learn', 'data', 'science', 'discourage', 'challenge', 'setback', 'failure', 'journey', 'get']


## PoS Tagging

In [32]:
# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm   

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()  

# "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(u"All is well that ends well.")

for word in docs:
    print(word.text,word.pos_)

All DET
is AUX
well ADJ
that DET
ends VERB
well ADV
. PUNCT


## Named Entity Detection

In [49]:
# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm   

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load() 

nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four zip codes there, Mayor Bill de Blasio (D) said Tuesday.The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

In [50]:
for i in nytimes.ents:
    print(i, i.label_)

New York City GPE
Tuesday DATE
At least 285 CARDINAL
September DATE
Brooklyn GPE
four CARDINAL
Bill de Blasio PERSON
Tuesday DATE
Orthodox NORP
6 months old DATE
up to $1,000 MONEY


In [51]:
entities=[(i, i.label_) for i in nytimes.ents]

print(entities)

[(New York City, 'GPE'), (Tuesday, 'DATE'), (At least 285, 'CARDINAL'), (September, 'DATE'), (Brooklyn, 'GPE'), (four, 'CARDINAL'), (Bill de Blasio, 'PERSON'), (Tuesday, 'DATE'), (Orthodox, 'NORP'), (6 months old, 'DATE'), (up to $1,000, 'MONEY')]


In [52]:
#for visualization of Entity detection importing displacy from spacy:
from spacy import displacy

displacy.render(nytimes, style = "ent",jupyter = True)

## Dependency Parsing

In [53]:
# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm   

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load() 

#  "nlp" Object is used to create documents with linguistic annotations.
docp = nlp ("In pursuit of a wall, President Trump ran into one.")

for chunk in docp.noun_chunks:
   print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [54]:
#for visualization of Entity detection importing displacy from spacy:
from spacy import displacy

displacy.render(docp, style="dep", jupyter= True)

## Word Vector Representation

In [55]:
# import en_core_web_sm small spacy model
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Object is used to create documents with linguistic annotations.
mango = nlp(u'mango')

print(mango.vector.shape)

(96,)


In [56]:
mango.vector

array([-0.462305  , -0.97013503, -0.3536405 ,  0.28740364, -0.01573728,
       -0.24513936, -1.215326  , -0.8796606 , -0.33882028, -0.85366464,
        1.1009694 , -0.40891293,  0.22952707,  0.32104927, -0.16520308,
        0.19346984,  0.18104246, -0.25050682, -0.86570626, -0.5158702 ,
        0.13842583, -1.1441295 , -1.2371405 , -0.31056306, -0.77198493,
       -0.7328714 ,  0.821449  ,  0.46671125,  0.46151486, -0.3285221 ,
        0.5737759 ,  0.5633069 ,  0.81746995, -0.1666174 , -0.31984073,
        0.10492463, -1.0577446 ,  0.35842416,  0.47972912, -0.29047596,
       -0.07571032,  1.112559  , -0.21457072,  1.0962675 , -0.1150732 ,
        0.00683655,  0.3471359 ,  0.7762994 , -0.18421805, -0.4036425 ,
        0.42345917,  0.25998825,  0.43403518, -0.3259907 ,  1.1417992 ,
       -0.21782616,  0.6406765 ,  0.25259757, -0.17306823, -0.4783872 ,
        0.89445263,  0.16820912, -0.01807833,  0.81608206, -0.42206132,
       -0.67849445,  0.26927558, -0.5631349 ,  0.6785864 ,  1.02

In [61]:
import en_core_web_sm

nlp = en_core_web_sm.load()

## try medium or large spacy english models

doc1 = nlp("I like apples.")

doc2 = nlp("I hate oranges.")

# cosine similarity
doc1.similarity(doc2) 

  doc1.similarity(doc2)


0.766287121319621