# Installing the libraries

- spaCy: https://spacy.io/

In [1]:
%%capture
!pip install spacy --upgrade

In [2]:
import numpy as np
import spacy

print(spacy.__version__)

3.2.2


In [6]:
#!python -m spacy donwload pt_core_web_sm
#!python -m spacy donwload fr_core_web_sm
%%capture
!python -m spacy download en_core_web_sm

# POS (part-of-speech)

- POS (part-of-speech): noun, adjective, verb
- It is important to find named entities
- Tags: https://ashutoshtripathi.com/2020/04/13/parts-of-speech-tagging-and-dependency-parsing-using-spacy-nlp/

In [9]:
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x7fdb8dbd8ed0>

In [8]:
doc = nlp("I am learning natural language processing. The course is in London")

## Legend

- lemma: "root" of the word
- pos: part-of-speech  
- tag: morfological information (present, future, past)
- dep: syntatic dependency
- shape: lowercase, uppercasa
- alpha: if it is alphanumeric
- stop: if it is a stop word

In [15]:
for token in doc:
    print(token.text,token.pos_,token.lemma_,token.tag_,token.dep_,token.shape_,token.is_alpha,token.is_stop)

I PRON I PRP nsubj X True True
am AUX be VBP aux xx True True
learning VERB learn VBG ROOT xxxx True False
natural ADJ natural JJ amod xxxx True False
language NOUN language NN compound xxxx True False
processing NOUN processing NN dobj xxxx True False
. PUNCT . . punct . False False
The DET the DT det Xxx True True
course NOUN course NN nsubj xxxx True False
is AUX be VBZ ROOT xx True True
in ADP in IN prep xx True True
London PROPN London NNP pobj Xxxxx True False


In [18]:
for token in doc:
    if token.pos_ == "PROPN":
        print(token.text)

London


In [19]:
for token in doc:
    if token.pos_ == "VERB":
        print(token.text)

learning


# Lemmatization and stemming

- Lemmatization: meaning of the word based on the dictionary (morphological analysis) - extract the base word
- Stemming: extract the root of the word

In [27]:
import nltk
stemmer = nltk.stem.PorterStemmer()

In [30]:
for token in doc:
    print(f"Original : {token.text:10}", f"Lemmatization : {token.lemma_:10}", f"Stemming : {stemmer.stem(token.text)}")

Original : I          Lemmatization : I          Stemming : I
Original : am         Lemmatization : be         Stemming : am
Original : learning   Lemmatization : learn      Stemming : learn
Original : natural    Lemmatization : natural    Stemming : natur
Original : language   Lemmatization : language   Stemming : languag
Original : processing Lemmatization : processing Stemming : process
Original : .          Lemmatization : .          Stemming : .
Original : The        Lemmatization : the        Stemming : the
Original : course     Lemmatization : course     Stemming : cours
Original : is         Lemmatization : be         Stemming : is
Original : in         Lemmatization : in         Stemming : in
Original : London     Lemmatization : London     Stemming : london


In [31]:
doc1 = nlp("learn learning watch watching watched")
[token.lemma_ for token in doc1],[stemmer.stem(token.text) for token in doc1]

(['learn', 'learn', 'watch', 'watch', 'watch'],
 ['learn', 'learn', 'watch', 'watch', 'watch'])

# Named-entity recognition (NER)

- List of tags: https://towardsdatascience.com/named-entity-recognition-ner-using-spacy-nlp-part-4-28da2ece57c6

In [46]:
doc2 = nlp("IBM is a US company on information technology. Is is located in San Franscisco and revenue in 2018 was approximately 320 billion dolars")

In [47]:
for ent in doc2.ents:
    print(f"Original Entity: {ent.text:20}", f"Label : {ent.label_:10}")

Original Entity: IBM                  Label : ORG       
Original Entity: US                   Label : GPE       
Original Entity: San Franscisco       Label : GPE       
Original Entity: 2018                 Label : DATE      
Original Entity: approximately 320 billion Label : MONEY     


In [48]:
from spacy import displacy
displacy.render(doc2,style = "ent",jupyter = True)

In [51]:
doc3 = nlp("Bill Gates was born in Seattle on 1955-10-28 and is the founder of Microsoft")
displacy.render(doc3, style = "ent",jupyter = True)

In [52]:
for ent in doc3.ents:
    print(f"Original Entity: {ent.text:20}", f"Label : {ent.label_:10}")

Original Entity: Bill Gates           Label : PERSON    
Original Entity: Seattle              Label : GPE       
Original Entity: 1955-10-28           Label : DATE      
Original Entity: Microsoft            Label : ORG       


In [53]:
for ent in doc3.ents:
    if ent.label_ == "PERSON":
        print(f"Original Entity: {ent.text:20}", f"Label : {ent.label_:10}")

Original Entity: Bill Gates           Label : PERSON    


# StopWords

- Words that appear very often and don't help to understand the context of the document

In [54]:
from spacy.lang.en.stop_words import STOP_WORDS

print(STOP_WORDS)

{'yet', 'with', 'beforehand', 'one', 'everywhere', 'whence', 'call', 'will', 'during', 'namely', 'nine', 'n‘t', 'less', 'again', 'first', 'your', '‘d', 'noone', 'nowhere', 'had', 'below', "'ll", 'whatever', 'thereby', 'doing', 'itself', 'thus', "'re", 'really', '’ll', 'show', 'yourselves', 'also', 'ca', 'take', 'its', 'everything', 'anyhow', 'eleven', 're', 'over', 'since', 'been', 'before', 'even', 'on', 'therefore', 'made', 'being', 'a', 'he', 'within', '’m', 'never', '‘m', 'every', 'except', 'whoever', 'became', 'six', 'please', 'for', 'n’t', 'yours', 'onto', 'these', '‘ve', 'however', 'mine', 'many', 'though', 'others', 'part', 'whether', 'once', 'not', 'still', 'go', 'unless', 'through', 'otherwise', 'so', 'last', 'all', 'five', 'how', 'own', '’ve', 'after', 'sometime', 'should', 'more', 'something', 'such', 'former', 'be', 'here', 'various', 'wherein', 'thence', 'see', 'seems', 'seemed', 'anyone', 'themselves', 'each', 'this', 'whereafter', 'where', 'either', 'hereby', '‘s', 'not

In [58]:
len(STOP_WORDS)

326

In [56]:
"it" in STOP_WORDS

True

In [59]:
nlp.vocab["it"].is_stop,nlp.vocab["walk"].is_stop

(True, False)

In [61]:
doc4 = nlp("I am learning natural language processing. The course is in London")
for token in doc:
    if nlp.vocab[token.text].is_stop:
        print(token.text)

I
am
The
is
in


In [62]:
doc4 = nlp("I am learning natural language processing. The course is in London")
for token in doc:
    if not nlp.vocab[token.text].is_stop:
        print(token.text)

learning
natural
language
processing
.
course
London


# Dependency parsing

- Parent-child relation

In [70]:
doc5 = nlp("book a ticket from London to Paris")
for token in doc5:
        print(f"Original Entity: {token.text:20}", f"Dep : {token.dep_:10}")

Original Entity: book                 Dep : ROOT      
Original Entity: a                    Dep : det       
Original Entity: ticket               Dep : dobj      
Original Entity: from                 Dep : prep      
Original Entity: London               Dep : pobj      
Original Entity: to                   Dep : prep      
Original Entity: Paris                Dep : pobj      


In [71]:
origin = doc5[4]
destiny = doc5[6]
print(origin,destiny)

London Paris


In [72]:
# ancestors -- > Atalar
list(origin.ancestors),list(destiny.ancestors)

([from, ticket, book], [to, book])

---

In [75]:
doc5[0],doc5[2],doc5[0].is_ancestor(doc5[2])

(book, ticket, True)

In [91]:
doc6 = nlp("book a table for the restaurant and a taxi to the hotel")

tasks = doc6[2],doc6[8]
locations = doc6[5],doc6[11]

print(tasks,locations)

(table, taxi) (restaurant, hotel)


In [92]:
for local in locations:
    print("-----",local,"-----")
    for obj in local.ancestors:
        print(obj)

----- restaurant -----
for
table
book
----- hotel -----
to
taxi
restaurant
for
table
book


In [88]:
for local in locations:
    for obj in local.ancestors:
        if obj in tasks:
            print("Reservation of a {} to the {}".format(obj,local))
            break

Reservation of a table to the restaurant
Reservation of a taxi to the hotel


In [90]:
doc6[5],list(doc6[5].children)

(restaurant, [the, and, taxi])

---

In [93]:
from spacy import displacy

In [94]:
doc6

book a table for the restaurant and a taxi to the hotel

In [96]:
# !!!
doc6[5],list(doc6[5].children)

(restaurant, [the, and, taxi])

In [97]:
displacy.render(doc6, style = "dep",jupyter = True,options = {"distance":90})

In [103]:
doc6[2],list(doc6[2].ancestors)

(table, [book])

In [101]:
doc6[2],list(doc6[2].children)

(table, [a, for])

---

In [104]:
doc = nlp("What places can we visit in London and stay in Paris?")
locations = doc[6],doc[10]
actions = doc[4],doc[8]
print(locations,actions)

(London, Paris) (visit, stay)


In [112]:
[i for i in local.ancestors for local in locations],[i for i in local.ancestors for local in actions]

([in, in, stay, stay, visit, visit], [in, in, stay, stay, visit, visit])

In [106]:
for local in locations:
    #print(local)
    for action in local.ancestors:
        if action in actions:
            print("{} to {}".format(local,action))
            break

London to visit
Paris to stay


In [113]:
displacy.render(doc, style = "dep",jupyter = True,options = {"distance":90})

In [123]:
# doc[4] --> visit
# doc[-4] --> stay

[i for i in doc[4].ancestors],[i for i in doc[-4].ancestors],list(doc[-4].children)

([], [visit], [in])

# Similarity between words and sentences

- spaCy uses the GloVe algorithm (Global Vectors for Word Representation)
- Original paper: https://nlp.stanford.edu/pubs/glove.pdf

In [126]:
from warnings import filterwarnings
filterwarnings("ignore")

w1 = nlp("hello")
w2 = nlp("hi")
w3 = nlp("or")

In [128]:
w1.similarity(w2),w2.similarity(w1)

(0.6864077474207763, 0.6864077474207763)

In [129]:
w1.similarity(w3)

-0.030911659914285024

In [130]:
text1 = nlp('When will the new movie be released?')
text2 = nlp('The new movie will be released next month')
text3 = nlp('What color is the car?')

In [132]:
text1.similarity(text2),text1.similarity(text3)

(0.7480209003398514, 0.34528153350463103)

In [133]:
nlp("New York").similarity(nlp("Nw Yok"))

0.6932173297338927

---

In [134]:
text = nlp("cat dog horse person")

In [140]:
for token1 in text:
    print("----",token1,"----")
    for token2 in text:
        similarity = np.round(token1.similarity(token2) * 100,4)
        print("{} is {}% similar to {}".format(token1,similarity,token2))
    print()

---- cat ----
cat is 100.0% similar to cat
cat is 51.4044% similar to dog
cat is 62.119% similar to horse
cat is 35.2274% similar to person

---- dog ----
dog is 51.4044% similar to cat
dog is 100.0% similar to dog
dog is 57.2139% similar to horse
dog is 37.0158% similar to person

---- horse ----
horse is 62.119% similar to cat
horse is 57.2139% similar to dog
horse is 100.0% similar to horse
horse is 40.7988% similar to person

---- person ----
person is 35.2274% similar to cat
person is 37.0158% similar to dog
person is 40.7988% similar to horse
person is 100.0% similar to person



# Tokenization

In [141]:
document1 = nlp('I am learning natural language processing. The course is in London. Ph.d John is coming')
for token in document1:
    print(token)

I
am
learning
natural
language
processing
.
The
course
is
in
London
.
Ph.d
John
is
coming


In [142]:
# Ph.d !!!
document2 = 'I am learning natural language processing. The course is in London. Ph.d John is coming'
document2.split('.')

['I am learning natural language processing',
 ' The course is in London',
 ' Ph',
 'd John is coming']