In [1]:
import spacy

In [2]:
# Load the small English pipeline just to inspect the returned object.
# The result is not bound to a name here; the cell output is its repr.
spacy.load('en_core_web_sm')

<spacy.lang.en.English at 0x20c733ec4a8>

In [4]:
# Load the small English pipeline and keep it as `nlp`; every nlp(...) call
# in the cells below goes through this object.
nlp = spacy.load('en_core_web_sm')

### Read a document or text

In [9]:
# Run the pipeline over an inline string and keep the processed document.
docx = nlp("spacy is a cool tool")

In [10]:
# Display the processed document; it is shown as its original text.
docx

spacy is a cool tool

In [11]:
# Process a second inline string, rebinding `docx` to the new document.
# Nothing is read from disk in this cell; the file example comes later.
docx = nlp("spacy is a amazing tool")

In [12]:
# Display the rebound document to confirm it now holds the second sentence.
docx

spacy is a amazing tool

In [15]:
# Read the whole sample file into a string. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for
# the lifetime of the kernel.
with open('text.txt') as text_file:
    myFile = text_file.read()

In [19]:
# Run the pipeline over the full file contents; the sentence cells below use this document.
doc_file = nlp(myFile)

In [20]:
# Walk the sentence spans detected in the file and print them one per line.
for sent in doc_file.sents:
    print(sent)

this is the text file.
I can write anything in it and as fast as per my speed.
I want to improve my speed.
And I know if I practice a lot I can improve it.


### Sentence tokenization

In [23]:
# Same sentence walk, prefixing each sentence with its 0-based position.
for idx, sent in enumerate(doc_file.sents):
    print(f'{idx}: {sent}')

0: this is the text file.
1: I can write anything in it and as fast as per my speed.
2: I want to improve my speed.
3: And I know if I practice a lot I can improve it.


In [24]:
### Word Tokens

In [25]:
# Iterating the document yields its tokens; print the raw text of each.
for tok in docx:
    print(tok.text)

spacy
is
a
amazing
tool


In [29]:
# Collect every token's text into a list (shown as the cell result).
[tok.text for tok in docx]

['spacy', 'is', 'a', 'amazing', 'tool']

In [30]:
# Naive alternative: split the raw text on single spaces. The result equals
# the token list above for this sentence, which contains no punctuation.
docx.text.split(" ")

['spacy', 'is', 'a', 'amazing', 'tool']

### More about words
    1. .shape_ = the shape of the word as a string, e.g. capitalisation pattern (X = uppercase, x = lowercase)
    2. .is_alpha = returns True if the word consists only of alphabetic characters
    3. .is_stop = returns True if the word is a stop word

In [32]:
# Per-token lexical attributes: text, shape string, alphabetic flag, stop-word flag.
for token in docx:
    print(token.text, token.shape_, token.is_alpha, token.is_stop)

spacy xxxx True False
is xx True True
a x True True
amazing xxxx True False
tool xxxx True False


### Parts of Speech Tagging
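    1. NB: an attribute ending in an underscore (attribute_) returns the readable string representation of that attribute
    2. .pos = integer ID of the coarse-grained part of speech
    3. .pos_ = exposes the Google Universal POS tag (simple)
    4. .tag = integer ID of the fine-grained part of speech
    5. .tag_ = exposes the Treebank tag (detailed), useful for training your own model
    6. Uses
    7. Sentiment Analysis, Homonym Disambiguation, Prediction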

In [48]:
# The surface form "drinks" appears twice in this sentence; the tagging
# output below labels one occurrence as a verb and the other as a noun.
ex1 = nlp("You drinks a drinks")

In [49]:
# For every token show: text, numeric POS id, coarse POS label, fine-grained tag.
for token in ex1:
    print(token.text, token.pos, token.pos_, token.tag_)

You 94 PRON PRP
drinks 99 VERB VBZ
a 89 DET DT
drinks 91 NOUN NNS


In [51]:
# Translate a few tag abbreviations into their human-readable descriptions,
# printing one description per line in the order listed.
for tag in ('DT', 'NNS', 'VBP', 'VBZ'):
    print(spacy.explain(tag))

determiner
noun, plural
verb, non-3rd person singular present
verb, 3rd person singular present


In [53]:
# A sentence with four consecutive "had" tokens, used to inspect how each one is tagged.
excercise1 = nlp("All the faith he had had had had no effect on the outcome of his life")

In [54]:
# Print one (text, fine-grained tag, coarse POS) tuple per token.
for token in excercise1:
    print((token.text, token.tag_, token.pos_))


('All', 'PDT', 'ADJ')
('the', 'DT', 'DET')
('faith', 'NN', 'NOUN')
('he', 'PRP', 'PRON')
('had', 'VBD', 'VERB')
('had', 'VBN', 'VERB')
('had', 'VBN', 'VERB')
('had', 'VBN', 'VERB')
('no', 'DT', 'DET')
('effect', 'NN', 'NOUN')
('on', 'IN', 'ADP')
('the', 'DT', 'DET')
('outcome', 'NN', 'NOUN')
('of', 'IN', 'ADP')
('his', 'PRP$', 'ADJ')
('life', 'NN', 'NOUN')


In [61]:
# Look up the description of VBD, the tag given to the first "had" above.
spacy.explain('VBD')

'verb, past tense'

### Syntactic Dependency
    It helps us understand the relations between tokens

In [63]:
# Short three-word sentence used to inspect dependency labels.
ex3 = nlp("Sally likes Sam")


In [64]:
# Show each token's fine-grained tag, coarse POS and dependency relation label.
for token in ex3:
    print(token.text, token.tag_, token.pos_, token.dep_)

Sally NNP PROPN advmod
likes VBZ VERB ROOT
Sam NNP PROPN dobj


### Visualizing Dependency using displaCy
    from spacy import displacy
    displacy.serve(doc)  # starts a local web server (for scripts / the REPL)
    displacy.render(doc, jupyter=True)  # renders inline in a Jupyter notebook

In [65]:
from spacy import displacy

In [67]:
# Draw the dependency parse inline; "dep" is displaCy's default style,
# spelled out here for clarity.
displacy.render(ex3, style="dep", jupyter=True)

### Lemmatizing
    Text Normalization
    Word inflection == syntactic differences between word forms
    Reducing a word to its base/root form
    Lemmatization == reducing a word to its base form based on its intended meaning
    Stemming == cutting off the prefixes/suffixes to reduce a word to its base form

In [68]:
### Lemmatization

In [69]:
# A handful of similar-looking words, to compare each surface form with its lemma.
# Note: this rebinds `docx` once more.
docx = nlp("study studying studious studio student")

In [72]:
# Show each word next to its lemma and coarse POS label.
# Only these three attributes are printed: tokens have no `s` attribute,
# so referencing one raises AttributeError on the first iteration.
for word in docx:
    print(word.text, word.lemma_, word.pos_)

study study NOUN
studying study VERB
studious studious ADJ
studio studio NOUN
student student NOUN


### Named Entity Recognition or Detection
    Classifying a text into predefined categories or real-world object entities.
    Takes a string of text as input and identifies relevant nouns (people, places, and organizations) that are mentioned in that string.

### Uses
    Classifying or Categorizing contents by getting the relevant tags
    Improve search algorithms
    For content recommendations
    For info extraction
    
    .ents
    .label_
    
    

In [113]:
# Sample text for entity recognition. Adjacent string literals are joined
# into one string, so each sentence can sit on its own source line.
wikitext = nlp(
    "Vishal Bahedia will be going to India on January 29th 2019. "
    "I will be back on 17th February 2019. "
    "I will eat Apple there which is worth 5 rupee."
)

In [114]:
# `.ents` yields the recognised entity spans; print each span's text and label.
for ent in wikitext.ents:
    print(ent.text, ent.label_)

Vishal Bahedia PERSON
India GPE
January 29th 2019 DATE
17th February 2019 DATE
Apple ORG
5 CARDINAL


In [115]:
# Look up what the CARDINAL label (assigned to "5" above) stands for.
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

In [116]:
# Look up the MONEY label for comparison; it does not appear in the entity output above.
spacy.explain('MONEY') 

'Monetary values, including unit'

In [117]:
# Render the text inline using displaCy's entity style ("ent").
displacy.render(wikitext, style="ent", jupyter=True)

In [119]:
# Look up what the PERSON label stands for.
spacy.explain('PERSON')

'People, including fictional'

### Visualizing using displaCy
    for REPL
    for Jupyter notebook
    from spacy import displacy
    displacy.serve(doc)  # for the REPL / scripts
    displacy.render(doc, jupyter=True)  # for displaying inside a Jupyter notebook

In [120]:
# A longer sentence with several coordinated nouns and verbs, for a richer dependency parse.
# Note: this rebinds `docx` once more.
docx = nlp("The rat, the cat and the dog chased a lion and killed it.")

In [121]:
# List every token with its coarse POS label and dependency relation.
for token in docx:
    print(token.text, token.pos_, token.dep_)

The DET det
rat NOUN nsubj
, PUNCT punct
the DET det
cat NOUN conj
and CCONJ cc
the DET det
dog NOUN conj
chased VERB ROOT
a DET det
lion NOUN dobj
and CCONJ cc
killed VERB conj
it PRON dobj
. PUNCT punct


In [122]:
# Draw the dependency parse of the longer sentence inline
# ("dep" is displaCy's default style, stated explicitly).
displacy.render(docx, style="dep", jupyter=True)

In [125]:
# Look up the description of the generic `dep` dependency label.
spacy.explain('dep')

'unclassified dependent'

In [126]:
# displaCy rendering options: compact mode plus the background colour,
# foreground colour and font to use for the drawing.
options = dict(
    compact=True,
    bg='cornflowerblue',
    color='#fff',
    font='Sans Serif',
)

In [127]:
# Re-render the same parse, this time passing the custom `options` dict.
displacy.render(docx, jupyter=True, options=options)

### Semantic Similarity
    object1.similarity(object2)
    Uses:
        Recommendation systems
        Data Preprocessing, e.g. removing duplicates
        

In [128]:
# First single-word document for the similarity comparison.
doc1 = nlp("dog")

In [129]:
# Second single-word document for the similarity comparison.
doc2 = nlp("cat")

In [130]:
# Similarity score between the two single-word documents (a float).
# NOTE(review): `nlp` is the small `en_core_web_sm` pipeline, which per the
# spaCy docs ships without real word vectors — treat the score as rough; confirm.
doc2.similarity(doc1)

0.7344887830914172

In [133]:
# Rebinds `ex1` (previously the POS-tagging example) to five animal words
# for pairwise token similarity.
ex1 = nlp("wolf dog cat bird fish")

In [134]:
# Compare every token with every token (including itself) and print the score.
for left in ex1:
    for right in ex1:
        score = left.similarity(right)
        print(left.text, right.text, "Similarity=", score)

wolf wolf Similarity= 1.0
wolf dog Similarity= 0.5234998
wolf cat Similarity= 0.30953422
wolf bird Similarity= 0.52796584
wolf fish Similarity= 0.05131726
dog wolf Similarity= 0.5234998
dog dog Similarity= 1.0
dog cat Similarity= 0.6250718
dog bird Similarity= 0.4794653
dog fish Similarity= 0.32915172
cat wolf Similarity= 0.30953422
cat dog Similarity= 0.6250718
cat cat Similarity= 1.0
cat bird Similarity= 0.4474157
cat fish Similarity= 0.447517
bird wolf Similarity= 0.52796584
bird dog Similarity= 0.4794653
bird cat Similarity= 0.4474157
bird bird Similarity= 1.0
bird fish Similarity= 0.3541299
fish wolf Similarity= 0.05131726
fish dog Similarity= 0.32915172
fish cat Similarity= 0.447517
fish bird Similarity= 0.3541299
fish fish Similarity= 1.0


In [135]:
# Build (token1 text, token2 text, similarity) records for every ordered pair.
# token2 drives the outer loop, so token1 varies fastest in the result.
mylist = []
for token2 in ex1:
    for token1 in ex1:
        mylist.append((token1.text, token2.text, token1.similarity(token2)))

In [136]:
# Display all pairwise records (5 x 5 = 25 tuples).
mylist

[('wolf', 'wolf', 1.0),
 ('dog', 'wolf', 0.5234998),
 ('cat', 'wolf', 0.30953422),
 ('bird', 'wolf', 0.52796584),
 ('fish', 'wolf', 0.05131726),
 ('wolf', 'dog', 0.5234998),
 ('dog', 'dog', 1.0),
 ('cat', 'dog', 0.6250718),
 ('bird', 'dog', 0.4794653),
 ('fish', 'dog', 0.32915172),
 ('wolf', 'cat', 0.30953422),
 ('dog', 'cat', 0.6250718),
 ('cat', 'cat', 1.0),
 ('bird', 'cat', 0.4474157),
 ('fish', 'cat', 0.447517),
 ('wolf', 'bird', 0.52796584),
 ('dog', 'bird', 0.4794653),
 ('cat', 'bird', 0.4474157),
 ('bird', 'bird', 1.0),
 ('fish', 'bird', 0.3541299),
 ('wolf', 'fish', 0.05131726),
 ('dog', 'fish', 0.32915172),
 ('cat', 'fish', 0.447517),
 ('bird', 'fish', 0.3541299),
 ('fish', 'fish', 1.0)]

In [137]:
import pandas as pd

In [138]:
# Turn the list of (token1, token2, similarity) tuples into a table, one
# row per pair; columns get the default labels 0, 1, 2.
df = pd.DataFrame(data=mylist)

In [139]:
# Peek at the first five rows; the columns still carry the default integer labels.
df.head()

Unnamed: 0,0,1,2
0,wolf,wolf,1.0
1,dog,wolf,0.5235
2,cat,wolf,0.309534
3,bird,wolf,0.527966
4,fish,wolf,0.051317


In [140]:
# Correlation matrix over the numeric columns only. The two token-text
# columns hold strings; recent pandas versions raise on non-numeric columns
# instead of silently dropping them, so select the numeric ones explicitly.
df.select_dtypes(include="number").corr()

Unnamed: 0,2
2,1.0


In [141]:
# Replace the default integer column labels with descriptive names.
# This mutates `df` in place, so earlier displays of `df` are now stale.
df.columns = ["token1", "token2" , "similarity"]

In [142]:
# Show the first five rows again, now with the descriptive column names.
df.head()

Unnamed: 0,token1,token2,similarity
0,wolf,wolf,1.0
1,dog,wolf,0.5235
2,cat,wolf,0.309534
3,bird,wolf,0.527966
4,fish,wolf,0.051317


In [143]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline