# Module 1 Get Started on spaCy

In [None]:
import spacy

## Load Language Model

In [None]:
nlp = spacy.load("en")

# Module 2 Linguistic Features

## Process text/corpus with nlp

In [None]:
text = """Dostoevsky was the son of a doctor. 
His parents were very hard-working and deeply religious people,
but so poor that they lived with their five children in only
two rooms. The father and mother spent their evenings
in reading aloud to their children, generally from books of
a serious character."""

In [None]:
doc = nlp(text)
doc

In [None]:
# Get text from file

text = open('sample.txt').read()
text

In [None]:
doc = nlp(text)
doc

## Tokenization


In [None]:
# Word Tokenization

doc = nlp(u'Today is a great day')
[word.text for word in doc]

In [None]:
# text = open('sample.txt').read()
# doc = nlp(text)

word_tokens = [token.text for token in doc]
word_tokens

In [None]:
# Sentence Tokenization

doc = nlp(u"This is a sentence. This is another sentence.")
for sent in doc.sents:
    print(sent.text)

In [None]:

text = open('sample.txt').read()
doc = nlp(text)
for sent in doc.sents:
    print(sent.text)


## Stop Words

In [None]:
doc[1].is_stop

In [None]:
[word.is_stop for word in doc]

In [None]:
nlp.vocab["the"].is_stop = True

In [None]:
[word.is_stop for word in doc]

## Lemmatization

In [None]:
# doc = nlp(u'running run')
doc = nlp(u'meaning mean')
doc = nlp(u'meanness meaning mean')

for token in doc:
	print(token.text,token.lemma_)

In [None]:
doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")

for token in doc:
    print(token, token.lemma_)

In [None]:
# text = open('sample.txt').read()
# doc = nlp(text)

for token in doc:
    print(token, token.lemma_)

## Part of Speech (POS)

In [39]:
# text = open('sample.txt').read()
# doc = nlp(text)

doc = nlp(u'The cat sit on the mat')
for token in doc:
    print(token.text,token.pos_)

The DET
cat NOUN
sit VERB
on ADP
the DET
mat NOUN


In [40]:

for token in doc:
    print(token.text,token.tag_)

The DT
cat NN
sit VBP
on IN
the DT
mat NN


## Noun Chuck

In [42]:
# text = open('sample.txt').read()
# doc = nlp(text)

doc = nlp(u'The cat sit on the mat')
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

The cat cat nsubj sit
the mat mat pobj on


In [43]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


##  Visualize Dependency and POS 

In [None]:
from spacy import displacy

doc = nlp(u'The cat sit on the mat')
displacy.serve(doc, style='dep',displacy.serve(doc, style='dep')

In [None]:
from spacy import displacy

doc = nlp(u'The cat sit on the mat')
options = {'compact': True, 'bg': '#09a3d5',
           'color': 'white', 'font': 'Source Sans Pro'}
displacy.serve(doc, style='dep', options=options)

In [None]:
from spacy import displacy

doc = nlp(u'The cat sit on the mat')
html = displacy.render([doc], style='dep', jupyter=True)

## Name Entity Recognition (NER)

In [None]:
doc = nlp(u'Lee Kuan Yew is the prime minister for Singapore')

doc.ents

In [None]:
from spacy import displacy

doc = nlp(u'Lee Kuan Yew is the prime minister for Singapore')

doc.user_data['title'] = 'This is a title'
displacy.serve(doc, style='ent')

In [None]:
from spacy import displacy

doc = nlp(u'Lee Kuan Yew is the prime minister for Singapore')

colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
options = {'ents': ['ORG'], 'colors': colors}
displacy.serve(doc, style='ent', options=options)

In [None]:
from spacy import displacy

doc = nlp(u'Lee Kuan Yew is the prime minister for Singapore')

html = displacy.render([doc], style='ent', jupyter=True)

In [None]:
from spacy import displacy

doc = nlp(u'Lee Kuan Yew is the prime minister for Singapore')

colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
options = {'ents': ['ORG'], 'colors': colors}
html = displacy.render([doc], style='ent', jupyter=True,options=options)

# Module 3 Processing Pipelines

## Default Pipeline

In [None]:
nlp.pipeline

In [None]:
nlp.pipe_names

## Disable Components

In [None]:
nlp = spacy.load('en', disable=['parser', 'tagger'])
doc = nlp(u'The cat sit on the mat')
for token in doc:
    print(token.text,token.pos_)

In [None]:
nlp = spacy.load('en')
doc = nlp(u'The cat sit on the mat')
for token in doc:
    print(token.text,token.pos_)

In [None]:
nlp = spacy.load('en')
nlp.remove_pipe('tagger')
doc = nlp(u'The cat sit on the mat')
for token in doc:
    print(token.text,token.pos_)


## Rename Components

In [None]:
nlp = spacy.load('en')
nlp.rename_pipe('ner', 'entityrecognizer')
nlp.pipe_names

## Adding Custom Component

In [None]:
def my_component(doc):
    print("After tokenization, this doc has %s tokens." % len(doc))
    if len(doc) < 10:
        print("This is a pretty short document.")
    return doc



In [None]:
nlp = spacy.load('en')
nlp.add_pipe(my_component, name='print_info', first=True)
nlp.pipe_names


In [None]:
doc = nlp(u'The cat sit on the mat')
doc

# Module 4 Vectors & Similarity

## Gensim Word2Vec

In [None]:
from gensim.models import Word2Vec
from nltk.corpus import gutenberg

embedding = Word2Vec(gutenberg.sents(),min_count=1, window=5, size=32)

print(embedding.most_similar('man', topn=5))
print(embedding.most_similar('woman', topn=5))


## Similarity

In [37]:
nlp = spacy.load('en_core_web_md')
tokens = nlp(u'dog cat banana afskfsd')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.80168533
dog banana 0.24327648
dog afskfsd 0.0
cat dog 0.80168533
cat cat 1.0
cat banana 0.28154367
cat afskfsd 0.0
banana dog 0.24327646
banana cat 0.2815437
banana banana 1.0
banana afskfsd 0.0
afskfsd dog 0.0
afskfsd cat 0.0
afskfsd banana 0.0
afskfsd afskfsd 1.0


## Word Vector

In [38]:
nlp = spacy.load('en_core_web_md')

tokens = nlp(u'dog cat banana afskfsd')
tokens[1].vector

array([-0.15067  , -0.024468 , -0.23368  , -0.23378  , -0.18382  ,
        0.32711  , -0.22084  , -0.28777  ,  0.12759  ,  1.1656   ,
       -0.64163  , -0.098455 , -0.62397  ,  0.010431 , -0.25653  ,
        0.31799  ,  0.037779 ,  1.1904   , -0.17714  , -0.2595   ,
       -0.31461  ,  0.038825 , -0.15713  , -0.13484  ,  0.36936  ,
       -0.30562  , -0.40619  , -0.38965  ,  0.3686   ,  0.013963 ,
       -0.6895   ,  0.004066 , -0.1367   ,  0.32564  ,  0.24688  ,
       -0.14011  ,  0.53889  , -0.80441  , -0.1777   , -0.12922  ,
        0.16303  ,  0.14917  , -0.068429 , -0.33922  ,  0.18495  ,
       -0.082544 , -0.46892  ,  0.39581  , -0.13742  , -0.35132  ,
        0.22223  , -0.144    , -0.048287 ,  0.3379   , -0.31916  ,
        0.20526  ,  0.098624 , -0.23877  ,  0.045338 ,  0.43941  ,
        0.030385 , -0.013821 , -0.093273 , -0.18178  ,  0.19438  ,
       -0.3782   ,  0.70144  ,  0.16236  ,  0.0059111,  0.024898 ,
       -0.13613  , -0.11425  , -0.31598  , -0.14209  ,  0.0281

In [None]:
for token in tokens:
    print(token.text, token.vector_norm)

# Module 5 Machine Learning using spaCy

In [1]:
import pandas as pd


dfdf  ==  pdpd..read_csvread_csv('research_paper.csv')
df.head()

SyntaxError: invalid syntax (<ipython-input-1-0e58a46824a7>, line 3)