Split data in to Tokens

In [21]:
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """We know that machines struggle to deal with raw text data. In fact, it’s almost impossible for machines to deal with anything except for numerical data. So representing text in the form of vectors has always been the most important step in almost all NLP tasks."""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['We', 'know', 'that', 'machines', 'struggle', 'to', 'deal', 'with', 'raw', 'text', 'data', '.', 'In', 'fact', ',', 'it', '’s', 'almost', 'impossible', 'for', 'machines', 'to', 'deal', 'with', 'anything', 'except', 'for', 'numerical', 'data', '.', 'So', 'representing', 'text', 'in', 'the', 'form', 'of', 'vectors', 'has', 'always', 'been', 'the', 'most', 'important', 'step', 'in', 'almost', 'all', 'NLP', 'tasks', '.']


Split data in to Sentences

In [22]:
nlp = English()

# Create the pipeline 'sentencizer' component
#sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe('sentencizer')

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

["When learning data science, you shouldn't get discouraged!", "\nChallenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


Cleaning data by Removing Stopwords

In [38]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ['‘re', 'even', 'everything', 'all', 'have', '’d', 'unless', 'hundred', 'few', 'as', 'might', 'full', 'moreover', 'besides', 'five', 'wherein', 'nevertheless', 'using', 'yours', 'almost']


In [39]:
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [learning, data, science, ,, discouraged, !, 
, Challenges, setbacks, failures, ,, journey, ., got, !]


Lexicon Normalization
Lemmatization : Processing words convert them to roots 

In [40]:
lem = nlp("study eats eating studying")
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

study study
eats eat
eating eat
studying study


Part of Speech (POS) Tagging

In [41]:
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(u"We know that machines struggle to deal with raw text data. In fact, it’s almost impossible for machines to deal with anything except for numerical data. So representing text in the form of vectors has always been the most important step in almost all NLP tasks."""
)

for word in docs:
    print(word.text,word.pos_)

We PRON
know VERB
that SCONJ
machines NOUN
struggle VERB
to PART
deal VERB
with ADP
raw ADJ
text NOUN
data NOUN
. PUNCT
In ADP
fact NOUN
, PUNCT
it PRON
’s VERB
almost ADV
impossible ADJ
for SCONJ
machines NOUN
to PART
deal VERB
with ADP
anything PRON
except SCONJ
for ADP
numerical ADJ
data NOUN
. PUNCT
So ADV
representing VERB
text NOUN
in ADP
the DET
form NOUN
of ADP
vectors NOUN
has AUX
always ADV
been AUX
the DET
most ADV
important ADJ
step NOUN
in ADP
almost ADV
all PRON
NLP PROPN
tasks NOUN
. PUNCT


Entity Recognition (identifies places people, organizations and languages)
use DISPLACY visualizer for more visualization

In [42]:
from spacy import displacy

nytimes= nlp(u"""London City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 595 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities
displacy.render(nytimes, style = "ent",jupyter = True)

In [43]:
Dependency Parsing(How individual words related to each other)

SyntaxError: invalid syntax (<ipython-input-43-06cd580ea11b>, line 1)

In [44]:
docp = nlp ("We know that machines struggle to deal with raw text data. In fact, it’s almost impossible for machines to deal with anything except for numerical data."""
)

for chunk in docp.noun_chunks:
   print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)
displacy.render(docp, style="dep", jupyter= True)

We We nsubj know
machines machines nsubj struggle
raw text data data pobj with
fact fact pobj In
it it nsubj ’s
machines machines nsubj deal
anything anything pobj with
numerical data data pobj for


Representaion of words in numbers called WORD VECTOR Transformation

In [45]:
import en_core_web_sm
nlp = en_core_web_sm.load()
software = nlp(u'software')
print(software.vector.shape)
print(software.vector)

(96,)
[-0.41731012 -0.78826296  0.40462655  0.22977209 -0.0106909  -0.31474382
 -0.08617902  0.22832012 -0.15342835  0.99731594 -0.48220646 -0.5561814
  0.33377352 -0.14201081  0.06877467  0.10804325 -0.9770094   0.5852082
  0.26322627  0.47614518 -0.3084299  -0.550299    0.45373642  0.01026541
  0.167604    0.48353547 -0.95421565 -1.1585242   0.6592267  -0.40608847
  1.0430351  -0.94181913  0.32891953 -0.02638385 -0.32583088 -0.6401886
 -0.36018214  0.0339711  -0.49718386  0.49242705 -0.05592465 -0.62209713
  0.28308567  1.1386962  -0.83685654 -0.8401963   0.38963374 -0.4084627
  2.0165818   0.5334693   0.00843003  0.26143983 -0.22679707 -0.31812215
  0.32522407  0.53762305 -0.99583673 -0.32038808  0.43765157 -0.03065774
 -0.24031067 -0.46443588  0.13583761 -0.48930722 -0.145504   -0.27535477
 -0.6754143  -0.05036486  0.08134395 -0.87394106  0.44734263 -0.6702021
 -0.04337108 -0.13673057  0.8626318  -0.6144985   0.1617612   1.5896024
  0.61101145 -0.95401216  1.133116    0.01036873  0