### **Traditional (Rule-Based NLP using NLTK and Spacy)**

In [1]:
import nltk
import spacy

**1. Tokenization- The process of breaking a text down into tokens is called tokenization.**

In [None]:
#using spacy
nlp = spacy.load("en_core_web_sm")
text = "Mary, don’t slap the green witch"
print([str(token) for token in nlp(text.lower())])

['mary', ',', 'do', 'n’t', 'slap', 'the', 'green', 'witch']


In [None]:
#using nltk
from nltk.tokenize import word_tokenize
text = "Mary, don’t slap the green witch"
word_tokenize(text)


['Mary', ',', 'don', '’', 't', 'slap', 'the', 'green', 'witch']

In [None]:
#Tokenizing tweets using NLTK
from nltk.tokenize import TweetTokenizer
tweet="Snow White and the Seven Degrees #MakeAMovieCold@midnight: )"
tokenizer = TweetTokenizer()
print(tokenizer.tokenize(tweet.lower()))

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':', ')']


 **2. Stopword Removal-Stopwords such as articles and
prepositions serve mostly a grammatical purpose, like filler holding the content words.**

In [None]:
#using spacy
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words
text = " we will show how to remove stopwords using spacy library"

lst=[]
for token in text.split():
    if token.lower() not in stopwords:    #checking whether the word is not
        lst.append(token)                    #present in the stopword list.
print(lst)

['remove', 'stopwords', 'spacy', 'library']


In [None]:
#using nltk
from nltk.corpus import stopwords
stopwords=stopwords.words('english')
text = " we will show how to remove stopwords using spacy library"

lst=[]
for token in text.split():
    if token.lower() not in stopwords:    #checking whether the word is not
        lst.append(token)                    #present in the stopword list.
print(lst)

['show', 'remove', 'stopwords', 'using', 'spacy', 'library']


**3. Unigrams, Bigrams, Trigrams, …, N-grams**
N grams are fixed length (n) consecutive token sequences occurring in the text. A bigram has two
tokens, a unigram one. Generating n grams from a text is straightforward enough, but packages like spaCy and NLTK provide convenient methods.

In [None]:
def n_grams(text, n):
  return [text[i:i+n] for i in range(len(text)-n+1)]
cleaned = ['mary', ',', "n't", 'slap', 'green', 'witch', '.']
print(n_grams(cleaned, 3))

[['mary', ',', "n't"], [',', "n't", 'slap'], ["n't", 'slap', 'green'], ['slap', 'green', 'witch'], ['green', 'witch', '.']]


In [None]:
#using spacy
import spacy
from spacy_ngram import NgramComponent

nlp = spacy.load('en_core_web_sm')  # or whatever model you downloaded
nlp.add_pipe('spacy-ngram')  # default to document-level ngrams, removing stopwords

text = 'Quark soup is an interacting localized assembly of quarks and gluons.'
print(nlp(text)._.ngram_1)
print(nlp(text)._.ngram_2)

['quark', 'soup', 'interact', 'localize', 'assembly', 'quark', 'gluon']
['quark_soup', 'soup_interact', 'interact_localize', 'localize_assembly', 'assembly_quark', 'quark_gluon']


In [None]:
#using NLTK
from nltk.util import ngrams
text = 'Quark soup is an interacting localized assembly of quarks and gluons.'

unigrams = ngrams(text.split(), 1)
for item in unigrams:
    print(item)
bigrams = ngrams(text.split(), 2)
for item in bigrams:
    print(item)

('Quark',)
('soup',)
('is',)
('an',)
('interacting',)
('localized',)
('assembly',)
('of',)
('quarks',)
('and',)
('gluons.',)
('Quark', 'soup')
('soup', 'is')
('is', 'an')
('an', 'interacting')
('interacting', 'localized')
('localized', 'assembly')
('assembly', 'of')
('of', 'quarks')
('quarks', 'and')
('and', 'gluons.')


**4. Lemmas and Stems**

In [None]:
#Lemmatization Using Spacy
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"he was running late")
for token in doc:
  print('{} > {}'.format(token, token.lemma_))
#There is no stemming method in Spacy

he > he
was > be
running > run
late > late


In [None]:
#Lemmatization using NLTK
from nltk.stem import WordNetLemmatizer
wn=WordNetLemmatizer()
doc ="he was running late"
[wn.lemmatize(word) for word in doc.split()]


['he', 'wa', 'running', 'late']

In [None]:
#Lemmatization using NLTK
from nltk.stem import PorterStemmer
ps=PorterStemmer()
doc ="he was running late"
[ps.stem(word) for word in doc.split()]


['he', 'wa', 'run', 'late']

**5. Categorizing words: POS Tagging**

In [None]:
#using spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Mary slapped the green witch.")
for token in doc:
  print('{} {}'.format(token, token.pos_))

Mary PROPN
slapped VERB
the DET
green ADJ
witch NOUN
. PUNCT


In [None]:
#using NLTK
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
doc = "Mary slapped the green witch."
pos_tag(word_tokenize(doc))

[('Mary', 'NNP'),
 ('slapped', 'VBD'),
 ('the', 'DT'),
 ('green', 'JJ'),
 ('witch', 'NN'),
 ('.', '.')]

**6. Categorizing Spans: Chunking and Named Entity Recognition**

**Chunking:** Often, we need to label a span of text; that is, a contiguous multitoken boundary. For example,
consider the sentence, “Mary slapped the green witch.” We might want to identify the noun phrases
(NP) and verb phrases (VP) in it,

In [6]:
#using spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Mary slapped the green witch.")
for chunk in doc.noun_chunks:
  print ('{} {}'.format(chunk, chunk.label_))

Mary NP
the green witch NP


**Named Entity**:  A named entity is a string mention of a real
world concept like a person, location, organization, drug name, and so on.

In [7]:
#using spacy
doc = nlp("Larry Page founded Google")
# Text and label of named entity span
[(ent.text, ent.label_) for ent in doc.ents]


[('Larry Page', 'PERSON'), ('Google', 'ORG')]

In [17]:
#using NLTK
import nltk
from nltk import word_tokenize,pos_tag
from nltk import ne_chunk
text = "NASA awarded Elon Musk’s SpaceX a $2.9 billion contract to build the lunar lander."
tokens = word_tokenize(text)
tag=pos_tag(tokens)
print(tag)

ne_tree = nltk.ne_chunk(tag)
print(ne_tree)

[('NASA', 'NNP'), ('awarded', 'VBD'), ('Elon', 'NNP'), ('Musk', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('SpaceX', 'NNP'), ('a', 'DT'), ('$', '$'), ('2.9', 'CD'), ('billion', 'CD'), ('contract', 'NN'), ('to', 'TO'), ('build', 'VB'), ('the', 'DT'), ('lunar', 'NN'), ('lander', 'NN'), ('.', '.')]
(S
  (ORGANIZATION NASA/NNP)
  awarded/VBD
  (PERSON Elon/NNP Musk/NNP)
  ’/NNP
  s/VBD
  (ORGANIZATION SpaceX/NNP)
  a/DT
  $/$
  2.9/CD
  billion/CD
  contract/NN
  to/TO
  build/VB
  the/DT
  lunar/NN
  lander/NN
  ./.)
