In [1]:
# Installation of nltk
# pip install nltk

### Text Preprocessing
The following code demonstrates text preprocessing steps that are useful for various NLP applications.

First we need to import nltk

For a given text, we can do sentence tokenization and word tokenization using nltk library functions.
We can remove punctuation using Python's built-in `string` module.

We can then remove stop words in English to get the important words in the text.

We also perform stemming and lemmatization. Stemming and lemmatization are two different techniques that help reduce our data space: because the different forms of a word are mapped to a common form, we don’t need to check every single form of a word, which reduces the size of a large text corpus.

In [2]:
# Import nltk (used for tokenization, stop words, lemmatization, stemming and
# POS tagging below) together with the standard-library helpers.
import nltk
import string
import re  # NOTE(review): `re` is not used in any cell visible in this notebook
# Fetch the NLTK data packages this notebook depends on. The recorded output
# shows each call reporting "already up-to-date" when the package is present.
nltk.download('punkt')  # presumably the tokenizer models behind sent_tokenize/word_tokenize
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
nltk.download('wordnet')  # presumably the database behind WordNetLemmatizer
nltk.download('omw-1.4')  # presumably companion WordNet data - TODO confirm it is required
nltk.download('averaged_perceptron_tagger')  # presumably the model used by nltk.pos_tag

[nltk_data] Downloading package punkt to /home/vysakh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/vysakh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vysakh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vysakh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vysakh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
#  Sentence Tokenization  - Tokenizes sentences from text
from nltk.tokenize import sent_tokenize

In [4]:
# Word Tokenization  - Tokenizes words in sentences
from nltk.tokenize import word_tokenize

In [5]:
# Sample text used by every NLTK cell below. The pieces are joined by
# implicit string concatenation; note there is no space after "crow.".
statement = (
    "Microsoft is trying to buy France based startup at $7 Million. "
    "The quick brown fox, snatch the piece of cube from mouth of black crow."
    "Tesla to build solar electric startup in gujrat for $70 million"
)

In [6]:
# Normalize case so that later comparisons (e.g. against the lower-case
# stop-word list) are case-insensitive.
statement = statement.lower()
print("Lower:", statement)

Lower: microsoft is trying to buy france based startup at $7 million. the quick brown fox, snatch the piece of cube from mouth of black crow.tesla to build solar electric startup in gujrat for $70 million


In [7]:
# Split the lower-cased text into sentences and into word-level tokens,
# then show both results.
sentences = sent_tokenize(statement)
words = word_tokenize(statement)
print(sentences)
print(words)

['microsoft is trying to buy france based startup at $7 million.', 'the quick brown fox, snatch the piece of cube from mouth of black crow.tesla to build solar electric startup in gujrat for $70 million']
['microsoft', 'is', 'trying', 'to', 'buy', 'france', 'based', 'startup', 'at', '$', '7', 'million', '.', 'the', 'quick', 'brown', 'fox', ',', 'snatch', 'the', 'piece', 'of', 'cube', 'from', 'mouth', 'of', 'black', 'crow.tesla', 'to', 'build', 'solar', 'electric', 'startup', 'in', 'gujrat', 'for', '$', '70', 'million']


In [8]:
# Show each sentence found by sent_tokenize on its own line.
for sentence in sentences:
    print(sentence)

microsoft is trying to buy france based startup at $7 million.
the quick brown fox, snatch the piece of cube from mouth of black crow.tesla to build solar electric startup in gujrat for $70 million


In [9]:
 # Remove punctuation
for word in words:
    # Skip a token only when *every* character is a punctuation mark.
    # Testing `word in string.punctuation` would be a substring test against
    # the punctuation string, so multi-character tokens such as '...' or '--'
    # would wrongly be kept.
    if not all(ch in string.punctuation for ch in word):
        print(word)

microsoft
is
trying
to
buy
france
based
startup
at
7
million
the
quick
brown
fox
snatch
the
piece
of
cube
from
mouth
of
black
crow.tesla
to
build
solar
electric
startup
in
gujrat
for
70
million


In [10]:
# Keep every token that contains at least one non-punctuation character.
# A per-character check is used because `w in string.punctuation` is a
# substring test and would let tokens such as '...' or '--' through.
only_words = [
    w for w in words
    if not all(ch in string.punctuation for ch in w)
]
print(only_words)

['microsoft', 'is', 'trying', 'to', 'buy', 'france', 'based', 'startup', 'at', '7', 'million', 'the', 'quick', 'brown', 'fox', 'snatch', 'the', 'piece', 'of', 'cube', 'from', 'mouth', 'of', 'black', 'crow.tesla', 'to', 'build', 'solar', 'electric', 'startup', 'in', 'gujrat', 'for', '70', 'million']


In [11]:
# Count word frequency: build a frequency distribution over the
# punctuation-free tokens and list each token with its count.
freq = nltk.FreqDist(only_words)

for key, val in freq.items():
    print(key, val, sep=':')


microsoft:1
is:1
trying:1
to:2
buy:1
france:1
based:1
startup:2
at:1
7:1
million:2
the:2
quick:1
brown:1
fox:1
snatch:1
piece:1
of:2
cube:1
from:1
mouth:1
black:1
crow.tesla:1
build:1
solar:1
electric:1
in:1
gujrat:1
for:1
70:1


In [12]:
#Removal of stop words from the text
from nltk.corpus import stopwords

In [13]:
# English stop words from NLTK, held in a set so membership tests are fast.
english_stop_words = {word for word in stopwords.words("english")}
print(english_stop_words)
print(len(english_stop_words))

{"that'll", 'couldn', "shouldn't", 'which', 'she', 't', 'just', 'a', "doesn't", "aren't", 'for', 'hers', 'him', 'an', "mightn't", 'does', 'only', 'am', 'from', 'further', 'can', 'will', 'haven', 'who', "haven't", "mustn't", "it's", 'be', 'again', 'all', "should've", 'o', 'they', 'above', 'most', 'very', "didn't", 'shouldn', 'yourself', 'don', 'ma', 'having', 'was', 'whom', 'herself', 'his', 'isn', 'me', 'because', 'them', 'i', "hadn't", 'is', 'too', 'did', 'than', 'yours', 'same', 'himself', 'their', 'nor', 'once', "don't", 'as', 'these', 're', "she's", 'her', "you'd", 's', 'against', 'its', 'each', 'out', 'until', 'but', 'before', 'the', 'below', 'theirs', 'when', 'over', 'should', 'here', 'at', 'into', 'some', 'ours', 'been', 'didn', 'll', "shan't", 'being', 'and', "hasn't", 'by', 'few', 'how', 'other', 'hadn', 'or', 'now', 'he', 'no', 'while', "you've", 'we', 'any', 'had', 've', 'ain', "couldn't", 'doing', 'aren', "needn't", 'it', 'then', 'your', "you're", "weren't", 'itself', 'abou

In [14]:
# Filter out the stop words, keeping only the content-bearing tokens.
keywords = [w for w in only_words if w not in english_stop_words]
print(keywords)

['microsoft', 'trying', 'buy', 'france', 'based', 'startup', '7', 'million', 'quick', 'brown', 'fox', 'snatch', 'piece', 'cube', 'mouth', 'black', 'crow.tesla', 'build', 'solar', 'electric', 'startup', 'gujrat', '70', 'million']


### Lemmatization

Lemmatization in NLP is the process through which several different forms of the same word are mapped to one single form, which we can call the root form or the base form. In more technical terms, the root form is called a lemma. By reducing the number of forms a word can take, we make sure that we reduce our data space and that we don’t have to check every single form of a word. It helps us ignore morphological variations on a single word. Lemmatization brings context to the words, so it goes a step further than simply cutting off endings: it uses a vocabulary (and, optionally, the part of speech) to link the inflected forms of a word to one dictionary word. For example, if a paragraph has the words cars and car, then it will link both of them to car. In the program below we use the WordNet lexical database for lemmatization.

In [15]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# Map every keyword to its WordNet lemma. lemmatize() is called without a
# part-of-speech argument, so each word is looked up with the lemmatizer's
# default POS. The lemmas go into a new list: `keywords` itself is left
# untouched so no word is dropped and the stemming / POS-tagging cells below
# still receive the full keyword list.
lemmatized_keywords = [wordnet_lemmatizer.lemmatize(w) for w in keywords]
print(lemmatized_keywords)

['microsoft', 'trying', 'buy', 'france', 'based', 'startup', '7', 'million', 'quick', 'brown', 'fox', 'snatch', 'piece', 'cube', 'mouth', 'black', 'crow.tesla', 'build', 'solar', 'electric', 'startup', 'gujrat', '70', 'million']


### Stemming

Stemming in NLP is the process of removing prefixes and suffixes from words so that they are reduced to simpler forms, which are called stems. The purpose of stemming is to reduce our vocabulary and dimensionality for NLP tasks and to improve speed and efficiency in information retrieval and information processing tasks. Stemming is a simpler, faster process than lemmatization. The difference is that stemming is usually a purely rule-based approach. And, as we've shown with our earlier example, rule-based approaches can fail very quickly on more complex examples. But for most problems, it works well enough. Many search engines use stemming to improve their search results.


In [16]:
# Stemming
from nltk.stem import PorterStemmer

In [17]:
porter_stemmer = PorterStemmer()

# Print the Porter stem of each keyword, one per line.
for w in keywords:
    print(porter_stemmer.stem(w))

microsoft
tri
buy
franc
base
startup
7
million
quick
brown
fox
snatch
piec
cube
mouth
black
crow.tesla
build
solar
electr
startup
gujrat
70
million


In [18]:
# POS Tagging

In [19]:
# Tag each keyword with its part of speech.
# NOTE(review): the tagger receives the lower-cased, stop-word-free keyword
# list rather than the original sentence, so the tags may differ from those
# it would assign to the full text.
tagged_keywords = nltk.pos_tag(keywords)
print(tagged_keywords)

[('microsoft', 'JJ'), ('trying', 'VBG'), ('buy', 'NN'), ('france', 'NN'), ('based', 'VBN'), ('startup', '$'), ('7', 'CD'), ('million', 'CD'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('snatch', 'VBP'), ('piece', 'NN'), ('cube', 'NN'), ('mouth', 'JJ'), ('black', 'JJ'), ('crow.tesla', 'NN'), ('build', 'VB'), ('solar', 'JJ'), ('electric', 'JJ'), ('startup', 'NN'), ('gujrat', '$'), ('70', 'CD'), ('million', 'CD')]


In [20]:
# Installation of spacy
# pip install spacy

In [21]:
# !python3 -m spacy download en_core_web_sm

In [22]:
import spacy
from spacy import displacy
# Load the 'en_core_web_sm' pipeline; the commented-out download command in
# the previous cell installs it.
# NOTE(review): `npl` looks like a transposition of the conventional `nlp`;
# the name is kept because the next cell calls `npl(...)`.
npl = spacy.load('en_core_web_sm')

In [23]:
doc1 = npl("Apple is looking at buying U.K. startup for $1 billion")

# One line per token: text --> lemma , coarse POS , fine-grained tag ,
# dependency label , stop-word flag --> explanation of the fine-grained tag.
for token in doc1:
    attributes = " , ".join(
        str(value)
        for value in (token.lemma_, token.pos_, token.tag_, token.dep_, token.is_stop)
    )
    print(token.text, "-->", attributes, "-->", spacy.explain(token.tag_))
#SpaCy does not provide a built-in function for Stemming

Apple --> Apple , PROPN , NNP , nsubj , False --> noun, proper singular
is --> be , AUX , VBZ , aux , True --> verb, 3rd person singular present
looking --> look , VERB , VBG , ROOT , False --> verb, gerund or present participle
at --> at , ADP , IN , prep , True --> conjunction, subordinating or preposition
buying --> buy , VERB , VBG , pcomp , False --> verb, gerund or present participle
U.K. --> U.K. , PROPN , NNP , dobj , False --> noun, proper singular
startup --> startup , NOUN , NN , dep , False --> noun, singular or mass
for --> for , ADP , IN , prep , True --> conjunction, subordinating or preposition
$ --> $ , SYM , $ , quantmod , False --> symbol, currency
1 --> 1 , NUM , CD , compound , False --> cardinal number
billion --> billion , NUM , CD , pobj , False --> cardinal number


In [24]:
# Collect the tokens whose coarse part-of-speech tag is NOUN or ADJ.
nouns = [token for token in doc1 if token.pos_ == 'NOUN']
adjectives = [token for token in doc1 if token.pos_ == 'ADJ']
print(nouns, "\n", adjectives)

[startup] 
 []


In [25]:
# Render the dependency parse of doc1 inline in the notebook, passing a
# custom 'distance' layout option.
displacy.render(
    doc1,
    style='dep',
    jupyter=True,
    options={'distance': 100},
)

In [26]:
# List each named entity with its label and spaCy's explanation of the label.
for entity in doc1.ents:
    print(entity, ":", entity.label_, ":", spacy.explain(entity.label_))

Apple : ORG : Companies, agencies, institutions, etc.
U.K. : GPE : Countries, cities, states
$1 billion : MONEY : Monetary values, including unit


In [27]:
# Render doc1 inline in the notebook with its named entities highlighted.
displacy.render(
    doc1,
    style='ent',
    jupyter=True,
)

In [28]:
# spaCy's English stop-word set. Import it explicitly: reaching it through
# `spacy.lang.en.stop_words` by attribute access only works when that
# submodule has already been imported as a side effect of other code
# (here, loading the English pipeline).
from spacy.lang.en.stop_words import STOP_WORDS
spacy_stopwords = STOP_WORDS
print(spacy_stopwords)
# Last expression of the cell: displays the number of stop words.
len(spacy_stopwords)

{'she', 'a', 'well', 'sometimes', 'therefore', '’m', 'seeming', 'from', 'fifty', 'who', 'often', 'all', 'be', 'nine', 'above', 'show', 'even', 'became', 'herself', 'among', 'upon', 'next', "'d", 'did', 'thereupon', 'yours', 'twenty', 'their', 'elsewhere', 'her', 'neither', 'bottom', 'but', 'hundred', 'almost', 'move', 'below', 'back', 'along', 'should', 'into', 'full', 'many', 'being', 'by', 'other', 'never', 'whither', 'now', 'becomes', 'he', 'unless', 'while', 'any', 'say', 'anyone', 'side', 'last', 'your', 'itself', 'about', 'might', 'moreover', '‘re', 'if', 'there', 'afterwards', 'keep', 'seem', 'that', 'are', 'my', 'have', 'always', 'this', 'whereby', 'less', 'anyhow', "n't", 'done', 'in', 'three', 'becoming', 'perhaps', 'n‘t', 'whether', 'hers', 'an', 'beyond', 'only', 'am', 'together', 'amongst', 'throughout', 'cannot', 'again', 'front', 'thence', 'somehow', 'yourself', 'anything', 'whom', 'may', 'his', 'six', 'call', 'either', 'is', 'within', 'forty', 'though', 'whenever', 'as'

326