## NLP text processing using NLTK

In [1]:
import nltk

In [2]:
# Read the whole story into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it to the garbage collector.
# NOTE(review): the text starts with a UTF-8 BOM (it shows up as the '\ufeff'
# token after tokenizing); encoding="utf-8-sig" would strip it, but it would also
# shift the character counts/slices used below, so the encoding is left unchanged.
with open("story.txt", encoding="utf8") as story_file:
    story = story_file.read()

In [3]:
len(story),type(story)

(754880, str)

In [4]:
## Sentence tokenization: split the raw story text into a list of sentences.
## Note: despite the singular name, `sentence` holds the whole list of sentences.
sentence = nltk.sent_tokenize(story)

In [5]:
type(sentence)

list

In [6]:
len(sentence)

4660

In [7]:
sentence[1:2]

['However little known the feelings or views of such a man may be\n      on his first entering a neighbourhood, this truth is so well\n      fixed in the minds of the surrounding families, that he is\n      considered the rightful property of some one or other of their\n      daughters.']

In [8]:
## Word tokenization: split the raw story text (not the sentence list) into word and punctuation tokens.
words = nltk.word_tokenize(story)

In [9]:
type(words),len(words)

(list, 142526)

In [10]:
words[0:5]

['\ufeff', 'Chapter', '1', 'It', 'is']

In [11]:
## Stemming and Lemmatization and stop words
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [12]:
# Re-read the story from disk; the context manager guarantees the file handle is closed.
with open("story.txt", encoding="utf8") as story_file:
    story = story_file.read()

In [13]:
## First split the story text into a list of sentences.
sentences = nltk.sent_tokenize(story)

In [14]:
len(sentences)

4660

In [15]:
## Let's create the stemmer object (Porter stemming algorithm).
stemming = PorterStemmer()

In [16]:
# Remove stopwords and apply stemming word by word, one sentence at a time.
# The stopword set is built once up front: the original rebuilt
# set(stopwords.words("english")) for every single token, which is
# loop-invariant work repeated across the whole corpus.
english_stopwords = set(stopwords.words("english"))
for sent_index, sentence_text in enumerate(sentences):
    words = nltk.word_tokenize(sentence_text)
    stem_words = [stemming.stem(word) for word in words if word not in english_stopwords]
    # Overwrite the sentence in place with its space-joined stemmed tokens.
    sentences[sent_index] = " ".join(stem_words)


In [17]:
len(sentences)

4660

In [18]:
sentences[1:2]

['howev littl known feel view man may first enter neighbourhood , truth well fix mind surround famili , consid right properti one daughter .']

We can see above that the problem with stemming is that the output words are often not real words,
e.g. "howev" instead of the actual word "however", and "littl" instead of "little".
To overcome this, we have another option: lemmatization.

In [19]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [20]:
# Re-read the story from disk; the context manager guarantees the file handle is closed.
with open("story.txt", encoding="utf8") as story_file:
    story = story_file.read()

In [21]:
# Split the story into sentences; this list is later overwritten with the lemmatized sentences.
sentences_lem = nltk.sent_tokenize(story)

In [22]:
len(sentences_lem)

4660

In [23]:
sentences_lem[1:2]

['However little known the feelings or views of such a man may be\n      on his first entering a neighbourhood, this truth is so well\n      fixed in the minds of the surrounding families, that he is\n      considered the rightful property of some one or other of their\n      daughters.']

In [24]:
# WordNet-based lemmatizer (the variable name is spelled "lemmitizer" and is referenced as such below).
lemmitizer = WordNetLemmatizer()

In [25]:
# Remove stopwords and apply the lemmatizer word by word, one sentence at a time.
# The stopword set is built once up front: the original rebuilt
# set(stopwords.words("english")) for every single token, which is
# loop-invariant work repeated across the whole corpus.
english_stopwords = set(stopwords.words("english"))
for sent_index, sentence_text in enumerate(sentences_lem):
    words = nltk.word_tokenize(sentence_text)
    lemmatizer_words = [lemmitizer.lemmatize(word) for word in words if word not in english_stopwords]
    # Overwrite the sentence in place with its space-joined lemmatized tokens.
    sentences_lem[sent_index] = " ".join(lemmatizer_words)

In [26]:
sentences_lem[1:2]

['However little known feeling view man may first entering neighbourhood , truth well fixed mind surrounding family , considered rightful property one daughter .']

Now we can see that, whereas stemming produced output words without any meaning, lemmatization gives us meaningful words.

### Bag of Words

 Whenever we apply any algorithm in NLP, it works on numbers. We cannot directly feed our text into that algorithm. Hence, Bag of Words model is used to preprocess the text by converting it into a bag of words, which keeps a count of the total occurrences of most frequently used words. 
 
This model can be visualized using a table, which contains the count of words corresponding to the word itself.

In [27]:
import nltk
import re

In [28]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [29]:
# Re-read the story from disk; the context manager guarantees the file handle is closed.
with open("story.txt", encoding="utf8") as story_file:
    story = story_file.read()

In [30]:
len(story)

754880

In [31]:
# Keep only the first 4865 characters of the story (this prefix yields the 31 sentences
# used below) -- presumably to keep the Bag of Words example small.
# NOTE: this overwrites `story`, so re-running this cell alone has no further effect,
# but the full text must be re-read from disk to get it back.
story = story[0:4865]

In [32]:
# Split the truncated story into sentences (rebinds `sentences` from the stemming section).
sentences = nltk.sent_tokenize(story)

In [33]:
# Lemmatizer used while cleaning the sentences for the Bag of Words model.
lemmatizer_bow = WordNetLemmatizer()

In [34]:
# Clean every sentence for the Bag of Words model:
#   1. replace every non-letter character with a space,
#   2. lowercase,
#   3. split on whitespace,
#   4. drop English stopwords and lemmatize the remaining words,
#   5. join the words back into a single space-separated string.
# The stopword set is built once: the original rebuilt it for every token.
english_stopwords = set(stopwords.words("english"))
sentences_bow = []
for raw_sentence in sentences:
    words = re.sub("[^a-zA-Z]", " ", raw_sentence)
    words = words.lower()
    words = words.split()
    words = [lemmatizer_bow.lemmatize(word) for word in words if word not in english_stopwords]
    words = " ".join(words)
    sentences_bow.append(words)

In [35]:
len(sentences_bow)

31

In [36]:
sentences[0:2]  ## original sentence

['\ufeff      Chapter 1\n\n      It is a truth universally acknowledged, that a single man in\n      possession of a good fortune, must be in want of a wife.',
 'However little known the feelings or views of such a man may be\n      on his first entering a neighbourhood, this truth is so well\n      fixed in the minds of the surrounding families, that he is\n      considered the rightful property of some one or other of their\n      daughters.']

In [37]:
## sentence after cleaning
sentences_bow[0:2]

['chapter truth universally acknowledged single man possession good fortune must want wife',
 'however little known feeling view man may first entering neighbourhood truth well fixed mind surrounding family considered rightful property one daughter']

In [38]:
## Let's create the BOW using sklearn
from sklearn.feature_extraction.text import CountVectorizer
## Now transform sentences_bow into a dense count matrix: one row per sentence, one column per vocabulary word
sentences_vector = CountVectorizer().fit_transform(sentences_bow).toarray()

In [39]:
sentences_vector.shape
## I have 31 sentences and total unique words is 225

(31, 225)

In [40]:
print(sentences_vector[0:2,:])
## Each row has a 1 in the column of every word that occurs (once) in that sentence.
## The 1st cleaned sentence has 12 words in total, and its row accordingly contains twelve 1s.

[[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0
  0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
  1 0 0 0 0 0 0 0 0]]


Term frequencies are not necessarily the best representation for the text. Common words like "the", "a", "to" are almost always the terms with highest frequency in the text. Thus, having a high raw count does not necessarily mean that the corresponding word is more important. To address this problem, one of the most popular ways to "normalize" the term frequencies is to weight a term by the inverse of document frequency, or tf–idf. 

BOW also has limitations, such as a large feature dimension and a sparse representation.

### TFIDF

TF-IDF, which stands for term frequency — inverse document frequency, is a scoring measure widely used in information retrieval (IR) or summarization. TF-IDF is intended to reflect how relevant a term is in a given document.

The intuition behind it is that if a word occurs multiple times in a document, we should boost its relevance as it should be more meaningful than other words that appear fewer times (TF). At the same time, if a word occurs many times in a document but also along many other documents, maybe it is because this word is just a frequent word; not because it was relevant or meaningful (IDF).

Defining what a “relevant word” means

We can come up with a more or less subjective definition driven by our intuition: a word’s relevance is proportional to the amount of information that it gives about its context (a sentence, a document or a full dataset). That is, the most relevant words are those that would help us, as humans, to better understand a whole document without reading it all.

How to Compute:
tf-idf is a weighting scheme that assigns each term in a document a weight based on its term frequency (tf) and inverse document frequency (idf). The terms with higher weight scores are considered to be more important.

Typically, the tf-idf weight is composed of two terms:

    Normalized Term Frequency (tf)
    Inverse Document Frequency (idf)


In [41]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [42]:
# Re-read the story from disk; the context manager guarantees the file handle is closed.
with open("story.txt", encoding="utf8") as story_file:
    story = story_file.read()

In [43]:
# Keep only the first 4865 characters of the story -- the same prefix length that was
# used for the Bag of Words section, so both representations cover the same text.
story = story[0:4865]

In [44]:
## Cleaning the story text for TF-IDF:
##   replace non-letters with spaces, lowercase, split into words,
##   drop English stopwords, lemmatize, and re-join into one string per sentence.
sentences = nltk.sent_tokenize(story)
lemmatizer_tfidf = WordNetLemmatizer()
# Build the stopword set once: the original rebuilt it for every token.
english_stopwords = set(stopwords.words("english"))
sentences_tfidf = []
for raw_sentence in sentences:
    words = re.sub("[^a-zA-Z]", " ", raw_sentence)
    words = words.lower()
    words = words.split()
    words = [lemmatizer_tfidf.lemmatize(word) for word in words if word not in english_stopwords]
    words = " ".join(words)
    sentences_tfidf.append(words)

In [45]:
## Let's create the TF-IDF representation using sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
## Now transform sentences_tfidf into a dense TF-IDF matrix (one row per sentence).
## Note: this rebinds `sentences_vector`, replacing the Bag of Words matrix computed earlier.
sentences_vector = TfidfVectorizer().fit_transform(sentences_tfidf).toarray()

In [46]:
## sentence after cleaning
sentences_tfidf[0:1]

['chapter truth universally acknowledged single man possession good fortune must want wife']

In [47]:
print(sentences_vector[0:1,:])

[[0.         0.         0.33880088 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.33880088 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.27655214 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.27655214 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.   

Now we can see that with TF-IDF each word gets a different weight, whereas in BOW every word that occurs once in a sentence gets the same weight (1).

Bag of Words and TF-IDF representations don't consider the semantic relations between words; they focus only on word counts and neglect the arrangement of words in the sentence.
To overcome this issue we can use a word2vec model. You can find it at my GitHub link:
https://github.com/atulpatelDS/NLP

## Word2vec

In [48]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec

In [49]:
# Re-read the full story from disk; the context manager guarantees the file handle is closed.
with open("story.txt", encoding="utf8") as story_file:
    story = story_file.read()

In [50]:
# Normalise the raw text before sentence splitting, one named step at a time.
without_bracket_refs = re.sub(r'\[[0-9]*\]', ' ', story)   # bracketed numbers such as "[12]" -> space
single_spaced = re.sub(r'\s+', ' ', without_bracket_refs)  # collapse whitespace runs (incl. newlines)
lowercased = single_spaced.lower()
without_digits = re.sub(r'\d', ' ', lowercased)            # every remaining digit -> space
story_wv = re.sub(r'\s+', ' ', without_digits)             # re-collapse the gaps the digit removal left

In [51]:
story_wv[1:400]

' chapter it is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters. “my dear mr. b'

In [52]:
## Split the cleaned, lowercased story text into a list of sentences.
sentences = nltk.sent_tokenize(story_wv)

In [53]:
sentences[0:3]

['\ufeff chapter it is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.',
 'however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.',
 '“my dear mr. bennet,” said his lady to him one day, “have you heard that netherfield park is let at last?” mr. bennet replied that he had not.']

In [54]:
# Tokenize every sentence into its own list of word tokens (a list of lists).
sentences_word = list(map(nltk.word_tokenize, sentences))

In [55]:
len(sentences_word)

4659

In [56]:
sentences_word[0:1]

[['\ufeff',
  'chapter',
  'it',
  'is',
  'a',
  'truth',
  'universally',
  'acknowledged',
  ',',
  'that',
  'a',
  'single',
  'man',
  'in',
  'possession',
  'of',
  'a',
  'good',
  'fortune',
  ',',
  'must',
  'be',
  'in',
  'want',
  'of',
  'a',
  'wife',
  '.']]

In [57]:
## Let's remove the stop words.
## The stopword set is built once: the original called stopwords.words('english')
## for every token and searched the returned list linearly each time.
english_stopwords = set(stopwords.words('english'))
for i, tokens in enumerate(sentences_word):
    sentences_word[i] = [word for word in tokens if word not in english_stopwords]

In [58]:
len(sentences_word)

4659

In [59]:
sentences_word[0:1]

[['\ufeff',
  'chapter',
  'truth',
  'universally',
  'acknowledged',
  ',',
  'single',
  'man',
  'possession',
  'good',
  'fortune',
  ',',
  'must',
  'want',
  'wife',
  '.']]

In [60]:
## Build the Model
## NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed them
## to `vector_size` and `epochs` -- confirm the installed gensim version before upgrading.
## NOTE(review): no `seed` is passed and several workers are used, so the vectors will
## differ from run to run.
model = Word2Vec(sentences_word, ## training corpus: a list of sentences, each a list of word tokens
                               min_count=1, ## keep every word that occurs at least once (i.e. no frequency cut-off)
                               workers=4, ## number of worker threads used for training
                               size=50, ## embedding size (length of each word vector)
                               window=3, ## max distance between the target word and its context words (to the left and right)
                               iter=10) ## number of training passes (epochs) over the corpus

In [61]:
## Now our model is trained; let's display how many words we have in our trained model
model.wv.vectors.shape
## The model has 7215 unique words (the vocabulary size) and each word is represented by a 50-dimensional vector

(7215, 50)

In [62]:
## Lets display the Vocabulary of the Model- unique words-7215
#model.wv.vocab

In [63]:
## Lets Display the Embedding of any unique word
model.wv["property"]  ## can see total size is 50

array([ 0.17268741, -0.12061057,  0.18004519,  0.04055322,  0.17896238,
        0.27593186,  0.14127517,  0.10682675,  0.07851309, -0.30606648,
       -0.05967107, -0.15174407, -0.24112347, -0.02954264,  0.19970766,
       -0.05134613,  0.14362542,  0.32420447,  0.06777881, -0.01279176,
       -0.0828521 , -0.10296668,  0.02555546, -0.04400079,  0.13299285,
       -0.06986132, -0.05113072, -0.19316839,  0.4596822 ,  0.17663167,
       -0.03815211,  0.08304726,  0.03584554,  0.03202673,  0.24119832,
        0.03388344,  0.10510354,  0.03883573, -0.23845185, -0.07372835,
       -0.02375774,  0.23932548, -0.01776404,  0.06423731, -0.19998643,
       -0.09625942, -0.11269914,  0.08175147,  0.05636334,  0.0183573 ],
      dtype=float32)

In [64]:
##  Let's display the words whose vectors are most similar to the vector of "satisfaction".
##  NOTE(review): all similarity scores below are ~0.9996, so the neighbours are barely
##  distinguishable -- presumably the corpus/training is too small for meaningful embeddings.
model.wv.most_similar("satisfaction")

[('character', 0.9996856451034546),
 ('making', 0.9996483325958252),
 ('since', 0.9996414184570312),
 ('obliged', 0.9996348023414612),
 ('compliment', 0.99960857629776),
 ('subject', 0.9996055364608765),
 ('proved', 0.999605119228363),
 ('face', 0.9995934963226318),
 ('sisters', 0.9995932579040527),
 ('general', 0.9995889067649841)]