In [1]:
from nltk.corpus import brown

In [2]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
data=brown.sents(categories='editorial')[:100]
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


Basic NLP Pipeline
 1.Data Collection
 2.Tokenisation,StopWord and Stemming/Lematisation
 3.Building a common Vocab
 4.Vectorizing Documents
 5.Building classification/clustering

In [4]:
text='It was a very pleasant day, the weather was cool with light thunderstorms. I went to the market to buy fruits'

In [5]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [7]:
sent=sent_tokenize(text)
words_list=word_tokenize(sent[0].lower())
print(sent)
print(words_list)

['It was a very pleasant day, the weather was cool with light thunderstorms.', 'I went to the market to buy fruits']
['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'with', 'light', 'thunderstorms', '.']


In [8]:
#Stopword Removal
from nltk.corpus import stopwords
sw=set(stopwords.words('english'))
print(len(sw))


179


Filtering the words

In [9]:
def filter_words(words_list):
    useful_words=[w for w in words_list if w not in sw]
    return useful_words
print(useful_words)

NameError: name 'useful_words' is not defined

Tokenisation using Regular Expression
Problem with word tokenizer - Cant handle complex tokenisation

In [10]:
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer("[a-z,A-Z]+")
text="Kabhi Kabhi Lagta Hai Apun ,Hi 2 Bhagwan Hai 1.0"
print(tokenizer.tokenize(text))

['Kabhi', 'Kabhi', 'Lagta', 'Hai', 'Apun', ',Hi', 'Bhagwan', 'Hai']


Stemming
  1.Process that transforms particular words(verbs,plural) into their radical form
  2.Preserve the semantics of the sentence without increasing the number of unique tokens
  3.jumps,jumping,jumped=>jump

In [11]:
text="Foxes love to make jumps. The Quick Brown Fox was seen jumping over the lovely dog from a 6 feet high wall"
w_list=tokenizer.tokenize(text.lower())
print(w_list)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'feet', 'high', 'wall']


In [12]:
w_list=filter_words(w_list)
print(w_list)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'feet', 'high', 'wall']


###Stemming
1.Snowball Stemmer(Multi Lingual)
2.Porter
3.Lancaster



In [13]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
ps=PorterStemmer()

In [14]:
ps.stem('jumping')

'jump'

In [15]:
ps.stem("jerking")

'jerk'

In [16]:
ls=LancasterStemmer()
sb=SnowballStemmer('english')
print(ps.stem("teenager"))
print(ls.stem("teenager"))
print(sb.stem('couaris'))

teenag
teen
couari


In [17]:
def NLP(text):
    word_list=tokenizer.tokenize(text.lower())
    word_list=filter_words(word_list)
    for i in word_list:
        ps.stem(text)
    return word_list

    

In [18]:
NLP("Jackass is a lovely dog")

['jackass', 'lovely', 'dog']

Building Common Vocabulary and Vectorizing Documents(based on bag of words model)

In [19]:
corpus=[
    'Indian Cricket Team will win the World Cup, says Capt. Virat Kohli. World Cup will be held in Sri Lanka',
    'We will win next year election, says confident Indian PM Modi',
    'The nobel laurate will our hearts',
    'The movie Raazi is an Indian Spy thriller based on a real story',
]

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
cv=CountVectorizer()

In [22]:
vectorized_corpus=cv.fit_transform(corpus).toarray()

In [23]:
vectorized_corpus
print(len(vectorized_corpus[0]))

38


In [24]:
print(cv.vocabulary_) #Dictionary Word->Index

{'indian': 11, 'cricket': 5, 'team': 29, 'will': 34, 'win': 35, 'the': 30, 'world': 36, 'cup': 6, 'says': 25, 'capt': 3, 'virat': 32, 'kohli': 13, 'be': 2, 'held': 9, 'in': 10, 'sri': 27, 'lanka': 14, 'we': 33, 'next': 18, 'year': 37, 'election': 7, 'confident': 4, 'pm': 22, 'modi': 16, 'nobel': 19, 'laurate': 15, 'our': 21, 'hearts': 8, 'movie': 17, 'raazi': 23, 'is': 12, 'an': 0, 'spy': 26, 'thriller': 31, 'based': 1, 'on': 20, 'real': 24, 'story': 28}


In [25]:
#Given a vector, what is the sentence
import numpy as np
vector=np.ones((33,))
vector[3:7]=0
print(vector)

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [26]:
print(cv.inverse_transform(vector))

[array(['an', 'based', 'be', 'election', 'hearts', 'held', 'in', 'indian',
       'is', 'kohli', 'lanka', 'laurate', 'modi', 'movie', 'next',
       'nobel', 'on', 'our', 'pm', 'raazi', 'real', 'says', 'spy', 'sri',
       'story', 'team', 'the', 'thriller', 'virat'], dtype='<U9')]


In [27]:
##Effectively reduce size of vector
def myTokeniser(sentence):
    words=tokenizer.tokenize(sentence.lower())
    return filter_words(words) #Filters the stopwords

myTokeniser(corpus[0])


['indian',
 'cricket',
 'team',
 'win',
 'world',
 'cup,',
 'says',
 'capt',
 'virat',
 'kohli',
 'world',
 'cup',
 'held',
 'sri',
 'lanka']

In [28]:
cv=CountVectorizer(tokenizer=myTokeniser)
vectorized_corpus=cv.fit_transform(corpus)
vc=vectorized_corpus.toarray()
print(vc[0])
print(len(vc[0]))
v=vc[0]
print(cv.inverse_transform(vc[0]))

[0 1 0 1 1 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 2 0]
30
[array(['capt', 'cricket', 'cup', 'cup,', 'held', 'indian', 'kohli',
       'lanka', 'says', 'sri', 'team', 'virat', 'win', 'world'],
      dtype='<U9')]


Unigram Bag of Words Model

Features of BOW Model
  1.Unigrams,
  2. Bigrams,Trigrams
  3.n-grams

In [29]:
cv=CountVectorizer(tokenizer=myTokeniser, ngram_range=(1,3))
vectorized_corpus=cv.fit_transform(corpus)
vc=vectorized_corpus.toarray()
print(cv.vocabulary_)#The word along with its index


{'indian': 25, 'cricket': 9, 'team': 66, 'win': 75, 'world': 80, 'cup,': 15, 'says': 55, 'capt': 3, 'virat': 72, 'kohli': 32, 'cup': 12, 'held': 22, 'sri': 63, 'lanka': 35, 'indian cricket': 26, 'cricket team': 10, 'team win': 67, 'win world': 78, 'world cup,': 83, 'cup, says': 16, 'says capt': 56, 'capt virat': 4, 'virat kohli': 73, 'kohli world': 33, 'world cup': 81, 'cup held': 13, 'held sri': 23, 'sri lanka': 64, 'indian cricket team': 27, 'cricket team win': 11, 'team win world': 68, 'win world cup,': 79, 'world cup, says': 84, 'cup, says capt': 17, 'says capt virat': 57, 'capt virat kohli': 5, 'virat kohli world': 74, 'kohli world cup': 34, 'world cup held': 82, 'cup held sri': 14, 'held sri lanka': 24, 'next': 42, 'year': 85, 'election,': 18, 'confident': 6, 'pm': 48, 'modi': 38, 'win next': 76, 'next year': 43, 'year election,': 86, 'election, says': 19, 'says confident': 58, 'confident indian': 7, 'indian pm': 28, 'pm modi': 49, 'win next year': 77, 'next year election,': 44, 

In [30]:
print(len(vc[0]))

88


Tf-Idf Normalisation
  1.Avoid features that occur very often , because they contain very less information
  2.Information decreases as the frequency increases across differenet types of documents

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
tfidf_tokeniser=TfidfVectorizer(tokenizer=myTokeniser,ngram_range=(1,2),norm='l2')
vectorized_corpus=tfidf_tokeniser.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.         0.18364677 0.18364677 0.         0.
  0.18364677 0.18364677 0.18364677 0.18364677 0.18364677 0.18364677
  0.         0.         0.         0.18364677 0.18364677 0.11721935
  0.18364677 0.         0.         0.18364677 0.18364677 0.18364677
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.14478922 0.18364677 0.
  0.         0.         0.18364677 0.18364677 0.         0.18364677
  0.18364677 0.         0.         0.18364677 0.18364677 0.14478922
  0.         0.18364677 0.36729354 0.18364677 0.18364677 0.
  0.        ]
 [0.         0.         0.         0.         0.25277526 0.25277526
  0.         0.         0.         0.         0.         0.
  0.25277526 0.25277526 0.         0.         0.         0.16134317
  0.         0.25277526 0.         0.         0.         0.
  0.         0.         0.25277526 0.         0.         0.25277526
  0.25277526 0.       

In [33]:
print(tfidf_tokeniser.vocabulary_)

{'indian': 17, 'cricket': 6, 'team': 47, 'win': 53, 'world': 56, 'cup,': 10, 'says': 39, 'capt': 2, 'virat': 51, 'kohli': 21, 'cup': 8, 'held': 15, 'sri': 44, 'lanka': 23, 'indian cricket': 18, 'cricket team': 7, 'team win': 48, 'win world': 55, 'world cup,': 58, 'cup, says': 11, 'says capt': 40, 'capt virat': 3, 'virat kohli': 52, 'kohli world': 22, 'world cup': 57, 'cup held': 9, 'held sri': 16, 'sri lanka': 45, 'next': 29, 'year': 59, 'election,': 12, 'confident': 4, 'pm': 33, 'modi': 26, 'win next': 54, 'next year': 30, 'year election,': 60, 'election, says': 13, 'says confident': 41, 'confident indian': 5, 'indian pm': 19, 'pm modi': 34, 'nobel': 31, 'laurate': 24, 'hearts': 14, 'nobel laurate': 32, 'laurate hearts': 25, 'movie': 27, 'raazi': 35, 'spy': 42, 'thriller': 49, 'based': 0, 'real': 37, 'story': 46, 'movie raazi': 28, 'raazi indian': 36, 'indian spy': 20, 'spy thriller': 43, 'thriller based': 50, 'based real': 1, 'real story': 38}


In [34]:
ps.stem('quickly')

'quickli'