In [1]:
from nltk.corpus import brown

### 1. Data Collection

In [2]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
data = brown.sents(categories='editorial')[:1]

In [4]:
print(data)

[['Assembly', 'session', 'brought', 'much', 'good']]


In [5]:
print(len(data))

1


# Basic NLP Pipeline
- Data Collection
- Tokenization, Stopword, Stemming
- Building a common vocab
- Vectorizing the documents
- Performing Classification/ Clustering

### 2. Tokenization

In [6]:
text = "It was a very pleasent day, the weather was cool their were light showers. I went to the market to buy some fruits."
print(text)

It was a very pleasent day, the weather was cool their were light showers. I went to the market to buy some fruits.


In [7]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [8]:
# This function breaks my text into sentences

sents = sent_tokenize(text)
print(sents)

['It was a very pleasent day, the weather was cool their were light showers.', 'I went to the market to buy some fruits.']


In [9]:
# word_tokenize function breaks my text into words

word_list = word_tokenize(sents[0].lower())
print(word_list)

['it', 'was', 'a', 'very', 'pleasent', 'day', ',', 'the', 'weather', 'was', 'cool', 'their', 'were', 'light', 'showers', '.']


### 3. Stopword Removal

In [10]:
from nltk.corpus import stopwords


# stopwords are those words which are not important in intuting the meaning of a sentence(i.e. if,does,this etc)
sw = set(stopwords.words('english'))

In [11]:
print(sw)

{'than', 'during', 'between', 'no', 'again', 'out', "won't", 'other', "isn't", 'them', 'if', 'until', 'is', 'before', 'as', 'under', 'when', 'am', 'to', 'mustn', 'they', "wouldn't", "couldn't", 'yours', 'been', 'off', 'most', 'should', 'their', 'into', 'all', 'nor', 'by', "you've", 'my', 'theirs', 'below', 'and', 'wouldn', 'i', 'these', 're', 'himself', 'shouldn', 'yourself', 'itself', 'while', 'those', "shan't", 'each', 'our', 'her', 'further', 'such', 'here', 'd', 'didn', "doesn't", 'haven', 'needn', "needn't", "don't", "haven't", 'own', 'who', "mustn't", 'same', 'aren', 'what', 'after', 'there', 'ain', 'had', 'so', 'being', 'for', "that'll", "didn't", 'have', "shouldn't", "you'll", 'in', 'only', 'themselves', 'was', 'y', "should've", 'having', 'on', 'his', 'll', 'that', 'ourselves', 'more', 'too', 'him', 'it', 's', 'hasn', 'about', 'can', "weren't", "hadn't", 'has', 'were', 'because', 'me', 'then', 'above', 'we', 'she', 'once', 'doesn', 'against', "you're", 'down', 'through', 'a', '

In [12]:
# Filter the words from your sentence
def filter_words(word_list):
    
    useful_words = [w for w in word_list if w not in sw]
    return useful_words

In [13]:
from nltk.tokenize import RegexpTokenizer

### To Practice Regular Expression
- https://www.regexpal.com

In [14]:
# RegexpTokenizer(Regular Expression)

tokenizer_obj = RegexpTokenizer("[a-zA-Z@]+")


In [15]:
text = "Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com"

print(tokenizer_obj.tokenize(text))

['Send', 'all', 'the', 'documents', 'related', 'to', 'clauses', 'at', 'abc@xyz', 'com']


### 4. Stemming
- Process that transform particular words(verbs,plurals) into their radical form
- Preserve the semantics of the sentence without increasing the no. of unique tokens
- jumps, jumping, jumped, jump ==> jump

In [16]:
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the 
            lovely dog from a 6ft feet high wall"""

words_list = tokenizer_obj.tokenize(text.lower())
print(word_list)

['it', 'was', 'a', 'very', 'pleasent', 'day', ',', 'the', 'weather', 'was', 'cool', 'their', 'were', 'light', 'showers', '.']


In [17]:
word_list = filter_words(words_list) # Remove stopwords
print(words_list)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'feet', 'high', 'wall']


### Types of Stemming
- 1) Snowball Stemmer(Multilingual)
- 2) Porter Stemmer (only English)
- 3) Lancaster Stemmer(only English)

In [18]:
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

ps = PorterStemmer()

In [19]:
ps.stem("jumps")

'jump'

In [20]:
ps.stem("jumping")

'jump'

In [21]:
ps.stem("lovely")

'love'

In [22]:
#                                     Lancaster Stemmer

ls = LancasterStemmer()
ls.stem("teeth")

'tee'

In [23]:
#                                       Snowball Stemmer

ss = SnowballStemmer('english')
print(ss.stem('lovely'))

love


In [24]:
ss_french = SnowballStemmer('french')
print(ss_french.stem('courais'))

cour


# Bag of Words - Vectorization, Unigram Features

In [25]:
corpus = [
            'Indian cricket team will wins World Cup, says Capt. Virat Kohli',
            'We will win next Lok Sabha Elections, says confident Indian PM',
            'The noble laurate won the hearts of the people',
            'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
cv = CountVectorizer()
cv?

In [28]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [29]:
#vectorized_corpus
print(vectorized_corpus)
print(len(vectorized_corpus[0]))

[[0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0
  1]
 [0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0
  0]
 [0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 1
  0]
 [1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1 1 0 0 1 1 0 1 1 1 0 0 0 0 0 0
  0]]
37


In [30]:
print(cv.vocabulary_) # Dictionary -Word -> index

{'indian': 9, 'cricket': 4, 'team': 26, 'will': 32, 'wins': 34, 'world': 36, 'cup': 5, 'says': 23, 'capt': 2, 'virat': 30, 'kohli': 11, 'we': 31, 'win': 33, 'next': 15, 'lok': 13, 'sabha': 22, 'elections': 6, 'confident': 3, 'pm': 19, 'the': 27, 'noble': 16, 'laurate': 12, 'won': 35, 'hearts': 8, 'of': 17, 'people': 18, 'movie': 14, 'raazi': 20, 'is': 10, 'an': 0, 'exciting': 7, 'spy': 24, 'thriller': 28, 'based': 1, 'upon': 29, 'real': 21, 'story': 25}


In [31]:
# Give a Vector what is the sentence
import numpy as np
vector = np.ones((37,))
vector[3:7] =0

print(vector)
print(len(vector))

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
37


In [32]:
print(cv.inverse_transform(vector))

[array(['an', 'based', 'capt', 'exciting', 'hearts', 'indian', 'is',
       'kohli', 'laurate', 'lok', 'movie', 'next', 'noble', 'of',
       'people', 'pm', 'raazi', 'real', 'sabha', 'says', 'spy', 'story',
       'team', 'the', 'thriller', 'upon', 'virat', 'we', 'will', 'win',
       'wins', 'won', 'world'], dtype='<U9')]


In [33]:
cv.vocabulary_["capt"]

2

In [35]:
### Effectively reduce the size of the vector
def myTokenizer(sentence):
    words = tokenizer_obj.tokenize(sentence.lower())
    return filter_words(words)
myTokenizer(corpus[0])

['indian',
 'cricket',
 'team',
 'wins',
 'world',
 'cup',
 'says',
 'capt',
 'virat',
 'kohli']

### Unigram

In [41]:
# CountVectorizer is predefined library used to vectorize a corpus. It reduces our effort!!!
cv = CountVectorizer(tokenizer = myTokenizer)

#cv.fit_transform(corpus) is a func in CountVectorizer that vectorize our corpus
vectorized_corpus = cv.fit_transform(corpus)

# toarray( ) is used to see the frequency of the words in our corpus
vc = vectorized_corpus.toarray()

print(vc[0])
print(len(vc[0]))

vc[0][0] = 1
v = vc[0]
print(v)
cv.inverse_transform(v)

[0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1]
30
[1 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1]


[array(['based', 'capt', 'cricket', 'cup', 'indian', 'kohli', 'says',
        'team', 'virat', 'wins', 'world'], dtype='<U9')]

# Features in Bag of Words Model
- Unigrams
- Bigrams, Trigrams
- N-grams

In [43]:
'''to see Bigrams, Trigrams and N-grams we use parameter "ngram_range " which can take value's as 
    (1,2)=> this indicates the mixture of both unigram and bigram
    (1,1) => only Unigram
    (2,2) => only Bigram
    (3,3) => only Trigram
    .
    .
    .
    .
    .
    (n,n) => n-gram'''

cv = CountVectorizer(tokenizer=myTokenizer,ngram_range = (1,2))

vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()

print(cv.vocabulary_)

{'indian': 16, 'cricket': 6, 'team': 45, 'wins': 55, 'world': 57, 'cup': 8, 'says': 39, 'capt': 2, 'virat': 51, 'kohli': 20, 'indian cricket': 17, 'cricket team': 7, 'team wins': 46, 'wins world': 56, 'world cup': 58, 'cup says': 9, 'says capt': 40, 'capt virat': 3, 'virat kohli': 52, 'win': 53, 'next': 27, 'lok': 23, 'sabha': 37, 'elections': 10, 'confident': 4, 'pm': 32, 'win next': 54, 'next lok': 28, 'lok sabha': 24, 'sabha elections': 38, 'elections says': 11, 'says confident': 41, 'confident indian': 5, 'indian pm': 18, 'noble': 29, 'laurate': 21, 'hearts': 14, 'people': 31, 'noble laurate': 30, 'laurate hearts': 22, 'hearts people': 15, 'movie': 25, 'raazi': 33, 'exciting': 12, 'spy': 42, 'thriller': 47, 'based': 0, 'upon': 49, 'real': 35, 'story': 44, 'movie raazi': 26, 'raazi exciting': 34, 'exciting indian': 13, 'indian spy': 19, 'spy thriller': 43, 'thriller based': 48, 'based upon': 1, 'upon real': 50, 'real story': 36}


In [44]:
print(len(vc[0]))

59


# Tf - idf Normalisation
- Avoid features that occur very often, because they contain less information
- Information increases as the number of occurence increases differnt types of 
- So we define another term - term-document-frequency which associates a weight with every term