In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import nltk 

In [2]:
from nltk.corpus import brown

In [3]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [4]:
data = brown.sents(categories='adventure')

In [21]:
# for i,sen in enumerate(data):
#     print(' '.join(sen),end= "")
# printed entire data of adventure

## Tokenisation

In [5]:
from nltk import sent_tokenize,word_tokenize

In [6]:
document = """It was a very pleasnt day. The weather was cool and there were light showers.
I went to the market to buy some fruits."""

sentence = "Send all the 50 documents related to chapers 1,2,3 at tilakparth123@gmail.com"

In [11]:
sents = sent_tokenize(document)
sents

['It was a very pleasnt day.',
 'The weather was cool and there were light showers.',
 'I went to the market to buy some fruits.']

In [15]:
words = word_tokenize(sentence)
words

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapers',
 '1,2,3',
 'at',
 'tilakparth123',
 '@',
 'gmail.com']

## Stopword Removal

In [21]:
from nltk.corpus import stopwords

In [22]:
sw  = set(stopwords.words('english'))

In [27]:
def remove_stopwords(texts,stopwords):
    """Return the tokens from ``texts`` that are not stopwords.

    Args:
        texts: Iterable of string tokens (e.g. the output of a tokenizer).
        stopwords: Container of stopwords supporting ``in`` (a ``set`` is
            fastest). Note that inside this function the name refers to this
            argument, not to any module of the same name.

    Returns:
        A list with the surviving tokens, in their original order and casing.
    """
    # Stopword lists are usually lowercase, so a capitalised token such as
    # "The" must be compared in lowercase too; the exact-match check is kept
    # so that lists containing mixed-case entries keep working.
    useful_words = [
        w for w in texts
        if w not in stopwords and w.lower() not in stopwords
    ]
    return useful_words

In [29]:
remove_stopwords(words,sw)

['Send',
 '50',
 'documents',
 'related',
 'chapers',
 '1,2,3',
 'tilakparth123',
 '@',
 'gmail.com']

### Tokenization using Regular Expression

In [30]:
sentence

'Send all the 50 documents related to chapers 1,2,3 at tilakparth123@gmail.com'

In [32]:
from nltk.tokenize import RegexpTokenizer 

In [35]:
# The pattern matches runs of ASCII letters, '@' and '.'; any other character
# (digits, commas, spaces) is not part of a match, so it is dropped and splits
# the text at that point.
# The pattern can be tried out interactively at https://www.regexpal.com/
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text =tokenizer.tokenize(sentence)
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapers',
 'at',
 'tilakparth',
 '@gmail.com']

## Stemming
- Process of reducing inflected or derived words to their root (stem) form
- Example: jumps, jumping, jump => jump

In [36]:
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the 
lovely dog from a 6ft feet high wall"""

In [37]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [38]:
ps = PorterStemmer()

In [46]:
ps.stem('jumping')

'jump'

In [47]:
ss = SnowballStemmer('english')

In [49]:
ss.stem('jumping')

'jump'

In [56]:
from nltk.stem import WordNetLemmatizer
wn = WordNetLemmatizer()
# NOTE(review): no part-of-speech is passed here and the word comes back
# unchanged. NLTK documents the default as pos="n" (noun); passing pos="v"
# should yield 'jump' - confirm before relying on it.
wn.lemmatize("jumping")

'jumping'

## Building a Vocab & Vectorization

In [57]:
# Sample Corpus - Contains 4 Documents, each document can have 1 or more sentences
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [59]:
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
cv = CountVectorizer()

In [61]:
vectorized_corpus = cv.fit_transform(corpus)

In [66]:
vectorized_corpus?

In [68]:
# Densify the sparse count matrix. The guard keeps this cell re-runnable:
# once converted, the ndarray no longer has a .toarray() method.
if hasattr(vectorized_corpus, "toarray"):
    vectorized_corpus = vectorized_corpus.toarray()

In [72]:
vectorized_corpus[0]
len(vectorized_corpus[0])

42

In [73]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [75]:
# Vocabulary size = number of distinct tokens learned by the vectorizer.
len(cv.vocabulary_)

42

In [79]:
#Reverse mapping
numbers = vectorized_corpus[0]
numbers

array([0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2],
      dtype=int64)

In [81]:
# inverse_transform expects a 2-D (n_samples, n_features) matrix; recent
# scikit-learn versions reject a bare 1-D row, so wrap it as a single sample.
s = cv.inverse_transform(numbers.reshape(1, -1))
print(s)

[array(['at', 'be', 'capt', 'cricket', 'cup', 'held', 'indian', 'kohli',
       'lanka', 'says', 'sri', 'team', 'virat', 'will', 'wins', 'world'],
      dtype='<U9')]


### Vectorization with Stopword Removal

In [89]:
def myTokenizer(document):
    """Tokenize ``document`` and drop stopwords.

    The text is lowercased, split with the notebook-level ``tokenizer`` and
    then filtered through ``remove_stopwords`` using the notebook-level
    stopword collection ``sw``.

    Args:
        document: The raw text of one document.

    Returns:
        Whatever ``remove_stopwords`` returns for the lowercased tokens.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    return remove_stopwords(tokens, sw)

In [95]:
# token_pattern is ignored when a custom tokenizer is supplied; setting it to
# None makes that explicit and avoids scikit-learn's
# "The parameter 'token_pattern' will not be used" warning.
cv = CountVectorizer(tokenizer=myTokenizer, token_pattern=None)

In [96]:
vectorized_corpus = cv.fit_transform(corpus)

In [99]:
# Densify the sparse count matrix. The guard keeps this cell re-runnable:
# once converted, the ndarray no longer has a .toarray() method.
if hasattr(vectorized_corpus, "toarray"):
    vectorized_corpus = vectorized_corpus.toarray()

In [102]:
print(vectorized_corpus)
print(len(vectorized_corpus[0]))

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]
33


In [104]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [105]:
  test_corpus = [
      "Indian cricket rocks!"
  ]

In [108]:
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

#### More ways to Create Features

In [124]:
sent_1 = ["this is good movie"]
sent_2 = ["this is not good movie"]
sent_3 = ["this was good movie"]

In [120]:
cv = CountVectorizer(ngram_range=(1,3))

In [121]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]], dtype=int64)

In [122]:
cv.vocabulary_

{'this': 11,
 'is': 2,
 'good': 0,
 'movie': 7,
 'this is': 12,
 'is good': 3,
 'good movie': 1,
 'this is good': 13,
 'is good movie': 4,
 'not': 8,
 'is not': 5,
 'not good': 9,
 'this is not': 14,
 'is not good': 6,
 'not good movie': 10}

### Tf-idf Normalisation

In [133]:
corpus = [sent_1[0],sent_2[0],sent_3[0]]

In [129]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [130]:
tfidf = TfidfVectorizer()

In [134]:
vc = tfidf.fit_transform(corpus)

In [138]:
print(vc.toarray())

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]]


In [137]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'not': 3, 'was': 5}