In [1]:
import nltk

In [32]:
# Fetch every NLTK resource this notebook reads, not only Punkt: the Brown
# corpus, the English stop-word list and WordNet are used further down and
# raise LookupError on a machine where they were never downloaded.
# quiet=True keeps these extra downloads out of the cell output.
for resource in ('brown', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)
# Punkt backs sent_tokenize/word_tokenize; it is kept as the last expression
# so its return value is still what the cell displays.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vaibhav\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [2]:
from nltk.corpus import brown

In [3]:
# List the category labels of the Brown corpus.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
# Sentences of the 'adventure' category, each one a list of word tokens.
brown.sents(categories='adventure')

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]

# Tokenization and Stopword Removal

In [5]:
# Multi-sentence sample text for the sentence tokenizer.
document = """it was a pleasant day. The weather was cool and there were light showers. 
I went to the market to buy some fruits."""
# Single sentence with digits, commas and an e-mail address, used to compare tokenizers.
sentence = '''Send all the 50 documents related to chapter 1,2,3 at prateek@cb.com'''


In [6]:
from nltk.tokenize import sent_tokenize,word_tokenize 

In [7]:
# Split the document into sentences and show the first one.
sents = sent_tokenize(document)
sents[0]

'it was a pleasant day.'

In [8]:
# Plain whitespace split: '1,2,3' and the e-mail address each stay one token.
sentence.split(' ')

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapter',
 '1,2,3',
 'at',
 'prateek@cb.com']

In [9]:
# word_tokenize breaks the e-mail address into 'prateek', '@' and 'cb.com' (see output).
word_tokenize(sentence)

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapter',
 '1,2,3',
 'at',
 'prateek',
 '@',
 'cb.com']

## Stopwords

In [10]:
from nltk.corpus import stopwords

In [11]:
# English stop words as a set, for fast membership tests when filtering tokens.
sw = set(stopwords.words('english'))

In [12]:
def remove_stopwords(text, stopwords, ignore_case=False):
    """Return the tokens of ``text`` that are not stop words, in original order.

    Args:
        text: Iterable of token strings (e.g. the result of ``str.split``).
        stopwords: Collection of words to drop; a ``set`` keeps lookups fast.
        ignore_case: When True, a token is also dropped if its lower-cased
            form is in ``stopwords``. The default (False) keeps the original
            exact-match behaviour, under which a capitalised token such as
            'This' is not matched by a lower-case entry 'this'.

    Returns:
        A new list holding the surviving tokens, unmodified.
    """
    if ignore_case:
        # Check the token as written first so stop lists that contain
        # capitalised entries keep working exactly as before.
        return [w for w in text if w not in stopwords and w.lower() not in stopwords]
    return [w for w in text if w not in stopwords]

In [13]:
# 'This' is kept in the result: the membership test is case-sensitive and,
# presumably, the stop list only holds lower-case entries — verify.
# NOTE(review): the sample sentence misspells 'Chinese' and singles out a
# nationality; consider a neutral example (the recorded output would then
# need to be regenerated).
text = 'This chineese guy is very annoying'.split(' ')
remove_stopwords(text,sw)

['This', 'chineese', 'guy', 'annoying']

In [14]:
# 'not' is itself a stop word, so this filter also strips negation from a sentence.
'not' in sw

True

# Tokenization using regular expression

In [27]:
# Same sample sentence as `sentence` above, kept under its own name for the regex tokenizer.
sentence_rg = '''Send all the 50 documents related to chapter 1,2,3 at prateek@cb.com'''

In [19]:
from nltk.tokenize import RegexpTokenizer

In [49]:
# Tokens are runs of ASCII letters, '@' and '.'; digits and commas are outside
# the character class, so numbers are discarded while the e-mail address stays whole.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')

In [50]:
# Apply the regex tokenizer to the sample sentence and display the tokens.
useful_text = tokenizer.tokenize(sentence_rg)
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapter',
 'at',
 'prateek@cb.com']

# Stemming 
* E.g. jump, jumping, jumps, jumped ===> jump
* Preserve the semantics of the sentence without increasing the number of unique tokens

In [51]:
# Sample text for the stemming section.
# NOTE(review): `twxt` looks like a typo for `text` and is not used in the
# cells shown here — confirm before renaming.
twxt = '''Foxes love to make jumps.The quick brown fox jumping over the lovely
dog from 6ft feet high wall'''

### Three types of stemming 
* Snowball (Multi Lingual)
* Porter
* Lancaster

In [52]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [53]:
# Porter stemmer instance used for the examples below.
# NOTE(review): PorterStemmer is imported from nltk.stem.snowball here;
# nltk.stem.porter is its usual home — confirm which variant is intended.
ps =PorterStemmer()

In [57]:
# Plural suffix is stripped: 'jumps' -> 'jump' (see output).
ps.stem('jumps')

'jump'

In [58]:
# '-ing' form is reduced to its stem: 'loving' -> 'love' (see output).
ps.stem('loving')

'love'

In [62]:
# The Snowball stemmer is multilingual, so the language is passed explicitly.
ss = SnowballStemmer('english')

In [63]:
# Snowball reduces the adjective 'lovely' to 'love' (see output).
ss.stem('lovely')

'love'

In [70]:
from nltk.stem import WordNetLemmatizer

In [69]:
# Lemmatizer backed by WordNet; requires the 'wordnet' NLTK data to be installed.
wn = WordNetLemmatizer()

In [74]:
# pos='v' asks for the verb lemma, giving 'jump' (see output).
wn.lemmatize('jumping',pos='v') 

'jump'

# Building a Vocab and Vectorization

In [94]:
# Sample Corpus - Contains 4 Documents, each document can have 1 or more sentences
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [77]:
from sklearn.feature_extraction.text import CountVectorizer

In [79]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [80]:
# Learn the vocabulary from the corpus and count term occurrences per document.
vectorized_corpus = cv.fit_transform(corpus)

In [81]:
# Build the dense count matrix from the fitted vectorizer rather than
# re-binding the name to its own .toarray(): that self-referential form fails
# with AttributeError when the cell is run a second time, because the name
# then already holds an ndarray, which has no .toarray() method.
vectorized_corpus = cv.transform(corpus).toarray()

In [83]:
# One row per document, one column per vocabulary term.
vectorized_corpus

array([[0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [101]:
# Matrix dimensions (documents x vocabulary terms), then the count vector of the first document.
print(vectorized_corpus.shape)
print(vectorized_corpus[0])

(4, 42)
[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 0 1 0 2]


In [93]:
# Mapping from each token to its column index in the count matrix.
cv.vocabulary_

{'indian': 12,
 'cricket': 6,
 'team': 31,
 'will': 37,
 'wins': 39,
 'world': 41,
 'cup': 7,
 'says': 27,
 'capt': 4,
 'virat': 35,
 'kohli': 14,
 'be': 3,
 'held': 11,
 'at': 1,
 'sri': 29,
 'lanka': 15,
 'we': 36,
 'win': 38,
 'next': 19,
 'lok': 17,
 'sabha': 26,
 'elections': 8,
 'confident': 5,
 'pm': 23,
 'the': 32,
 'nobel': 20,
 'laurate': 16,
 'won': 40,
 'hearts': 10,
 'of': 21,
 'people': 22,
 'movie': 18,
 'raazi': 24,
 'is': 13,
 'an': 0,
 'exciting': 9,
 'spy': 28,
 'thriller': 33,
 'based': 2,
 'upon': 34,
 'real': 25,
 'story': 30}

In [112]:
# inverse_transform expects a 2-D (n_samples, n_features) input. Slicing with
# 3:4 keeps document 3 as a one-row matrix, whereas vectorized_corpus[3] is a
# 1-D vector that current scikit-learn versions reject during input validation.
# The result is still a one-element list holding the tokens present in that document.
s = cv.inverse_transform(vectorized_corpus[3:4])
s

[array(['an', 'based', 'exciting', 'indian', 'is', 'movie', 'raazi',
        'real', 'spy', 'story', 'the', 'thriller', 'upon'], dtype='<U9')]

In [59]:
# Same English Snowball stemmer as the `ss` created earlier in the notebook.
ss = SnowballStemmer('english')

# Vectorization and Stopword Removal