# Building a Vocab and Vectorization

In [1]:
# Sample Corpus - Contains 4 Documents, Each Document Can Have 1 Or More Sentences

corpus = [
    'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
    'We will win next Lok Sabha Elections, says confident Indian PM',
    'The nobel laurate won the hearts of the people.',
    'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [2]:
# Each Doc => 1 Feature Vector
# For our Corpus we will get 4 Feature Vectors

In [3]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

In [4]:
vectorized_corpus = cv.fit_transform(corpus)

print(type(vectorized_corpus))

<class 'scipy.sparse.csr.csr_matrix'>


In [5]:
# Densify the sparse count matrix into a NumPy array so rows can be printed and indexed.
# NOTE(review): this rebinds `vectorized_corpus`, so re-running this cell on its own
# fails (a NumPy array has no `.toarray()`); re-run the fit_transform cell first.
vectorized_corpus = vectorized_corpus.toarray()

In [6]:
print(type(vectorized_corpus))

<class 'numpy.ndarray'>


In [7]:
print(vectorized_corpus[0])

[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 0 1 0 2]


In [8]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [9]:
print(len(vectorized_corpus[0]))

42


In [10]:
# Vocabulary size: one entry per distinct token, matching the feature-vector length.
print(len(cv.vocabulary_))

42


In [11]:
# Reverse mapping: take the count vector of one document (index 2 of the corpus)
# so that it can be mapped back to the vocabulary words it contains.

numbers = vectorized_corpus[2]
print(numbers)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0
 0 0 0 1 0]


In [12]:
# inverse_transform works on a 2-D (n_samples, n_features) matrix, so present the
# single count vector as a one-row matrix; the result is a list with one word array.
print(cv.inverse_transform(numbers.reshape(1, -1)))

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


## Vectorization With Stopword Removal

In [13]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# English stopword list from NLTK, held in a set for fast membership tests.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded first — confirm.
sw = set(stopwords.words('english'))

# A token is a maximal run of ASCII letters, '@' and '.'.
# NOTE(review): because '.' is part of the character class, sentence-final periods
# stay attached to words (the fitted vocabulary contains 'kohli.', 'people.', 'story.').
# Confirm whether that is intended before relying on these tokens.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')

def remove_stopwords(text, stopwords):
    """Return the tokens of `text` that do not occur in `stopwords`.

    Args:
        text: iterable of tokens to filter.
        stopwords: container of tokens to drop (membership is tested with `in`).

    Returns:
        A new list with the kept tokens in their original order.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

def myTokenizer(document):
    """Lowercase `document`, split it into tokens and drop English stopwords.

    Relies on the module-level `tokenizer` (regexp tokenizer) and `sw`
    (stopword set) defined earlier in this cell.

    Returns:
        A list of the remaining tokens, in document order.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    # Filter out stopwords
    return remove_stopwords(tokens, sw)

In [14]:
# Build a vectorizer that uses the custom stopword-removing tokenizer.
# The default token_pattern is ignored whenever a tokenizer is supplied; passing
# token_pattern=None states that explicitly and avoids scikit-learn's warning
# about the unused pattern.
cv = CountVectorizer(tokenizer=myTokenizer, token_pattern=None)

In [15]:
# Re-fit on the same corpus with the stopword-removing tokenizer and densify the result.
# NOTE: this rebinds `vectorized_corpus`, replacing the earlier 42-column matrix;
# with stopwords removed the vocabulary shrinks to 33 columns.
vectorized_corpus = cv.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [16]:
print(len(vectorized_corpus[0]))

33


In [17]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [18]:
# For Test Data

test_corpus = [
    'Indian Cricket Rock !'
]

In [19]:
cv.transform(test_corpus)

<1x33 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [20]:
# Vectorize unseen text with the already-fitted vocabulary (transform, not fit_transform).
# Tokens that are not in the vocabulary are not counted: in the printed row only two
# entries are non-zero, at the indices of 'cricket' and 'indian'.
print(cv.transform(test_corpus).toarray())

[[0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [21]:
print(cv.vocabulary_)

{'indian': 9, 'cricket': 3, 'team': 26, 'wins': 31, 'world': 32, 'cup': 4, 'says': 22, 'capt.': 1, 'virat': 29, 'kohli.': 10, 'held': 8, 'sri': 24, 'lanka.': 11, 'win': 30, 'next': 15, 'lok': 13, 'sabha': 21, 'elections': 5, 'confident': 2, 'pm': 18, 'nobel': 16, 'laurate': 12, 'hearts': 7, 'people.': 17, 'movie': 14, 'raazi': 19, 'exciting': 6, 'spy': 23, 'thriller': 27, 'based': 0, 'upon': 28, 'real': 20, 'story.': 25}
