In [None]:
import nltk
import sklearn

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

### Bag of Words

The Bag of Words (BoW) model is a fundamental and straightforward representation of text data used in natural language processing (NLP) and information retrieval. It treats text as an unordered collection of words, ignoring grammar, syntax, and word order, and focuses solely on the frequency of words within a document or a collection of documents. The key idea behind the Bag of Words model is to represent text data as a "bag" (multiset) of words, where each distinct word in the vocabulary is treated as a separate feature and its count in a document is that feature's value.

Advantages of the Bag of Words model:

- Simplicity and ease of implementation.
- Suitable for various text classification and analysis tasks.
- Can handle large text corpora efficiently.

Disadvantages of the Bag of Words model:

- Loss of word order and contextual information.
- Does not capture semantic meaning.
- Large feature space, especially for extensive vocabularies.

In [None]:
# Toy corpus: five short movie reviews, one string per document.
# Every vectorizer example in this notebook is fitted on these sentences.
train_text = [
    "The movie was exciting, there were some exciting stunts in the movie",
    "I hated the movie, absolutely hated it",
    "This movie was very good, should not be missed, good!",
    "Everything about it was ok, acting ok, direction ok",
    "Great direction, great cinematography, decent acting",
]

train_text

['The movie was exciting, there were some exciting stunts in the movie',
 'I hated the movie, absolutely hated it',
 'This movie was very good, should not be missed, good!',
 'Everything about it was ok, acting ok, direction ok',
 'Great direction, great cinematography, decent acting']

In [None]:
# Bag-of-words vectorizer; stop_words='english' selects scikit-learn's built-in
# English stop-word list, so words on that list do not become features
count_vectorizer = CountVectorizer(stop_words='english')

# Display the configured (not yet fitted) vectorizer
count_vectorizer

In [None]:
# Learn the vocabulary (the distinct non-stop-word tokens) from the training sentences
count_vectorizer.fit(train_text)

###### get all the feature names

In [None]:
# Terms of the fitted vocabulary, listed in feature-index order
count_vectorizer.get_feature_names_out()

array(['absolutely', 'acting', 'cinematography', 'decent', 'direction',
       'exciting', 'good', 'great', 'hated', 'missed', 'movie', 'ok',
       'stunts'], dtype=object)

In [None]:
# Vocabulary size, i.e. the number of columns of the document-term matrix
len(count_vectorizer.get_feature_names_out())

13

In [None]:
# Stop words in effect for this vectorizer (the built-in English list requested at construction)
count_vectorizer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

##### Observe that each feature has a unique id

In [None]:
# Mapping from each term to its feature (column) index in the document-term matrix
count_vectorizer.vocabulary_

{'movie': 10,
 'exciting': 5,
 'stunts': 12,
 'hated': 8,
 'absolutely': 0,
 'good': 6,
 'missed': 9,
 'ok': 11,
 'acting': 1,
 'direction': 4,
 'great': 7,
 'cinematography': 2,
 'decent': 3}

In [None]:
# Encode each sentence as a row of per-term counts using the fitted vocabulary
transformed_vector = count_vectorizer.transform(train_text)

# Shown as a sparse matrix: only the non-zero counts are stored
transformed_vector

<5x13 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [None]:
# Expand the sparse matrix into a dense array (one row per sentence,
# one column per vocabulary term) so every count can be printed
transformed_array = transformed_vector.toarray()

print(transformed_array)

[[0 0 0 0 0 2 0 0 0 0 2 0 1]
 [1 0 0 0 0 0 0 0 2 0 1 0 0]
 [0 0 0 0 0 0 2 0 0 1 1 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 3 0]
 [0 1 1 1 1 0 0 2 0 0 0 0 0]]


In [None]:
# Tabulate the vocabulary as (Term, Index) rows, ordered by feature index
term_index_df = (
    pd.DataFrame(
        data=list(count_vectorizer.vocabulary_.items()),
        columns=['Term', 'Index'],
    )
    .sort_values(by='Index')
)

term_index_df

Unnamed: 0,Term,Index
4,absolutely,0
8,acting,1
11,cinematography,2
12,decent,3
9,direction,4
1,exciting,5
5,good,6
10,great,7
3,hated,8
6,missed,9


In [None]:
# Add one count column per document ('Sentence 0', 'Sentence 1', ...).
#
# Each row of `transformed_array` is ordered by vocabulary index, while
# assigning a NumPy array to a DataFrame column is purely positional.
# Picking the counts through the 'Index' column keeps every count on the row
# of its own term even if `term_index_df` is not sorted by 'Index', and the
# loop handles however many documents were vectorized instead of exactly five.
term_positions = term_index_df['Index'].to_numpy()

for sentence_no, sentence_counts in enumerate(transformed_array):
    term_index_df[f'Sentence {sentence_no}'] = sentence_counts[term_positions]

term_index_df

Unnamed: 0,Term,Index,Sentence 0,Sentence 1,Sentence 2,Sentence 3,Sentence 4
4,absolutely,0,0,1,0,0,0
8,acting,1,0,0,0,1,1
11,cinematography,2,0,0,0,0,1
12,decent,3,0,0,0,0,1
9,direction,4,0,0,0,1,1
1,exciting,5,2,0,0,0,0
5,good,6,0,0,2,0,0
10,great,7,0,0,0,0,2
3,hated,8,0,2,0,0,0
6,missed,9,0,0,1,0,0


### Bag of Ngrams

The Bag of N-grams model is an extension of the traditional Bag of Words (BoW) model in natural language processing (NLP). While the BoW model represents text as a collection of individual words, the Bag of N-grams model takes into account sequences of words, known as "n-grams," to capture some degree of word order and local context information.

Instead of considering individual words, the model considers sequences of "n" consecutive words as units. These units are called "n-grams." Common choices for "n" include 2 (bigrams), 3 (trigrams), or even higher values, depending on the desired level of context.

The Bag of N-grams model offers advantages over the traditional Bag of Words model by capturing some level of word order and local context information. For example, if you use bigrams (2-grams), you can capture adjacent word pairs in the text, which may help in tasks like sentiment analysis, text classification, and information retrieval.

Advantages of the Bag of N-grams model:

- Captures some degree of word order and local context.
- Can be effective for tasks that rely on word associations or collocations.
- Flexible, as you can adjust the value of "n" to control the level of context.

Disadvantages of the Bag of N-grams model:

- Higher dimensionality compared to the traditional Bag of Words.
- Still lacks a complete understanding of sentence or document structure.
- Limited in capturing long-range dependencies in text.

The choice of "n" (the size of n-grams) is a hyperparameter that depends on the specific NLP task and dataset. Smaller values of "n" capture shorter local context, while larger values capture longer sequences of words. The Bag of N-grams model is a useful technique for text analysis when you want to consider sequences of words beyond individual terms.

In [None]:
# ngram_range=(2, 2) restricts the features to bigrams (pairs of consecutive tokens).
# No stop-word list is passed here, so words such as "the" stay in the bigrams.
n_gram_vectorizer = CountVectorizer(ngram_range=(2, 2))

n_gram_vectorizer

In [None]:
# Re-display the training sentences for reference
train_text

['The movie was exciting, there were some exciting stunts in the movie',
 'I hated the movie, absolutely hated it',
 'This movie was very good, should not be missed, good!',
 'Everything about it was ok, acting ok, direction ok',
 'Great direction, great cinematography, decent acting']

In [None]:
# Learn the bigram vocabulary and encode the sentences in a single step
transformed_vector = n_gram_vectorizer.fit_transform(train_text)

transformed_vector

<5x35 sparse matrix of type '<class 'numpy.int64'>'
	with 37 stored elements in Compressed Sparse Row format>

In [None]:
# Mapping from each bigram to its feature (column) index
n_gram_vectorizer.vocabulary_

{'the movie': 27,
 'movie was': 20,
 'was exciting': 31,
 'exciting there': 10,
 'there were': 28,
 'were some': 34,
 'some exciting': 25,
 'exciting stunts': 9,
 'stunts in': 26,
 'in the': 16,
 'hated the': 15,
 'movie absolutely': 19,
 'absolutely hated': 1,
 'hated it': 14,
 'this movie': 29,
 'was very': 33,
 'very good': 30,
 'good should': 11,
 'should not': 24,
 'not be': 21,
 'be missed': 3,
 'missed good': 18,
 'everything about': 8,
 'about it': 0,
 'it was': 17,
 'was ok': 32,
 'ok acting': 22,
 'acting ok': 2,
 'ok direction': 23,
 'direction ok': 7,
 'great direction': 13,
 'direction great': 6,
 'great cinematography': 12,
 'cinematography decent': 4,
 'decent acting': 5}

In [None]:
# Tabulate the bigram vocabulary as (Term, Index) rows, ordered by feature index
term_index_df = (
    pd.DataFrame(
        data=list(n_gram_vectorizer.vocabulary_.items()),
        columns=['Term', 'Index'],
    )
    .sort_values(by='Index')
)

term_index_df

Unnamed: 0,Term,Index
23,about it,0
12,absolutely hated,1
27,acting ok,2
20,be missed,3
33,cinematography decent,4
34,decent acting,5
31,direction great,6
29,direction ok,7
22,everything about,8
7,exciting stunts,9


In [None]:
# Dense view of the bigram counts: one row per sentence, one column per bigram
transformed_array = transformed_vector.toarray()

print(transformed_array)

[[0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 2 1 0 0 1 0 0 1]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 1 0 0 1 0]
 [1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [None]:
# Add one count column per document ('Sentence 0', 'Sentence 1', ...).
#
# Each row of `transformed_array` is ordered by vocabulary index, while
# assigning a NumPy array to a DataFrame column is purely positional.
# Picking the counts through the 'Index' column keeps every count on the row
# of its own bigram even if `term_index_df` is not sorted by 'Index', and the
# loop handles however many documents were vectorized instead of exactly five.
term_positions = term_index_df['Index'].to_numpy()

for sentence_no, sentence_counts in enumerate(transformed_array):
    term_index_df[f'Sentence {sentence_no}'] = sentence_counts[term_positions]

term_index_df

Unnamed: 0,Term,Index,Sentence 0,Sentence 1,Sentence 2,Sentence 3,Sentence 4
23,about it,0,0,0,0,1,0
12,absolutely hated,1,0,1,0,0,0
27,acting ok,2,0,0,0,1,0
20,be missed,3,0,0,1,0,0
33,cinematography decent,4,0,0,0,0,1
34,decent acting,5,0,0,0,0,1
31,direction great,6,0,0,0,0,1
29,direction ok,7,0,0,0,1,0
22,everything about,8,0,0,0,1,0
7,exciting stunts,9,1,0,0,0,0


#### Unigrams and bigrams

In [None]:
# ngram_range=(1, 2) keeps both single words (unigrams) and word pairs (bigrams)
n_gram_vectorizer = CountVectorizer(ngram_range=(1, 2))

# Fit on the training sentences and encode them in one step
transformed_vector = n_gram_vectorizer.fit_transform(train_text)

# Term-to-column-index mapping covering unigrams and bigrams together
n_gram_vectorizer.vocabulary_

{'the': 48,
 'movie': 34,
 'was': 56,
 'exciting': 17,
 'there': 50,
 'were': 60,
 'some': 44,
 'stunts': 46,
 'in': 28,
 'the movie': 49,
 'movie was': 36,
 'was exciting': 57,
 'exciting there': 19,
 'there were': 51,
 'were some': 61,
 'some exciting': 45,
 'exciting stunts': 18,
 'stunts in': 47,
 'in the': 29,
 'hated': 25,
 'absolutely': 2,
 'it': 30,
 'hated the': 27,
 'movie absolutely': 35,
 'absolutely hated': 3,
 'hated it': 26,
 'this': 52,
 'very': 54,
 'good': 20,
 'should': 42,
 'not': 37,
 'be': 6,
 'missed': 32,
 'this movie': 53,
 'was very': 59,
 'very good': 55,
 'good should': 21,
 'should not': 43,
 'not be': 38,
 'be missed': 7,
 'missed good': 33,
 'everything': 15,
 'about': 0,
 'ok': 39,
 'acting': 4,
 'direction': 12,
 'everything about': 16,
 'about it': 1,
 'it was': 31,
 'was ok': 58,
 'ok acting': 40,
 'acting ok': 5,
 'ok direction': 41,
 'direction ok': 14,
 'great': 22,
 'cinematography': 8,
 'decent': 10,
 'great direction': 24,
 'direction great': 13

In [None]:
# Tabulate the unigram + bigram vocabulary as (Term, Index) rows, ordered by feature index
term_index_df = (
    pd.DataFrame(
        data=list(n_gram_vectorizer.vocabulary_.items()),
        columns=['Term', 'Index'],
    )
    .sort_values(by='Index')
)

term_index_df

Unnamed: 0,Term,Index
42,about,0
47,about it,1
20,absolutely,2
24,absolutely hated,3
44,acting,4
...,...,...
11,was exciting,57
49,was ok,58
34,was very,59
5,were,60


In [None]:
# Dense view of the counts: one row per sentence, one column per unigram or bigram
transformed_array = transformed_vector.toarray()

print(transformed_array)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 2 0
  1 0 0 0 0 0 0 0 1 1 1 1 2 2 1 1 0 0 0 0 1 1 0 0 1 1]
 [0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 0 0 1 0 0 0 1 1
  0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0
  1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0]
 [1 1 0 0 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0
  0 0 0 3 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 2 1 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [None]:
# Add one count column per document ('Sentence 0', 'Sentence 1', ...).
#
# Each row of `transformed_array` is ordered by vocabulary index, while
# assigning a NumPy array to a DataFrame column is purely positional.
# Picking the counts through the 'Index' column keeps every count on the row
# of its own term even if `term_index_df` is not sorted by 'Index', and the
# loop handles however many documents were vectorized instead of exactly five.
term_positions = term_index_df['Index'].to_numpy()

for sentence_no, sentence_counts in enumerate(transformed_array):
    term_index_df[f'Sentence {sentence_no}'] = sentence_counts[term_positions]

# Preview the first ten terms (lowest vocabulary indices when sorted by 'Index')
term_index_df.head(10)

Unnamed: 0,Term,Index,Sentence 0,Sentence 1,Sentence 2,Sentence 3,Sentence 4
42,about,0,0,0,0,1,0
47,about it,1,0,0,0,1,0
20,absolutely,2,0,1,0,0,0
24,absolutely hated,3,0,1,0,0,0
44,acting,4,0,0,0,1,1
51,acting ok,5,0,0,0,1,0
31,be,6,0,0,1,0,0
39,be missed,7,0,0,1,0,0
55,cinematography,8,0,0,0,0,1
60,cinematography decent,9,0,0,0,0,1


In [None]:
# Last ten rows of the term table (the highest vocabulary indices, since it is sorted by 'Index')
term_index_df.tail(10)

Unnamed: 0,Term,Index,Sentence 0,Sentence 1,Sentence 2,Sentence 3,Sentence 4
26,this,52,0,0,1,0,0
33,this movie,53,0,0,1,0,0
27,very,54,0,0,1,0,0
35,very good,55,0,0,1,0,0
2,was,56,1,0,1,1,0
11,was exciting,57,1,0,0,0,0
49,was ok,58,0,0,0,1,0
34,was very,59,0,0,1,0,0
5,were,60,1,0,0,0,0
14,were some,61,1,0,0,0,0
