### Assignment on Bag of Words and TF, IDF

Used dataset : https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [73]:
# Import Libraries
import pandas as pd
import numpy as np
import os

In [3]:
# Load Dataset
df = pd.read_csv(os.path.join('dataset', 'imdbMovieReview','imdbDataset.csv'))
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Problem 1
# Apply all the preprocessing techniques that you think are necessary

# Check null values
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [16]:
# Check duplicates values
df.duplicated().sum()

# Remove duplicates values
df = df.drop_duplicates()
df.duplicated().sum()

0

In [15]:
# Lowercasing
df['review'] = df['review'].str.lower()

# Removing Whitespace
df['review'] = df['review'].str.strip()

# Remove HTML tags
df['review'] = df['review'].str.replace('<.*?>','')

# Remove URLs
df['review'] = df['review'].str.replace('https?://\S+|www\.\S+','')

# Remove Punctuation
import string
translator = str.maketrans("", "", string.punctuation)
df['review'] = df['review'].str.translate(translator)

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [None]:
# Problem 2
# Find out the number of words in the entire corpus and also the total number of unique words(vocabulary) using just python

In [31]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

total_words = []
lemmatizer = WordNetLemmatizer()

for review in df['review']:
    words = [lemmatizer.lemmatize(word, pos='v') for word in word_tokenize(review) if word not in stopwords.words('english')]
    total_words.extend(words)

vocabulary = list(set(total_words))

In [34]:
print('Total words: ', len(total_words))
print('Vocabulary: ', len(vocabulary))

Total words:  6094923
Vocabulary:  168055


In [105]:
# Problem 3
# Apply One Hot Encoding

from sklearn.preprocessing import OneHotEncoder

# Create the encoder
encoder = OneHotEncoder(handle_unknown='ignore')


data = [lemmatizer.lemmatize(word, pos='v') for word in word_tokenize(df['review'][0]) if word not in stopwords.words('english')]
data = np.array(data).reshape(-1,1)

# Fit the encoder to the vocabulary
encoder.fit(np.array(vocabulary).reshape(-1,1))


# Transform the text data into one-hot encoded vectors
one_hot_encoded_data = encoder.transform(data).toarray()

print(one_hot_encoded_data.shape)

(171, 168055)


In [99]:
print(len(data))
print(len(vocabulary))

171
168055


In [50]:
# Problem 4
# Apply bag words and find the vocabulary also find the times each word has occured

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize

vectorizer = CountVectorizer()
sentences = sent_tokenize(df['review'][0])

X = vectorizer.fit_transform(sentences)

# Vocabulary
print(vectorizer.get_feature_names_out())


['about' 'accustomed' 'after' 'agenda' 'agreements' 'all' 'an' 'and'
 'appeal' 'are' 'around' 'as' 'audiences' 'away' 'awaybr' 'be' 'become'
 'being' 'bitches' 'br' 'brutality' 'but' 'called' 'can' 'cells' 'charm'
 'christians' 'city' 'class' 'classic' 'comfortable' 'couldnt' 'crooked'
 'dare' 'darker' 'dealings' 'death' 'developed' 'dodgy' 'doesnt' 'drugs'
 'due' 'em' 'emerald' 'episode' 'ever' 'exactly' 'experience'
 'experimental' 'face' 'fact' 'faint' 'far' 'first' 'focuses' 'for'
 'forget' 'from' 'fronts' 'gangstas' 'get' 'given' 'glass' 'go' 'goes'
 'got' 'graphic' 'guards' 'happened' 'hardcore' 'has' 'have' 'hearted'
 'high' 'home' 'hooked' 'if' 'in' 'injustice' 'inmates' 'into' 'inwards'
 'irish' 'is' 'it' 'italians' 'its' 'just' 'kill' 'lack' 'latinos'
 'levels' 'main' 'mainly' 'mainstream' 'mannered' 'manyaryans' 'maximum'
 'may' 'me' 'mebr' 'mentioned' 'mess' 'middle' 'more' 'moreso' 'muslims'
 'nasty' 'never' 'nickel' 'nickname' 'no' 'not' 'of' 'on' 'one' 'or'
 'order' 'osw

In [48]:
# Bag of words
print(X.toarray())
print(X.toarray().shape)
print(len(vectorizer.get_feature_names_out()))

[[ 1  1  1  1  1  1  1  6  1  2  1  4  1  1  1  2  1  1  1  3  1  2  1  1
   1  1  1  2  1  1  1  1  1  1  1  1  1  1  1  1  1  2  1  1  2  1  1  1
   1  1  1  1  1  2  1  5  3  1  1  1  2  1  1  1  1  1  1  1  1  1  1  1
   1  2  1  1  1  3  1  2  1  1  1  9  6  1  2  2  1  1  1  1  1  1  1  1
   1  1  1  3  1  1  1  1  1  1  1  1  1  1  1  1  3  7  3  1  3  1  1  2
   1  5  1  1  1  1  3  1  1  1  1  1  1  2  1  1  2  1  1  1  1  1  1  1
   3  1  1  1  2  1  1  1  1  2  1  1  4 16  1  1  1  3  1  6  1  1  1  1
   1  1  1  4  3  1  2  1  2  2  1  2  5  1  1  1  1  2  1  1]]
(1, 188)
188


In [51]:
# Problem 5
# Apply bag of bi-gram and bag of tri-gram and write down your observation about the dimensionality of the vocabulary

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize

# bi-gram
vectorizer = CountVectorizer(ngram_range=(2,2))
sentences = sent_tokenize(df['review'][0])

X = vectorizer.fit_transform(sentences)

# Vocabulary
print(vectorizer.get_feature_names_out())


['about oz' 'accustomed to' 'after watching' 'agenda em' 'agreements are'
 'all the' 'an experimental' 'and face' 'and get' 'and got' 'and moreso'
 'and shady' 'and unflinching' 'appeal of' 'are never' 'are right'
 'around the' 'as so' 'as that' 'as this' 'as watched' 'audiences forget'
 'away with' 'awaybr br' 'be hooked' 'be sold' 'become comfortable'
 'being turned' 'bitches due' 'br it' 'br the' 'br would' 'brutality and'
 'but as' 'but injustice' 'called oz' 'can get' 'cells have'
 'charm forget' 'christians italians' 'city an' 'city is' 'class inmates'
 'classic use' 'comfortable with' 'couldnt say' 'crooked guards'
 'dare forget' 'darker side' 'dealings and' 'death stares'
 'developed taste' 'dodgy dealings' 'doesnt mess' 'drugs sex' 'due to'
 'em city' 'emerald city' 'episode ever' 'episode youll' 'ever saw'
 'exactly what' 'experience watching' 'experimental section'
 'face inwards' 'fact that' 'faint hearted' 'far awaybr' 'first episode'
 'first thing' 'focuses mainly' 'for i

In [52]:
print(X.toarray())
print(X.toarray().shape)
print(len(vectorizer.get_feature_names_out()))

[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 3 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
(1, 286)
286


In [59]:
# tri-gram
vectorizer = CountVectorizer(ngram_range=(3,3))
sentences = sent_tokenize(df['review'][0])

X = vectorizer.fit_transform(sentences)

# Vocabulary
print(vectorizer.get_feature_names_out())

['about oz was' 'accustomed to the' 'after watching just' 'agenda em city'
 'agreements are never' 'all the cells' 'an experimental section'
 'and face inwards' 'and get away' 'and got accustomed'
 'and moreso scuffles' 'and shady agreements' 'and unflinching scenes'
 'appeal of the' 'are never far' 'are right as' 'around the first'
 'as so nasty' 'as that is' 'as this is' 'as watched more'
 'audiences forget charm' 'away with it' 'awaybr br would'
 'be hooked they' 'be sold out' 'become comfortable with'
 'being turned into' 'bitches due to' 'br it is' 'br the first'
 'br would say' 'brutality and unflinching' 'but as watched'
 'but injustice crooked' 'called oz as' 'can get in' 'cells have glass'
 'charm forget romanceoz' 'christians italians irish'
 'city an experimental' 'city is home' 'class inmates being'
 'classic use of' 'comfortable with what' 'couldnt say was'
 'crooked guards wholl' 'dare forget pretty' 'dealings and shady'
 'death stares dodgy' 'developed taste for' 'dodgy 

In [60]:
print(X.toarray())
print(X.toarray().shape)
print(len(vectorizer.get_feature_names_out()))

[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1]]
(1, 295)
295


In [64]:
# Problem 6
# Apply tf-idf and find out the idf scores of words, also find out the vocabulary.

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize

# bi-gram
vectorizer = TfidfVectorizer(ngram_range=(2,2))
sentences = sent_tokenize(df['review'][0])

X_tf_idf = vectorizer.fit_transform(sentences)

# Vocabulary
print(vectorizer.get_feature_names_out())

['about oz' 'accustomed to' 'after watching' 'agenda em' 'agreements are'
 'all the' 'an experimental' 'and face' 'and get' 'and got' 'and moreso'
 'and shady' 'and unflinching' 'appeal of' 'are never' 'are right'
 'around the' 'as so' 'as that' 'as this' 'as watched' 'audiences forget'
 'away with' 'awaybr br' 'be hooked' 'be sold' 'become comfortable'
 'being turned' 'bitches due' 'br it' 'br the' 'br would' 'brutality and'
 'but as' 'but injustice' 'called oz' 'can get' 'cells have'
 'charm forget' 'christians italians' 'city an' 'city is' 'class inmates'
 'classic use' 'comfortable with' 'couldnt say' 'crooked guards'
 'dare forget' 'darker side' 'dealings and' 'death stares'
 'developed taste' 'dodgy dealings' 'doesnt mess' 'drugs sex' 'due to'
 'em city' 'emerald city' 'episode ever' 'episode youll' 'ever saw'
 'exactly what' 'experience watching' 'experimental section'
 'face inwards' 'fact that' 'faint hearted' 'far awaybr' 'first episode'
 'first thing' 'focuses mainly' 'for i

In [65]:
print(X_tf_idf.toarray())
print(X_tf_idf.toarray().shape)
print(len(vectorizer.get_feature_names_out()))

[[0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.11111111 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.

In [66]:
print(vectorizer.idf_)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
