# Problem - 1

In [1]:
import numpy as np, pandas as pd

In [2]:
df=pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [4]:
# Convert all text in the 'review' column to lowercase
df['review'] = df['review'].str.lower()

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [5]:
import re

# Remove HTML tags using regex
df['review'] = df['review'].apply(lambda x: re.sub(r'<.*?>', '', x))

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [6]:
# Keep only letters and spaces
df['review'] = df['review'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [7]:
# Remove extra spaces
df['review'] = df['review'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [8]:
# Spell checker (Faster, works on already preprocessed review column) textblob & pyspellchecker were taking too much time
from symspellpy import SymSpell, Verbosity
import time

# ---------------------
# 1. Use your existing DataFrame
# ---------------------
# Assuming `df` is already loaded & preprocessed (HTML removed, etc.)
# Do NOT reload CSV here

# ---------------------
# 2. Configure SymSpell
# ---------------------
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# ---------------------
# 3. Load dictionary
# ---------------------
dictionary_path = "frequency_dictionary_en_82_765.txt"
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# ---------------------
# 4. Spell-check in bulk (only review column)
# ---------------------
start_time = time.time()

def correct_text(text):
    corrected_words = []
    for word in text.split():
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        corrected_words.append(suggestions[0].term if suggestions else word)
    return " ".join(corrected_words)

# Apply only to review column, update in-place
df["review"] = df["review"].astype(str).apply(correct_text)

elapsed = time.time() - start_time
print(f"Spell-check completed in {elapsed:.2f} seconds")

# check first few rows
print(df["review"].head())

Spell-check completed in 198.43 seconds
0    one of ﻿the other reviewers has mentioned that...
1    a wonderful little production ﻿the filming tec...
2    i thought this was a wonderful way to spend ti...
3    basically there a family where a little boy ja...
4    petter matters love in ﻿the time of money is a...
Name: review, dtype: object


In [9]:
# Tokenization
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Tokenize into words
df['review'] = df['review'].apply(lambda x: word_tokenize(str(x)))
df.head()

[nltk_data] Downloading package punkt to C:\Users\Sushant
[nltk_data]     Shankar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,review,sentiment
0,"[one, of, ﻿the, other, reviewers, has, mention...",positive
1,"[a, wonderful, little, production, ﻿the, filmi...",positive
2,"[i, thought, this, was, a, wonderful, way, to,...",positive
3,"[basically, there, a, family, where, a, little...",negative
4,"[petter, matters, love, in, ﻿the, time, of, mo...",positive


In [10]:
# Remove stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['review'] = df['review'].apply(lambda x: [word for word in x if word not in stop_words])

[nltk_data] Downloading package stopwords to C:\Users\Sushant
[nltk_data]     Shankar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

df['review'] = df['review'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

df.head()

[nltk_data] Downloading package wordnet to C:\Users\Sushant
[nltk_data]     Shankar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,review,sentiment
0,"[one, ﻿the, reviewer, mentioned, watching, epi...",positive
1,"[wonderful, little, production, ﻿the, filming,...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, family, little, boy, jake, think, ...",negative
4,"[petter, matter, love, ﻿the, time, money, visu...",positive


In [12]:
# Join back to string
df['review'] = df['review'].apply(lambda x: " ".join(x))
df.head()

Unnamed: 0,review,sentiment
0,one ﻿the reviewer mentioned watching episode '...,positive
1,wonderful little production ﻿the filming techn...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake think zombie ...,negative
4,petter matter love ﻿the time money visually st...,positive


# Problem - 2

In [13]:
# Combine all reviews into one big string
all_text = " ".join(df["review"])

# Split into words (basic split on whitespace)
all_words = all_text.split()

# Count totals
total_words = len(all_words)
vocab_size = len(set(all_words))

print(f"Total words in corpus: {total_words}")
print(f"Total unique words: {vocab_size}")

Total words in corpus: 6499619
Total unique words: 134177


# Problem - 3

In [14]:
# One-hot encode sentiment just for display
encoded_sentiment = pd.get_dummies(df["sentiment"], dtype=int)

# Combine with review column for display only
temp = pd.concat([df["review"], encoded_sentiment], axis=1)

# Show result
print(temp.head())

# if we want to encode words from review, that’s bag-of-words or TF-IDF, not pure one-hot.

                                              review  negative  positive
0  one ﻿the reviewer mentioned watching episode '...         0         1
1  wonderful little production ﻿the filming techn...         0         1
2  thought wonderful way spend time hot summer we...         0         1
3  basically family little boy jake think zombie ...         1         0
4  petter matter love ﻿the time money visually st...         0         1


# Problem - 4

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create Bag of Words with stopwords removed
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['review'])

# Get vocabulary and word counts without converting to dense
word_counts = X.sum(axis=0).A1  # .A1 flattens sparse matrix to 1D array
vocab = vectorizer.get_feature_names_out()

# Create DataFrame of word frequencies
vocab_df = pd.DataFrame({'word': vocab, 'count': word_counts})
vocab_df = vocab_df.sort_values(by='count', ascending=False).reset_index(drop=True)

# Show top 10 words
print(vocab_df.head(10))

        word   count
0      movie  100607
1       film   90529
2       like   39860
3       time   29857
4       good   28753
5  character   27781
6      story   24252
7       make   23634
8     really   22935
9      scene   20822


# Problem - 5

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Bigrams
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2), stop_words='english')
X_bigram = vectorizer_bigram.fit_transform(df['review'])
bigram_vocab_size = len(vectorizer_bigram.get_feature_names_out())

# Bag of Trigrams
vectorizer_trigram = CountVectorizer(ngram_range=(3, 3), stop_words='english')
X_trigram = vectorizer_trigram.fit_transform(df['review'])
trigram_vocab_size = len(vectorizer_trigram.get_feature_names_out())

print(f"Bigram vocabulary size: {bigram_vocab_size}")
print(f"Trigram vocabulary size: {trigram_vocab_size}")

Bigram vocabulary size: 2812585
Trigram vocabulary size: 4642245


**Observation on Dimensionality of the Vocabulary:**

When moving from unigrams (single words) to bigrams (two-word combinations) and trigrams (three-word combinations), the vocabulary size increases drastically.

* **Unigrams** only count unique words, so the vocabulary is relatively small (\~hundreds of thousands).
* **Bigrams** create a new feature for every unique two-word sequence found in the text. This leads to a massive jump in dimensionality because almost every word can pair with many others in different contexts.
* **Trigrams** amplify this even more, since every unique three-word sequence becomes a separate feature, resulting in millions of possible combinations.

This rapid growth in vocabulary size makes n-gram models more expressive but also much more memory-intensive, which can slow down processing and increase the risk of sparsity in the data.

# Problem - 6

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer (removing English stopwords)
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform review column
X_tfidf = tfidf.fit_transform(df['review'])

# Get vocabulary (words) and their IDF scores
vocab = tfidf.get_feature_names_out()
idf_scores = tfidf.idf_

# Create DataFrame with words and IDF scores
idf_df = pd.DataFrame({'word': vocab, 'idf_score': idf_scores})

# Sort by highest IDF score (rarest words first)
idf_df = idf_df.sort_values(by='idf_score', ascending=False).reset_index(drop=True)

# Display top 10 rarest words
print("Top 10 rarest words by IDF score:")
print(idf_df.head(10))

# Display vocabulary size
print(f"\nVocabulary size: {len(vocab)}")

Top 10 rarest words by IDF score:
                              word  idf_score
0  zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz  11.126651
1       aaaaaaaaaaaahhhhhhhhhhhhhh  11.126651
2                      aaaaaaaargh  11.126651
3                         aaaaaaah  11.126651
4                 aaaaaaahhhhhhggg  11.126651
5                          aaaaagh  11.126651
6                           aaaaah  11.126651
7                         zurlinis  11.126651
8                          zumhofe  11.126651
9                        zuluagain  11.126651

Vocabulary size: 133959
