# Problem - 1

In [1]:
import numpy as np, pandas as pd

In [2]:
df=pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [4]:
# Convert all text in the 'review' column to lowercase
df['review'] = df['review'].str.lower()

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [5]:
import re

# Remove HTML tags using regex
df['review'] = df['review'].apply(lambda x: re.sub(r'<.*?>', '', x))

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [6]:
# Keep only letters and spaces
df['review'] = df['review'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [7]:
# Remove extra spaces
df['review'] = df['review'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [8]:
# Spell checker (Faster, works on already preprocessed review column) textblob & pyspellchecker were taking too much time
from symspellpy import SymSpell, Verbosity
import time

# ---------------------
# 1. Use your existing DataFrame
# ---------------------
# Assuming `df` is already loaded & preprocessed (HTML removed, etc.)
# Do NOT reload CSV here

# ---------------------
# 2. Configure SymSpell
# ---------------------
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# ---------------------
# 3. Load dictionary
# ---------------------
dictionary_path = "frequency_dictionary_en_82_765.txt"
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# ---------------------
# 4. Spell-check in bulk (only review column)
# ---------------------
start_time = time.time()

def correct_text(text):
    corrected_words = []
    for word in text.split():
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        corrected_words.append(suggestions[0].term if suggestions else word)
    return " ".join(corrected_words)

# Apply only to review column, update in-place
df["review"] = df["review"].astype(str).apply(correct_text)

elapsed = time.time() - start_time
print(f"Spell-check completed in {elapsed:.2f} seconds")

# check first few rows
print(df["review"].head())

Spell-check completed in 196.78 seconds
0    one of ﻿the other reviewers has mentioned that...
1    a wonderful little production ﻿the filming tec...
2    i thought this was a wonderful way to spend ti...
3    basically there a family where a little boy ja...
4    petter matters love in ﻿the time of money is a...
Name: review, dtype: object


In [9]:
# Tokenization
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

df['review'] = df['review'].apply(lambda x: word_tokenize(x))

df.head()

[nltk_data] Downloading package punkt to C:\Users\Sushant
[nltk_data]     Shankar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,review,sentiment
0,"[one, of, ﻿the, other, reviewers, has, mention...",positive
1,"[a, wonderful, little, production, ﻿the, filmi...",positive
2,"[i, thought, this, was, a, wonderful, way, to,...",positive
3,"[basically, there, a, family, where, a, little...",negative
4,"[petter, matters, love, in, ﻿the, time, of, mo...",positive


In [10]:
# Remove stopwords
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x if word not in stop_words]))

[nltk_data] Downloading package stopwords to C:\Users\Sushant
[nltk_data]     Shankar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

df['review'] = df['review'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

df.head()

[nltk_data] Downloading package wordnet to C:\Users\Sushant
[nltk_data]     Shankar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,review,sentiment
0,"[o, n, e, , ﻿, t, h, e, , r, e, v, i, e, w, ...",positive
1,"[w, o, n, d, e, r, f, u, l, , l, i, t, t, l, ...",positive
2,"[t, h, o, u, g, h, t, , w, o, n, d, e, r, f, ...",positive
3,"[b, a, s, i, c, a, l, l, y, , f, a, m, i, l, ...",negative
4,"[p, e, t, t, e, r, , m, a, t, t, e, r, s, , ...",positive


In [12]:
# Join back to string
df['review'] = df['review'].apply(lambda x: " ".join(x))
df.head()

Unnamed: 0,review,sentiment
0,o n e ﻿ t h e r e v i e w e r s m e n t ...,positive
1,w o n d e r f u l l i t t l e p r o d u c ...,positive
2,t h o u g h t w o n d e r f u l w a y s ...,positive
3,b a s i c a l l y f a m i l y l i t t l e ...,negative
4,p e t t e r m a t t e r s l o v e ﻿ t h ...,positive


# Problem - 2