In [2]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib


In [3]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
df = pd.read_csv("IMDB Dataset.csv")   # change only if filename differs
df = df.dropna().reset_index(drop=True)

df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
stop_words = set(stopwords.words('english'))

custom_stopwords = {
    "movie", "film", "watch", "watching", "scene",
    "one", "two", "also", "get", "got", "make", "made",
    "like", "really", "thing", "even", "say", "see",
    "ive", "dont", "im", "would", "could", "still",
    "well", "much", "many", "lot"
     "br", "itbr", "moviebr", "brbr"
}

lemmatizer = WordNetLemmatizer()


In [6]:
def clean_text(text):
    # remove HTML tags like <br />
    text = re.sub(r'<.*?>', ' ', text)

    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)

    tokens = text.split()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
        and word not in custom_stopwords
        and word != "br"          # ðŸ”¥ extra safety
    ]

    return " ".join(tokens)


In [7]:
df['clean_text'] = df['review'].apply(clean_text)

df[['review', 'clean_text']].head()


Unnamed: 0,review,clean_text
0,One of the other reviewers has mentioned that ...,reviewer mentioned oz episode youll hooked rig...
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love time money visually stunni...


In [8]:
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.90,
    min_df=10,
    max_features=6000,
    ngram_range=(1, 2)   #  BIGRAMS
)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

tfidf_matrix.shape


(50000, 6000)

In [9]:
# Save cleaned data
df.to_csv("preprocessed_imdb.csv", index=False)

# Save TF-IDF objects
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(tfidf_matrix, "tfidf_matrix.pkl")


['tfidf_matrix.pkl']