# 1. Setup libraries & Download dataset

## 1.1. Setup libraries

In [8]:
import pandas as pd
import numpy as np
import re
import string,time
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('punkt_tab')
nltk.download('omw-1.4')
nltk.download('stopwords')
# print("✅ Libraries imported successfully!")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1.2. Download dataset

In [2]:
! gdown 1AbjSAlrlVxgHf4ONwzDe9OV0M7hlURxt

Downloading...
From: https://drive.google.com/uc?id=1AbjSAlrlVxgHf4ONwzDe9OV0M7hlURxt
To: /content/IMDB Dataset.csv
100% 66.2M/66.2M [00:00<00:00, 214MB/s]


# 2. Data Preprocessing

In [3]:
df=pd.read_csv('/content/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## 2.1. Processing function
- Converting all text to lowercase.
- Removing HTML tags.
- Removing URLs.
- Removing non-alphabetic characters.
- Removing extra spaces.
- Removing stopwords (commonly used words like "the", "is", etc. that carry little meaning in classification tasks).

In [4]:
# Define English stopwords
stop_words = set(stopwords.words('english'))

# Define preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Apply preprocessing to the 'review' column
df['clean_review'] = df['review'].apply(preprocess_text)

# Show sample after preprocessing
df[['review', 'clean_review']].head()

Unnamed: 0,review,clean_review
0,One of the other reviewers has mentioned that ...,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love time money visually stunni...


## 2.2. Convert Label

In [5]:
# Keep only Positive and Negative samples
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})

## 2.3. Split train/test

In [6]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Split into train (80%) and test (20%)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['sentiment'])


# Show sizes
print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

Train size: 40000
Test size: 10000


# 3. Text Encoder

# 3.1. Apply word_tokenize

In [9]:
# Apply word_tokenize to the cleaned review column
train_df['tokens'] = train_df['clean_review'].apply(word_tokenize)
test_df['tokens'] = test_df['clean_review'].apply(word_tokenize)

train_df[['clean_review', 'tokens']].head()

Unnamed: 0,clean_review,tokens
47808,caught little gem totally accident back reviva...,"[caught, little, gem, totally, accident, back,..."
20154,cant believe let movie accomplish favor friend...,"[cant, believe, let, movie, accomplish, favor,..."
43069,spoiler alert gets nerve people remake use ter...,"[spoiler, alert, gets, nerve, people, remake, ..."
19413,theres one thing ive learnt watching george ro...,"[theres, one, thing, ive, learnt, watching, ge..."
13673,remember theaters reviews said horrible well d...,"[remember, theaters, reviews, said, horrible, ..."


## 2.2. TF-IDF

In [10]:
# Join the tokens back into full sentences (as TF-IDF expects raw text input)
train_texts = train_df['tokens'].apply(lambda x: ' '.join(x))
test_texts = test_df['tokens'].apply(lambda x: ' '.join(x))

# Initialize TF-IDF vectorizer with a maximum of 5000 features
tfidf = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on training data and transform it
X_train_tfidf = tfidf.fit_transform(train_texts)
X_test_tfidf = tfidf.transform(test_texts)

# 4. ML Model - LogisticRegression

In [11]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Initialize Logistic Regression model
log_reg = LogisticRegression(max_iter=250, C=0.1, penalty='l2')

# Train the model
print("Training Model...")
log_reg.fit(X_train_tfidf, train_df['sentiment'])
print("Completed training!")

# >> Step 1: Save model <<
model_filename = 'logistic_regression_sentiment_model.joblib'
joblib.dump(log_reg, model_filename)
print(f"The model has been successfully saved to file. '{model_filename}'")


# Predict on training set
log_train_preds = log_reg.predict(X_train_tfidf)
train_accuracy = accuracy_score(train_df['sentiment'], log_train_preds)

# Predict on test set
log_test_preds = log_reg.predict(X_test_tfidf)
test_accuracy = accuracy_score(test_df['sentiment'], log_test_preds)

# Print results
print("\nLogistic Regression Accuracy:")
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Training Model...
Completed training!
The model has been successfully saved to file. 'logistic_regression_sentiment_model.joblib'

Logistic Regression Accuracy:
Train Accuracy: 0.8804
Test Accuracy: 0.8726
