# **Movie Review Sentiment Analysis**

In [16]:
import kagglehub

In [17]:
imdb_dataset_path = kagglehub.dataset_download('lakshmi25npathi/imdb-dataset-of-50k-movie-reviews')
print(imdb_dataset_path)

/root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


In [18]:
import pandas as pd
import numpy as np

## **Data Exploration**

In [19]:
data = pd.read_csv('/root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1/IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [20]:
data.shape

(50000, 2)

In [21]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [22]:
data.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [23]:
data.duplicated().sum()

418

In [24]:
data.duplicated(subset=['review']).sum()

418

## **Data Cleaning**

In [25]:
data = data.drop_duplicates(subset=['review'], keep='first')

# Reset index after removing duplicates
data = data.reset_index(drop=True)

data.shape

(49582, 2)

In [26]:
data.duplicated().sum()

0

## **Text Preprocessing**

In [27]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [28]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [29]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-alphabet characters
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

In [30]:
X = data['review'].apply(preprocess_text)

In [31]:
y = data['sentiment'].map({'positive': 1, 'negative': 0})

## **Vectorizer**

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['review'])

## **Train Test Split**

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Model Training**

### **Logistic Regression**

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [38]:
y_pred = log_reg.predict(X_test)

In [39]:
from sklearn.metrics import accuracy_score, f1_score

In [40]:
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}, F1-score: {f1:.2f}')

Accuracy: 0.89, F1-score: 0.89


### **Multinomial NB**

In [41]:
from sklearn.naive_bayes import MultinomialNB


In [42]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [43]:
nb_pred = nb.predict(X_test)

In [44]:
accuracy = accuracy_score(y_test, nb_pred)
f1 = f1_score(y_test, nb_pred)
print(f'Accuracy: {accuracy:.2f}, F1-score: {f1:.2f}')

Accuracy: 0.85, F1-score: 0.85


### **Random Forest Classifier**

In [45]:
from sklearn.ensemble import RandomForestClassifier

In [46]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [47]:
rf_pred = rf.predict(X_test)


In [48]:
accuracy = accuracy_score(y_test, rf_pred)
f1 = f1_score(y_test, rf_pred)
print(f'Accuracy: {accuracy:.2f}, F1-score: {f1:.2f}')

Accuracy: 0.83, F1-score: 0.83


## **Model Saving**

In [49]:
import pickle

In [50]:
with open("logistic_model.pkl", "wb") as model_file:
    pickle.dump(log_reg, model_file)

In [51]:
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)