In [3]:
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [4]:
df = pd.read_csv('IMDB Dataset.csv')

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### EDA

In [7]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [8]:
#sentiment count
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

 dataset is balanced.

### Preprocessing

In [9]:
def clean_review(text):
    # 1. Remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()
    # 2. Lowercase
    text = text.lower()
    # 3. Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text


In [10]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [11]:
def preprocess_data(text):
    # Clean the reviews
    text = clean_review(text)
    # Tokenize
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    # Removing negation words from stopwords as they are important for sentiment
    stop_words.difference_update(["no", "not", "nor"])
    words = [word for word in words if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, pos="v") for word in words] 

    return " ".join(lemmas)

In [12]:
# Apply preprocess_data and join tokens into strings
df['cleaned_reviews'] = df['review'].apply(preprocess_data)


In [13]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  
    max_features=15000,  
    min_df=5,            
    max_df=0.7           
)

X = vectorizer.fit_transform(df['cleaned_reviews'])

In [14]:
y = df['sentiment']  

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
model = LogisticRegression()
model.fit(X_train, y_train)


In [17]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [18]:
# Train a Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.86      0.87      4961
    positive       0.87      0.88      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [19]:
# Train a Linear SVM model
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("\nLinear SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))


Linear SVM Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.89      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



### Applying Word2Vec

In [20]:
import gensim
import numpy as np
tokenized_reviews = df['cleaned_reviews'].apply(lambda r: r.split())

In [21]:
from gensim.models import Word2Vec

vector_size = 150  
window_size = 10   
min_word_count = 2 

# Train the Word2Vec model
w2v_model = Word2Vec(tokenized_reviews,
                                   vector_size=vector_size,
                                   window=window_size,
                                   min_count=min_word_count,
                                   workers=4) 

In [22]:
def vectorize_review(review_tokens, model):
    vectors = []
    for word in review_tokens:
        if word in model.wv:
            vectors.append(model.wv[word])

    if not vectors:
        return np.zeros(model.vector_size)

    return np.mean(vectors, axis=0)

X = np.array([vectorize_review(review, w2v_model) for review in tokenized_reviews])

y = df['sentiment']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.87      0.86      0.87      4961
    positive       0.86      0.88      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

