# Sentiment Analysis on the IMDB Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/250.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m245.8/250.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [6]:
path="/content/drive/MyDrive/Deep Learning/IMDB Dataset.xlsx"
df = pd.read_excel(path)

In [7]:
df.shape

(50000, 2)

In [8]:
df=df.iloc[:20000]

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [11]:
df['sentiment'][1]

'positive'

In [12]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
negative,10097
positive,9903


In [13]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [14]:
df.duplicated().sum()

np.int64(74)

In [15]:
df.drop_duplicates(inplace=True)

In [16]:
df.duplicated().sum()

np.int64(0)

In [17]:
# Basic Preprocessing
# Remove tags
# lowercase
# remove stopwords

In [18]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [19]:
df['review'] = df['review'].apply(remove_tags)

In [20]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
19995,"ok. for starters, taxi driver is amazing. this...",negative
19996,"It's sort of hard for me to say it, because I ...",negative
19997,I still liked it though. Warren Beatty is only...,positive
19998,We could still use Black Adder even today. Ima...,positive


In [21]:
# lambda arguments: expression

In [22]:
df['review'] = df['review'].apply(lambda x:x.lower())

In [23]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
from nltk.corpus import stopwords

sw_list = stopwords.words('english')
print(sw_list)
df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [25]:
# remove punctuation
import string
df['review'] = df['review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [26]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
19995,ok starters taxi driver amazing this taxi driv...,negative
19996,sort hard say it greatly enjoyed targets paper...,negative
19997,still liked though warren beatty fair comic bo...,positive
19998,could still use black adder even today imagine...,positive


In [30]:
df['review'][1]

'wonderful little production filming technique unassuming oldtimebbc fashion gives comforting sometimes discomforting sense realism entire piece actors extremely well chosen michael sheen has got polari voices pat too truly see seamless editing guided references williams diary entries well worth watching terrificly written performed piece masterful production one great masters comedy life realism really comes home little things fantasy guard which rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning orton halliwell sets particularly flat halliwells murals decorating every surface terribly well done'

In [31]:
X = df.iloc[:,0:1]
y = df['sentiment']

In [32]:
X

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...
...,...
19995,ok starters taxi driver amazing this taxi driv...
19996,sort hard say it greatly enjoyed targets paper...
19997,still liked though warren beatty fair comic bo...
19998,could still use black adder even today imagine...


In [33]:
y

Unnamed: 0,sentiment
0,positive
1,positive
2,positive
3,negative
4,positive
...,...
19995,negative
19996,negative
19997,positive
19998,positive


In [34]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [35]:
y

array([1, 1, 1, ..., 1, 1, 0])

In [36]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [37]:
X_train.shape

(15940, 1)

In [38]:
X_test.shape

(3986, 1)

In [39]:
# Applying BoW
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
cv = CountVectorizer()
cv=TfidfVectorizer()

In [41]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [42]:
X_train_bow.shape

(15940, 109334)

In [43]:
print(X_train_bow[1])

[0. 0. 0. ... 0. 0. 0.]


In [44]:
X_train_bow[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [45]:
print(y_train[0])

1


In [46]:
(X_train_bow,y_train,X_test_bow,y_test)

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([1, 0, 0, ..., 0, 0, 1]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([1, 0, 0, ..., 0, 1, 1]))

In [55]:
# Save vectorizer
import pickle
with open("/content/drive/MyDrive/Deep Learning/vectorizer.pkl", "wb") as f:
    pickle.dump(cv, f)

In [48]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 (from tensorflow)
  Downloading protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorf

In [49]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
# Save the full model (best only)
checkpoint_cb = ModelCheckpoint(
    filepath="/content/drive/MyDrive/Deep Learning/best_model.h5",          # Full model path
    monitor="val_loss",
    mode="min",
    save_best_only=True,              # Save only the best model
    verbose=1
)

In [50]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model

# Recreate model architecture
model = Sequential([
    Dense(64, input_dim=X_train_bow.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [51]:

model.fit(
    X_train_bow, y_train,
    epochs=10,
    batch_size=2,
    validation_data=(X_test_bow,y_test),
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=2),
        checkpoint_cb
    ],
    verbose=1
)

Epoch 1/10
[1m7970/7970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step - accuracy: 0.8023 - loss: 0.4000
Epoch 1: val_loss improved from inf to 0.25275, saving model to /content/drive/MyDrive/Deep Learning/best_model.h5




[1m7970/7970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1373s[0m 172ms/step - accuracy: 0.8023 - loss: 0.4000 - val_accuracy: 0.8931 - val_loss: 0.2528
Epoch 2/10
[1m7970/7970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step - accuracy: 0.9746 - loss: 0.0795
Epoch 2: val_loss did not improve from 0.25275
[1m7970/7970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1362s[0m 171ms/step - accuracy: 0.9746 - loss: 0.0795 - val_accuracy: 0.8758 - val_loss: 0.3573
Epoch 3/10
[1m3142/7970[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m13:40[0m 170ms/step - accuracy: 0.9943 - loss: 0.0229

KeyboardInterrupt: 

In [52]:
import pickle
from tensorflow.keras.models import load_model
import numpy as np


In [54]:
# Load saved vectorizer
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

# Load saved best model
model = load_model("/content/drive/MyDrive/Deep Learning/best_model.h5")

# Function to predict sentiment
def predict_sentiment(review: str):
    # Vectorize and convert to float32
    vec = vectorizer.transform([review]).toarray().astype('float32')

    # Predict
    prob = model.predict(vec)[0][0]
    label = "Positive" if prob > 0.5 else "Negative"

    print(f"Review: {review}")
    print(f"Sentiment: {label} (Confidence: {prob:.2f})")

# Test predictions
predict_sentiment("Encouraged by the positive comments about this film on here I was looking forward to watching this film. Bad mistake. I've seen 950+ films and this is truly one of the worst of them - it's awful in almost every way: editing, pacing, storyline, 'acting,' soundtrack (the film's only song - a lame country tune - is played no less than four times). The film looks cheap and nasty and is boring in the extreme. Rarely have I been so happy to see the end credits of a film. <br /><br />The only thing that prevents me giving this a 1-score is Harvey Keitel - while this is far from his best performance he at least seems to be making a bit of an effort. One for Keitel obsessives only.")
# predict_sentiment("It was a terrible and boring movie.")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
Review: Encouraged by the positive comments about this film on here I was looking forward to watching this film. Bad mistake. I've seen 950+ films and this is truly one of the worst of them - it's awful in almost every way: editing, pacing, storyline, 'acting,' soundtrack (the film's only song - a lame country tune - is played no less than four times). The film looks cheap and nasty and is boring in the extreme. Rarely have I been so happy to see the end credits of a film. <br /><br />The only thing that prevents me giving this a 1-score is Harvey Keitel - while this is far from his best performance he at least seems to be making a bit of an effort. One for Keitel obsessives only.
Sentiment: Negative (Confidence: 0.02)
