# Importing Libraries

In [17]:
import json
import os
import pickle
from zipfile import ZipFile

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [18]:
# Print the GPU devices TensorFlow reports for this machine.
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Data Collection from Kaggle API

In [7]:
# Load the Kaggle API credentials (the 'username' and 'key' entries are read
# below). The `with` block closes the file handle as soon as parsing finishes
# instead of leaving it open for the garbage collector.
with open("../kaggle.json") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

In [13]:
# Export the Kaggle credentials as environment variables, one per credential field
for env_name, credential_field in (('KAGGLE_USERNAME', 'username'), ('KAGGLE_KEY', 'key')):
    os.environ[env_name] = kaggle_dictionary[credential_field]

In [16]:
# Download the IMDB 50k movie-review dataset archive with the Kaggle CLI.
# NOTE(review): presumably authenticates via the KAGGLE_USERNAME / KAGGLE_KEY environment variables set earlier — confirm.
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to c:\Users\tanzi\OneDrive\Documents\ML\Project 7 Sentiment Analysis




  0%|          | 0.00/25.7M [00:00<?, ?B/s]
100%|██████████| 25.7M/25.7M [00:00<00:00, 799MB/s]


In [19]:
# Unpack every member of the downloaded archive into the current working directory
with ZipFile('imdb-dataset-of-50k-movie-reviews.zip', mode='r') as archive:
    archive.extractall()

# Load Data

In [20]:
# Load the extracted reviews CSV into a DataFrame (columns: review, sentiment).
data = pd.read_csv('IMDB Dataset.csv')

In [22]:
# (rows, columns) of the loaded dataset
data.shape

(50000, 2)

In [23]:
# Preview the first rows of the raw data
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [25]:
# Preview the last rows of the raw data
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [24]:
# Class balance check: number of reviews per sentiment label
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [26]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# Series.map plus an explicit astype avoids depending on the implicit
# object -> int downcast that replace() performs (deprecated in pandas 2.2).
# The .get(label, label) fallback leaves already-encoded values untouched, so
# the cell can be re-run safely; an unexpected label makes astype raise instead
# of silently leaving strings in the column.
sentiment_codes = {'positive': 1, 'negative': 0}
data['sentiment'] = (
    data['sentiment']
    .map(lambda label: sentiment_codes.get(label, label))
    .astype('int64')
)

In [27]:
# Preview the first rows again to confirm the sentiment column is now numeric
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [28]:
# Re-check the class balance after encoding the labels
data['sentiment'].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

# Split Data

In [29]:
# Hold out 20% of the reviews for testing, stratified on the sentiment label,
# with a fixed random_state for the split.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data['sentiment'],
    random_state=42,
)

In [30]:
# Show the shape of the train split, then the test split, one per line
print(train_data.shape, test_data.shape, sep='\n')

(40000, 2)
(10000, 2)


# Data Preprocessing

In [31]:
# Build the vocabulary from the training reviews only (num_words=5000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['review'])

# Convert each review to a list of token ids ...
train_sequences = tokenizer.texts_to_sequences(train_data['review'])
test_sequences = tokenizer.texts_to_sequences(test_data['review'])

# ... and pad/truncate every sequence to 200 ids
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [32]:
# Inspect the padded training sequences
X_train

array([[   0,    0,    0, ...,    3, 3598,  157],
       [   0,    0,    0, ...,  103,    9,  554],
       [   0,    0,    0, ...,    1,  213,   27],
       ...,
       [   0,    0,    0, ...,  138,    5,    3],
       [   0,    0,    0, ...,   10,   98,  131],
       [   0,    0,    0, ...,  152,   63,    9]])

In [33]:
# Inspect the padded test sequences
X_test

array([[   0,    0,    0, ...,    1,   86,   55],
       [   0,    0,    0, ..., 1193,    2,  340],
       [   0,    0,    0, ...,  269,  536, 1165],
       ...,
       [  10,  676,   12, ...,  104, 1380,   22],
       [   6,    3,   64, ...,   26,   77,   12],
       [   0,    0,    0, ...,  340,    7,    7]])

In [42]:
# Targets: the sentiment column of each split
Y_train, Y_test = train_data['sentiment'], test_data['sentiment']

In [43]:
# Inspect the training labels
Y_train

47808    1
20154    0
43069    0
19413    0
13673    0
        ..
31092    1
22917    0
47481    0
35597    0
27491    0
Name: sentiment, Length: 40000, dtype: int64

# Building LSTM Model

In [37]:
# Embedding -> LSTM -> single sigmoid unit for the binary sentiment label.
# input_dim=5000 and input_length=200 mirror the tokenizer's num_words and the
# padding length used for X_train / X_test.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    # NOTE(review): a non-zero recurrent_dropout is documented to disable the
    # fused cuDNN LSTM kernel on GPU, which slows training — confirm it is intended.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])



In [38]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 128)          640000    
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 771,713
Trainable params: 771,713
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train the model

In [44]:
# Train for 5 epochs in batches of 64, holding out 20% of the training data
# for validation. The returned History object is kept so the per-epoch metrics
# stay available (history.history) instead of being discarded after its repr
# is displayed.
history = model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1e60e408b50>

# Model Evaluation

In [45]:
# Score the trained model on the held-out test split and report both metrics
loss, accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

Test Loss: 0.32640039920806885
Test Accuracy: 0.8798999786376953


# Building a Predictive System

In [46]:
def predict_sentiment(review, threshold=0.5):
    """Classify a single movie review as "Positive" or "Negative".

    Relies on the notebook-level ``tokenizer`` and ``model`` objects, so both
    must already exist (fitted / trained) when this function is called.

    Args:
        review: Raw review text.
        threshold: Score the model output must exceed for the review to be
            labelled "Positive". Defaults to 0.5, the previously hard-coded
            cut-off.

    Returns:
        "Positive" if the predicted score is strictly greater than
        ``threshold``, otherwise "Negative".
    """
    # Same preprocessing as the training data: token ids padded/truncated to
    # maxlen=200, the length used when X_train was built.
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=200)
    # verbose=0 stops Keras from printing a progress bar on every single call.
    prediction = model.predict(padded_sequence, verbose=0)
    score = prediction[0][0]
    return "Positive" if score > threshold else "Negative"

In [47]:
# Example: a clearly positive review
new_review = (
    "I absolutely loved this movie! "
    "The plot was fantastic and the acting was superb."
)
print(predict_sentiment(new_review))

Positive


In [48]:
# Example: an explicitly negative review
new_review = "This movie was terrible. I did not enjoy it at all."
print(predict_sentiment(new_review))

Negative


In [49]:
# Example: a negated, mildly negative phrasing
# NOTE(review): "move" looks like a typo for "movie"; the string is model input, so it is left untouched here.
new_review = "It was a not that good move."
print(predict_sentiment(new_review))

Negative


# Save Model and Tokenizer

In [51]:
# Persist the trained model to 'sentiment_analysis_model.h5' in the working directory.
# NOTE(review): newer Keras releases recommend the native `.keras` format — confirm
# what downstream loaders expect before changing the filename.
model.save('sentiment_analysis_model.h5')

In [53]:
# Save the fitted tokenizer so the same word-to-index mapping can be reloaded
# later together with the saved model. (`pickle` is imported in the import
# cell at the top of the notebook.)
# NOTE: only unpickle this file from a trusted source; pickle.load can execute arbitrary code.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)