In [5]:
!pip install kaggle


Collecting kaggle
  Downloading kaggle-1.6.14.tar.gz (82 kB)
     ---------------------------------------- 0.0/82.1 kB ? eta -:--:--
     ---------------------------------------- 0.0/82.1 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/82.1 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/82.1 kB ? eta -:--:--
     -------------- ----------------------- 30.7/82.1 kB 262.6 kB/s eta 0:00:01
     -------------------------------------  81.9/82.1 kB 508.4 kB/s eta 0:00:01
     -------------------------------------- 82.1/82.1 kB 460.6 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py): started
  Building wheel for kaggle (setup.py): finished with status 'done'
  Created wheel for kaggle: filename=kaggle-1.6.14-py3-none-any.whl size=105130 sha256=0648694c522d8463404c52ba4a32e03a14b9588921172fc08b4056be7

**Importing the Dependencies**

In [89]:
import os
import json

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

**Data Collection - Kaggle API**

In [None]:
# Read the Kaggle API credentials. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open("kaggle.json") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

In [None]:
kaggle_dictionary.keys()

dict_keys(['username', 'key'])

In [None]:
# Publish the Kaggle credentials as environment variables in a single update.
os.environ.update(
    KAGGLE_USERNAME=kaggle_dictionary["username"],
    KAGGLE_KEY=kaggle_dictionary["key"],
)

In [None]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 97% 25.0M/25.7M [00:01<00:00, 33.7MB/s]
100% 25.7M/25.7M [00:01<00:00, 20.9MB/s]


In [None]:
!ls

imdb-dataset-of-50k-movie-reviews.zip  kaggle.json  sample_data


In [None]:
# Unpack every member of the downloaded archive into the current working directory.
dataset_zip_path = "imdb-dataset-of-50k-movie-reviews.zip"
with ZipFile(dataset_zip_path) as dataset_archive:
  dataset_archive.extractall()

In [None]:
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


**Loading the Dataset**

In [91]:
# Load the extracted CSV into a DataFrame. Without this step `data` is
# undefined when the notebook is run top to bottom on a fresh kernel.
data = pd.read_csv("IMDB Dataset.csv")
data.shape

(50000, 2)

In [92]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [93]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [94]:
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [95]:
# Encode the string labels as integers: positive -> 1, negative -> 0.
label_encoding = {"positive": 1, "negative": 0}
data["sentiment"] = data["sentiment"].replace(label_encoding)

In [96]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [97]:
data["sentiment"].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [86]:
# Hold out 20% of the rows as test data; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

In [16]:
print(train_data.shape)
print(test_data.shape)

(40000, 2)
(10000, 2)


**Data Preprocessing**

In [17]:
# Build the vocabulary from the training reviews only (num_words caps it at 5000).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Convert each split to integer sequences, then pad them with maxlen=200.
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [18]:
print(X_train)

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]


In [19]:
print(X_test)

[[   0    0    0 ...  995  719  155]
 [  12  162   59 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2305]
 [   0    0    0 ...    1  332   27]]


In [20]:
# Target labels: the encoded sentiment column of each split.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [21]:
print(Y_train)

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64


**LSTM - Long Short-Term Memory**

In [24]:
# build the model
# Layer stack: embedding -> single LSTM layer -> one sigmoid output unit.
model = Sequential(
    [
        Embedding(input_dim=5000, output_dim=128),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation="sigmoid"),
    ]
)

In [25]:
model.summary()

In [26]:
# compile the model
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [98]:
from sklearn.model_selection import KFold
import numpy as np

# Number of cross-validation folds
k_folds = 5

# Shuffled K-fold splitter; the fixed seed makes fold membership repeatable
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Per-fold evaluation metrics, filled in by the cross-validation loop
accuracy_scores, loss_scores = [], []


In [99]:

# Start from empty metric lists so that re-running this cell does not
# accumulate scores from a previous run.
accuracy_scores = []
loss_scores = []

# Iterate over each fold
for fold, (train_indices, val_indices) in enumerate(kfold.split(data)):
    print(f"Fold {fold+1}/{k_folds}")

    # Split the data into training and validation sets
    train_data_fold, val_data_fold = data.iloc[train_indices], data.iloc[val_indices]

    # Everything below uses fold-specific names. Reusing `tokenizer`, `X_train`,
    # `Y_train` or `model` here would overwrite the notebook-level objects that
    # the training, evaluation and prediction cells rely on.
    fold_tokenizer = Tokenizer(num_words=5000)
    fold_tokenizer.fit_on_texts(train_data_fold["review"])
    X_train_fold = pad_sequences(
        fold_tokenizer.texts_to_sequences(train_data_fold["review"]), maxlen=200
    )
    X_val_fold = pad_sequences(
        fold_tokenizer.texts_to_sequences(val_data_fold["review"]), maxlen=200
    )
    Y_train_fold = train_data_fold["sentiment"]
    Y_val_fold = val_data_fold["sentiment"]

    # Define and compile a fresh model for this fold
    fold_model = Sequential()
    fold_model.add(Embedding(input_dim=5000, output_dim=128))
    fold_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    fold_model.add(Dense(1, activation="sigmoid"))
    fold_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # Train the model
    fold_model.fit(X_train_fold, Y_train_fold, epochs=5, batch_size=32, verbose=1)

    # Evaluate on the validation set; evaluate() returns [loss, accuracy]
    # in the order given to compile().
    val_loss, val_accuracy = fold_model.evaluate(X_val_fold, Y_val_fold, verbose=0)
    accuracy_scores.append(val_accuracy)
    loss_scores.append(val_loss)

# Calculate and print average metrics across all folds
print("Average Accuracy:", np.mean(accuracy_scores))
print("Average Loss:", np.mean(loss_scores))


Fold 1/5
Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 218ms/step - accuracy: 0.7340 - loss: 0.5177
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 189ms/step - accuracy: 0.8514 - loss: 0.3513
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 192ms/step - accuracy: 0.8781 - loss: 0.2990
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 189ms/step - accuracy: 0.8930 - loss: 0.2631
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 192ms/step - accuracy: 0.9167 - loss: 0.2114
Fold 2/5
Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 188ms/step - accuracy: 0.7271 - loss: 0.5343
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 190ms/step - accuracy: 0.8095 - loss: 0.4217
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 190ms/step - accuracy: 0.89

**Training the Model**

In [27]:
# Fit for 5 epochs in batches of 64, reserving 20% of the training arrays for validation.
model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 376ms/step - accuracy: 0.7222 - loss: 0.5316 - val_accuracy: 0.8509 - val_loss: 0.3543
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 361ms/step - accuracy: 0.8484 - loss: 0.3592 - val_accuracy: 0.8606 - val_loss: 0.3385
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 355ms/step - accuracy: 0.8670 - loss: 0.3146 - val_accuracy: 0.8630 - val_loss: 0.3361
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 378ms/step - accuracy: 0.8830 - loss: 0.2872 - val_accuracy: 0.8673 - val_loss: 0.3225
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 346ms/step - accuracy: 0.9125 - loss: 0.2260 - val_accuracy: 0.8730 - val_loss: 0.3313


<keras.src.callbacks.history.History at 0x1de1d02d450>

**Model Evaluation**

In [28]:
# Score the model on the held-out test arrays and report both metrics.
loss, accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}\nTest Accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 77ms/step - accuracy: 0.8715 - loss: 0.3127
Test Loss: 0.3107053339481354
Test Accuracy: 0.8758000135421753


**Building a Predictive System**

In [101]:
def predict_sentiment(review, threshold=0.5, maxlen=200):
  """Classify a single review as "positive" or "negative".

  Relies on the notebook-level `tokenizer` and `model` objects.

  Args:
    review: Raw review text.
    threshold: Model output above which the review is labelled "positive".
      Defaults to 0.5.
    maxlen: Length passed to `pad_sequences`. Defaults to 200, the value used
      when the training data was padded; keep the two in sync.

  Returns:
    "positive" if the model output for the review is strictly greater than
    `threshold`, otherwise "negative".
  """
  # tokenize and pad the review; it is wrapped in a list to form a batch of one
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=maxlen)
  prediction = model.predict(padded_sequence)
  # first (only) sample in the batch, first (only) output value
  score = prediction[0][0]
  return "positive" if score > threshold else "negative"

In [119]:
# example usage
new_review = (
    "That movie was not at all disgusting as others were saying "
    "rather i think it's wonderful of it's own kind"
)
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
The sentiment of the review is: positive


## Tricky Negative Statements

In [117]:
# example usage
# One review per element. Adjacent string literals with no comma between them
# are silently concatenated, which would merge two reviews into one.
l1 = [
    "What a fantastic way to waste two hours of my life.",
    "The visuals were impressive, but everything else was a mess.",
    "It wasn't all bad, but the bad parts were hard to overlook.",
    "I've seen worse movies, but this one is definitely down there.",
    "For a critically acclaimed film, it was surprisingly underwhelming.",
    "If you don't mind terrible acting, you might find something to like here.",
    "The dialogue was tedious, although the set design was nice.",
    "Even with its high production value, the storyline fell flat.",
    "I had high hopes, but it failed to deliver on almost every front.",
    "It had potential, but ultimately, it was a letdown.",
    "Despite its flaws, there's something oddly charming about this movie",
]
for i in l1:
    # Keep the prediction for this review; otherwise the print below would show
    # whatever `sentiment` value an earlier cell happened to leave behind.
    sentiment = predict_sentiment(i)
    print(f"The sentiment of the review {i} is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
The sentiment of the review What a fantastic way to waste two hours of my life. is: negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
The sentiment of the review The visuals were impressive, but everything else was a mess. is: negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
The sentiment of the review It wasn't all bad, but the bad parts were hard to overlook. is: negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
The sentiment of the review I've seen worse movies, but this one is definitely down there. is: negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
The sentiment of the review For a critically acclaimed film, it was surprisingly underwhelming. is: negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
The sentiment of the review If you don't mind terrible acti

In [124]:

# Classify one more review and print the predicted label.
new_review="it is a good movie."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
The sentiment of the review is: positive
