In [101]:
import json
import os
from zipfile import ZipFile

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [78]:
# Load the Kaggle API credentials from the local kaggle.json file.
# The context manager closes the file handle as soon as the JSON is parsed
# (the bare open() call previously left it open until garbage collection).
with open("kaggle.json") as credentials_file:
  kaggle = json.load(credentials_file)

In [79]:
# Copy the credentials into environment variables
# (presumably so the `kaggle` CLI invoked later can authenticate).
for env_name, field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
  os.environ[env_name] = kaggle[field]


In [80]:
# Download the IMDB 10k-review sentiment dataset archive with the Kaggle CLI.
# Per the recorded output, the CLI skips the download when a newer local copy exists.
!kaggle datasets download -d volodymyrgavrysh/imdb-sentiment-10k-reviews-binary-classification



Dataset URL: https://www.kaggle.com/datasets/volodymyrgavrysh/imdb-sentiment-10k-reviews-binary-classification
License(s): CC0-1.0
imdb-sentiment-10k-reviews-binary-classification.zip: Skipping, found more recently modified local copy (use --force to force download)


In [81]:
# List the working directory to confirm the dataset archive is present.
!ls

imdb_10K_sentimnets_reviews.csv			      kaggle.json
imdb-sentiment-10k-reviews-binary-classification.zip  sample_data


In [82]:
# Unpack every member of the downloaded archive into the current working directory.
with ZipFile("imdb-sentiment-10k-reviews-binary-classification.zip", mode="r") as archive:
  archive.extractall()

In [83]:
# List the working directory again to confirm the CSV is available after extraction.
!ls

imdb_10K_sentimnets_reviews.csv			      kaggle.json
imdb-sentiment-10k-reviews-binary-classification.zip  sample_data


In [84]:
# Read the extracted reviews into a DataFrame and preview the first rows.
csv_path = "imdb_10K_sentimnets_reviews.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,review,sentiment
0,"Okay, I know this does'nt project India in a g...",1
1,Despite John Travolta's statements in intervie...,0
2,"I am a kung fu fan, but not a Woo fan. I have ...",1
3,He seems to be a control freak. I have heard h...,0
4,"Admittedly, there are some scenes in this movi...",1


In [85]:
# (rows, columns) of the loaded data; the recorded run showed (10000, 2).
df.shape

(10000, 2)

In [87]:
# Class-balance check: number of reviews per sentiment label (1 vs 0).
df.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,5037
0,4963


In [88]:
# Split the data: hold out 20% of the rows for testing.
# The fixed random_state keeps the split identical across runs.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [90]:

# Tokenize text data: convert every review into a fixed-length sequence of
# integer word indices.
# Tokenizer and pad_sequences come from tensorflow.keras.preprocessing and must
# be imported in the notebook's import cell, otherwise this cell raises
# NameError on a fresh kernel.
VOCAB_SIZE = 5000  # num_words cap for the tokenizer; must agree with the Embedding input_dim
MAX_LEN = 200      # every review is padded/truncated to this many tokens

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
# Fit on the training split only, so the vocabulary is not influenced by test reviews.
tokenizer.fit_on_texts(train_data["review"])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=MAX_LEN)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=MAX_LEN)

In [93]:
# Target labels (the "sentiment" column) for the training and test splits.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [94]:
# Inspect the training labels; the recorded run showed 8000 int64 values.
print(Y_train)

9254    1
1561    0
1670    1
6087    1
6669    0
       ..
5734    0
5191    0
5390    1
860     1
7270    1
Name: sentiment, Length: 8000, dtype: int64


In [104]:
# Build the model: word embedding -> LSTM -> single sigmoid unit that outputs
# the probability of a positive review.
model = Sequential([
    # input_dim=5000 matches the tokenizer's num_words. The deprecated
    # `input_length` argument was removed (Keras 3 warns about it and ignores
    # it); the sequence length of 200 is set by the explicit model.build(...) call.
    Embedding(input_dim=5000, output_dim=128),
    # NOTE(review): a non-zero recurrent_dropout is documented to disable the
    # fused cuDNN LSTM kernel, which may explain the slow steps - confirm before changing.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation="sigmoid")
])

In [108]:
# Build the model for batches of 200-token sequences (batch dimension left
# unspecified), then print the layer-by-layer summary.
input_shape = (None, 200)
model.build(input_shape=input_shape)
model.summary()

In [109]:
# Configure training: Adam optimizer, binary cross-entropy loss for the
# sigmoid output, and accuracy reported as the metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [110]:
# Train for 5 epochs in mini-batches of 64, holding out 20% of the training
# data for validation. The returned History object is the cell's output.
model.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 679ms/step - accuracy: 0.6146 - loss: 0.6507 - val_accuracy: 0.7400 - val_loss: 0.5374
Epoch 2/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 691ms/step - accuracy: 0.8277 - loss: 0.4095 - val_accuracy: 0.8275 - val_loss: 0.3867
Epoch 3/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 663ms/step - accuracy: 0.8786 - loss: 0.3125 - val_accuracy: 0.8144 - val_loss: 0.4244
Epoch 4/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 714ms/step - accuracy: 0.9119 - loss: 0.2308 - val_accuracy: 0.8138 - val_loss: 0.4357
Epoch 5/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 687ms/step - accuracy: 0.9375 - loss: 0.1818 - val_accuracy: 0.7937 - val_loss: 0.5067


<keras.src.callbacks.history.History at 0x7dcd10a8f650>

In [None]:
# Evaluate the trained model on the held-out test split and report both numbers.
loss, accuracy = model.evaluate(X_test, Y_test)
for label, value in (("loss", loss), ("accuracy", accuracy)):
  print(label, value)

In [113]:
# Helper that classifies one raw review string with the trained model.
def sentimentpred(review):
  """Return "positive" or "negative" for a single review text.

  The text is encoded with the notebook-level `tokenizer`, padded/truncated to
  200 tokens, and scored by the notebook-level `model`.

  Args:
    review: The review text to classify.

  Returns:
    "positive" when the predicted score is strictly greater than 0.5,
    otherwise "negative".
  """
  # texts_to_sequences expects a list of texts, so wrap the single review.
  encoded = tokenizer.texts_to_sequences([review])
  model_input = pad_sequences(encoded, maxlen=200)
  # predict returns one row per input with a single sigmoid output each.
  score = model.predict(model_input)[0][0]
  if score > 0.5:
    return "positive"
  return "negative"

In [115]:
# Spot-check the classifier on a hand-written review (the recorded run printed "negative").
new_review = "It was really long and boring, nothing special."
sentiment = sentimentpred(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 416ms/step
The sentiment of the review is: negative


In [120]:
# Spot-check the classifier on an enthusiastic review (the recorded run printed "positive").
new_review = "AMAZING. we really enjoyed this film"
sentiment = sentimentpred(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
The sentiment of the review is: positive


In [122]:
# Spot-check the classifier on a less explicit review (the recorded run printed "negative").
new_review = "what was that? was it suppose to be horror..."
sentiment = sentimentpred(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
The sentiment of the review is: negative
