In [1]:
# Import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import tensorflow as tf
from tensorflow import keras
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import LSTM

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Mount Google Drive inside the Colab runtime; the dataset is read from it below.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [3]:
import os

# Make the Drive root the working directory so relative paths resolve against it.
drive_root = "/content/drive/My Drive/"
os.chdir(drive_root)

In [4]:
# Load the IMDB review dataset from Drive and preview the first rows.
dataset_path = '/content/drive/MyDrive/Datasets/IMDB Dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Remove HTML line breaks and punctuation
# The raw reviews contain literal "<br />" tags. Stripping punctuation alone
# turns each tag into a stray "br" token (and can glue it onto the preceding
# word), so the tags are replaced with a space before punctuation is removed.
df['review'] = (
    df['review']
    .str.replace(r'<br\s*/?>', ' ', regex=True)
    .str.translate(str.maketrans('', '', string.punctuation))
)

In [6]:
# Lowercase every review so that casing does not create distinct tokens.
lowercased_reviews = df['review'].str.lower()
df['review'] = lowercased_reviews

In [7]:
# Drop English stop words from each review.
stop_words = set(stopwords.words('english'))


def _without_stop_words(text):
    """Return `text` with every whitespace-separated stop word removed."""
    kept_words = (word for word in text.split() if word not in stop_words)
    return ' '.join(kept_words)


df['review'] = df['review'].apply(_without_stop_words)

In [8]:
# Replace each word with its Porter stem.
stemmer = PorterStemmer()


def _stem_words(text):
    """Return `text` with each whitespace-separated word replaced by its stem."""
    return ' '.join(map(stemmer.stem, text.split()))


df['review'] = df['review'].apply(_stem_words)

In [9]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.3,
)

In [10]:
# Print the sentiment class balance of each split.
# The second heading carries the leading newline that separates the two reports.
for heading, frame in (("Training Set:", train_df), ("\nTest Set:", test_df)):
    print(heading)
    print(frame['sentiment'].value_counts())

Training Set:
negative    17589
positive    17411
Name: sentiment, dtype: int64

Test Set:
positive    7589
negative    7411
Name: sentiment, dtype: int64


In [11]:
# Vectorize the text data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['review'])
y = np.array(df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0))

In [12]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
#Pad the sequences to a fixed length
max = 1000
X_train = sequence.pad_sequences(X_train.toarray(), maxlen=max)
X_test = sequence.pad_sequences(X_test.toarray(), maxlen=max)

In [14]:
# Define the model
model = Sequential()
model.add(Embedding(input_dim=len(vectorizer.vocabulary_), output_dim=32, input_length=max))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [15]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [16]:
# Fit the model; the test split is passed as validation data, so its metrics
# are reported after every epoch.
batch_size = 64
epochs = 10
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
