In [2]:
import pandas as pd  
import numpy as np    
from sklearn.model_selection import train_test_split     
from tensorflow.keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences  
from tensorflow.keras.models import Sequential     
from tensorflow.keras.layers import Embedding, LSTM, Dense 
from tensorflow.keras.callbacks import ModelCheckpoint   
from tensorflow.keras.models import load_model   
import re

# Load data

In [4]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocess data

In [6]:
x_data = df['review']    
y_data = df['sentiment']  

x_data = x_data.replace({'<.*?>': ''}, regex = True)          
x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     
x_data = x_data.apply(lambda review: [w.lower() for w in review])  

y_data = y_data.replace('positive', 1)
y_data = y_data.replace('negative', 0)

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
20336    [a, c, t, u, a, l, l, y,  , i,  , m,  , s, u, ...
8500     [t, h, i, s,  , m, o, v, i, e,  , i,  , v, e, ...
34545    [m, y,  , w, i, f, e,  , a, n, d,  , i,  , e, ...
24651    [t, h, e,  , a, c, c, o, u, n, t, s,  , s, e, ...
4843     [t, i, m, e,  , a, n, d,  , t, i, m, e,  , a, ...
                               ...                        
37764    [w, h, i, l, e,  , n, o, t,  , a, s,  , b, a, ...
16512    [i,  , f, i, r, s, t,  , s, a, w,  , a,  , p, ...
18770    [i,  , w, a, s,  , s, o,  , l, o, o, k, i, n, ...
45047    [t, h, e,  , o, r, i, g, i, n, a, l,  , d, e, ...
45640    [w, h, a, t,  , a,  , s, h, a, m, e,  , t, h, ...
Name: review, Length: 40000, dtype: object 

6314     [i,  , d, o, n,  , t,  , l, i, k, e,  , s, e, ...
48750    [i,  , s, a, w,  , t, h, e,  , f, i, l, m,  , ...
8199     [t, h, e, r, e,  , a, r, e,  , s, i, m, i, l, ...
26910    [i,  , s, a, w,  , c, h, a, n,  , i, s,  , m, ...
29595    [m, i, l, d,  , s, p, o, i, l, e, r, s, i, n, ...
 

# Tokenize data

In [12]:
review_length = [len(x) for x in x_train]
max_length = int(np.ceil(np.mean(review_length)))

In [13]:
token = Tokenizer(lower=False)
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 4 13  3 ...  0  0  0]
 [ 3 10  5 ...  0  0  0]
 [14 18  1 ...  0  0  0]
 ...
 [ 5  1 19 ...  0  0  0]
 [ 3 10  2 ...  0  0  0]
 [19 10  4 ...  8  1 14]] 

Encoded X Test
 [[ 5  1 12 ...  0  0  0]
 [ 5  1  7 ...  0  0  0]
 [ 3 10  2 ... 12  2  7]
 ...
 [ 7 10  4 ...  0  0  0]
 [ 3 10  5 ...  0  0  0]
 [ 4  3  1 ...  0  0  0]] 

Maximum review length:  1288


# Build model

In [14]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Metal device set to: Apple M2

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1288, 32)          896       
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 25,793
Trainable params: 25,793
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5)

Epoch 1/5


2023-05-14 17:24:23.330262: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x176e34f40>

# Evaluate Model

In [16]:
y_pred = model.predict(x_test)



In [18]:
y_pred = [0 if x < 0.5 else 1 for x in y_pred]

In [21]:
accuracy = y_pred.count(1)/len(y_pred)
accuracy

0.9547