In [28]:
import pandas as pd  
import numpy as np    
from sklearn.model_selection import train_test_split     
from tensorflow.keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences  
from tensorflow.keras.models import Sequential     
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalMaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint   
from tensorflow.keras.models import load_model   
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Conv2D

# Load data

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocess data

In [3]:
x_data = df['review']    
y_data = df['sentiment']  

x_data = x_data.replace({'<.*?>': ''}, regex = True)          
x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     
x_data = x_data.apply(lambda review: [w.lower() for w in review])  

y_data = y_data.replace('positive', 1)
y_data = y_data.replace('negative', 0)

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
34194    [i,  , s, a, w,  , t, h, i, s,  , f, i, l, m, ...
48493    [d, i, s, t, a, s, t, e, f, u, l,  ,  , c, l, ...
48934    [t, h, i, s,  , h, a, s,  , g, o, t,  , t, o, ...
44182    [ , c, h, i, p, s,  ,  , i, s,  , a, n,  , e, ...
41795    [w, h, a, t,  , i, n,  , g, o, d,  , s,  , n, ...
                               ...                        
10155    [f, i, r, s, t, l, y,  ,  , i,  , w, o, u, l, ...
19368    [i,  , m,  , n, o, t,  , a, l, o, n, e,  , i, ...
3733     [i,  , m,  , s, o, r, r, y,  ,  , b, u, t,  , ...
12482    [i,  , s, a, w,  , t, h, i, s,  , m, o, v, i, ...
3089     [t, h, i, s,  , f, i, l, m,  , s, h, o, w, s, ...
Name: review, Length: 40000, dtype: object 

31497    [a, s,  , a,  , w, r, i, t, i, n, g,  , t, e, ...
2364     [o, n, e,  , o, f,  , m, y,  , d, e, s, i, r, ...
3878     [f, i, r, s, t,  , w, e, e, k,  , o, f,  , m, ...
9059     [f, u, l, l,  , m, a, r, k, s,  , f, o, r,  , ...
2340     [i,  , l, o, v, e,  , a, a, r, o, n,  , c, a, ...
 

# Tokenize data

In [18]:
review_length = [len(x) for x in x_train]
max_length = int(np.ceil(np.mean(review_length)))
max_length

1286

In [6]:
token = Tokenizer(lower=False)
token.fit_on_texts(x_train)

x_train = token.texts_to_sequences(x_train) # text to dynamic-size array of id
x_test = token.texts_to_sequences(x_test)

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post') # array of id to fixed-size array of id (size = 1286)
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 5  1  7 ...  0  0  0]
 [12  5  7 ...  0  0  0]
 [ 3 10  5 ...  0  0  0]
 ...
 [ 5  1 14 ...  0  0  0]
 [ 5  1  7 ...  0  0  0]
 [ 3 10  5 ...  0  0  0]] 

Encoded X Test
 [[ 4  7  1 ...  0  0  0]
 [ 6  8  2 ...  0  0  0]
 [16  5  9 ... 13  5  3]
 ...
 [ 5  9  9 ...  0  0  0]
 [ 1  1  1 ...  0  0  0]
 [ 3 12 18 ...  0  0  0]] 

Maximum review length:  1286


In [14]:
x_train.shape, total_words

((40000, 1286), 28)

# Build model

In [33]:
model = Sequential()
model.add(Embedding(total_words, output_dim=64, input_length = max_length)) # input: (None, 1286 words,)  -> output(None, 1286 words, 32 bytes each word)
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 1286, 64)          1792      
                                                                 
 conv1d_4 (Conv1D)           (None, 1282, 128)         41088     
                                                                 
 global_max_pooling1d_3 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_7 (Dense)             (None, 24)                3096      
                                                                 
 dense_8 (Dense)             (None, 1)                 25        
                                                                 
Total params: 46,001
Trainable params: 46,001
Non-trainable params: 0
__________________________________________________

In [40]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x4080307f0>

# Evaluate Model

In [41]:
y_pred = model.predict(x_test)



In [46]:
y_pred = [0 if x < 0.5 else 1 for x in y_pred]
accuracy = y_pred.count(1)/len(y_pred)
accuracy

0.4524