In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# https://machinelearningmastery.com/predict-sentiment-movie-reviews-using-deep-learning/
from keras.datasets import imdb
top_words = 5000
max_words = 500

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=top_words)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Using TensorFlow backend.


25000 train sequences
25000 test sequences


In [3]:
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

x_train shape: (25000,)
x_test shape: (25000,)


In [4]:
import numpy
X = numpy.concatenate((x_train, x_test), axis=0)
# Summarize number of words
print("Number of words: ")
print(len(numpy.unique(numpy.hstack(X))))

Number of words: 
4998


In [5]:
from keras.preprocessing import sequence
X_train = sequence.pad_sequences(x_train, maxlen=max_words)
X_test = sequence.pad_sequences(x_test, maxlen=max_words)

In [6]:
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

x_train shape: (25000, 500)
x_test shape: (25000, 500)


In [7]:
from sklearn.model_selection import train_test_split
X_temp = X_train
y_temp = y_train
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size = 0.16)

len(X_train) # 60, 20, 20 split

21000

In [8]:
import pickle

preprocessed_data = (X_train, y_train, X_val, y_val, X_test, y_test)

pickle_out = open("preprocessed_CNN.p","wb")
pickle.dump(preprocessed_data, pickle_out)
pickle_out.close()
print('done')

done


In [9]:
import pickle
pickle_in = open("preprocessed_CNN.p","rb")
preprocessed_data = pickle.load(pickle_in)
X_train, y_train, X_val, y_val, X_test, y_test = preprocessed_data

In [10]:
# CNN for the IMDB problem
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

In [11]:
# create the model
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
adam = Adam(lr=1e-3)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
callbacks = [EarlyStopping(monitor='val_loss')]
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               2000250   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
Total params: 2,163,605
Trainable params: 2,163,605
Non-trainable params: 0
____________________________________________

In [12]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128, verbose=2, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 21000 samples, validate on 25000 samples
Epoch 1/10
 - 18s - loss: 0.4550 - accuracy: 0.7623 - val_loss: 0.3185 - val_accuracy: 0.8640
Epoch 2/10
 - 17s - loss: 0.2168 - accuracy: 0.9165 - val_loss: 0.2887 - val_accuracy: 0.8782
Epoch 3/10
 - 17s - loss: 0.1540 - accuracy: 0.9449 - val_loss: 0.3114 - val_accuracy: 0.8738


<keras.callbacks.callbacks.History at 0x641f19610>

In [13]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 87.38%
