In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("IMDB Dataset.csv")
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
from bs4 import BeautifulSoup
import string

def clean(text):
    soup = BeautifulSoup(text)
    all_text = ''.join(soup.findAll(text = True))
    s = "123456789" + string.punctuation
    result = ''.join([i if i not in s else ' ' for i in all_text])
    result = result.replace('  ', ' ')
    return result

In [4]:
df.review = df.review.apply(clean)

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there s a family where a little boy ...,negative
4,Petter Mattei s Love in the Time of Money is a...,positive


In [6]:
df.dtypes

review       object
sentiment    object
dtype: object

In [7]:
# covnert sentiment to categorical data
df.sentiment = pd.Categorical(df.sentiment)
df.dtypes


review         object
sentiment    category
dtype: object

In [8]:
# encode sentiment categorical data into binary numbers
df.sentiment = df.sentiment.cat.codes
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tech...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there s a family where a little boy ...,0
4,Petter Mattei s Love in the Time of Money is a...,1


3. Convert the text to sequences

In [9]:
from keras.preprocessing.text import Tokenizer

In [10]:
# Define a Keras tokenizer
tokenizer = Tokenizer(num_words = 5000) 

In [11]:
from sklearn.model_selection import train_test_split

# split into train + validation, test
X_train, X_test, y_train, y_test = train_test_split(df.review, df.sentiment, test_size = 0.2, random_state = 42)

In [12]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.datasets import imdb
from keras.callbacks import EarlyStopping

In [13]:
tokenizer.fit_on_texts(X_train) # Fit the tokenizer on the text


X_train = tokenizer.texts_to_sequences(X_train) # Convert the text to sequences
X_test = tokenizer.texts_to_sequences(X_test)

In [14]:
max_features = 20000
maxlen = 80 # cut texts after this number of words (among top max_features most common words)
batch_size = 128

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen = maxlen)
X_test = sequence.pad_sequences(X_test, maxlen = maxlen)

print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

Pad sequences (samples x time)
x_train shape: (40000, 80)
x_test shape: (10000, 80)


In [15]:
print('Build model...')
model = Sequential()

# keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform', embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=False, input_length=None)
model.add(Embedding(input_dim = max_features, output_dim = 100, input_shape = (maxlen, ), trainable = True))

model.add(Flatten())

model.add(Dense(128))

model.add(Dense(1, activation = 'sigmoid'))

print(model.summary())


Build model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 80, 100)           2000000   
_________________________________________________________________
flatten (Flatten)            (None, 8000)              0         
_________________________________________________________________
dense (Dense)                (None, 128)               1024128   
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 3,024,257
Trainable params: 3,024,257
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
from keras.utils import plot_model

plot_model(model, to_file = 'model.png')

es = EarlyStopping(monitor = 'val_loss', mode = 'min', patience = 10)

# try using different optimizers and different optimizer configs
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

print('Train...')
model.fit(X_train, y_train, batch_size = batch_size, epochs = 5, validation_split = 0.2, callbacks = [es])
score, acc = model.evaluate(X_test, y_test, batch_size = batch_size)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')
Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.839043378829956
Test accuracy: 0.8180999755859375
