In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
import re

Using TensorFlow backend.


In [2]:
train_data = pd.read_csv('./data/tn_reviews.csv')

train_data.info

<bound method DataFrame.info of                                                  reviews  labels
0      russians never drop childrens toy fill explosi...       0
1      lady tramp ii colourfully animate songs especi...       1
2      could give movie less would certainly read rev...       0
3      dont think ive ever give something rat one eas...       0
4      funny bits come bill film quote zeitgeist keep...       0
...                                                  ...     ...
18517  western union something forget classic western...       1
18518  movie incredible piece work explore every nook...       1
18519  wife watch movie plan visit sicily stromboli s...       0
18520  first watch flatliners amaze necessary feature...       1
18521  would film good gross estimate award nominatio...       1

[18522 rows x 2 columns]>

In [3]:
valid_data = pd.read_csv('./data/vd_reviews.csv')

valid_data.info

<bound method DataFrame.info of                                                 reviews  labels
0     years since sharon stone award viewers legcros...       0
1     someone need make car payment truly awful make...       0
2     guidelines state comment must contain minimum ...       0
3     movie muddle mishmash clichÃ©s recent cinema pr...       0
4     stan laurel become smaller half alltime greate...       0
...                                                 ...     ...
4995  man love movie really take back kid days teach...       1
4996  recovery incredibly move piece work handle dev...       1
4997  take crook joint seem exceedingly difficult ta...       1
4998  futz show preserve experimental theatre moveme...       1
4999  mother tell recently widow mids mother two adu...       1

[5000 rows x 2 columns]>

In [4]:
test_data = pd.read_csv('./data/tt_reviews.csv')

test_data.info

<bound method DataFrame.info of                                                 reviews  labels
0     always write series complete stinkfest jim bel...       0
1     st watch dirsteve purcell typical mary kate as...       0
2     movie poorly write direct fell asleep minutes ...       0
3     interest thing miryang secret sunshine actors ...       1
4     first read berlin meer didnt expect much think...       0
...                                                 ...     ...
4995  kind picture john lassiter would make today we...       1
4996  must see saw whip press screen hilarious talk ...       1
4997  nbc ashamed wouldnt allow children see definit...       0
4998  movie clumsy mishmash various ghoststory suspe...       0
4999  formula movie illegitimate son rich chilenian ...       0

[5000 rows x 2 columns]>

In [5]:
max_fatures = 2000

In [6]:
tokenizer = Tokenizer(num_words=max_fatures, split=' ')

In [7]:
all_reviews = train_data['reviews']

all_reviews = all_reviews.append(valid_data['reviews'], ignore_index=True)
all_reviews = all_reviews.append(test_data['reviews'], ignore_index=True)

In [8]:
tokenizer.fit_on_texts(all_reviews.values)

In [9]:
train_x = tokenizer.texts_to_sequences(train_data['reviews'].values)

train_x = pad_sequences(train_x)

train_y = pd.get_dummies(train_data['labels']).values

train_x.shape

(18522, 844)

In [10]:
valid_x = tokenizer.texts_to_sequences(valid_data['reviews'].values)

valid_x = pad_sequences(valid_x, maxlen=train_x.shape[1])

valid_y = pd.get_dummies(valid_data['labels']).values

valid_x.shape

(5000, 844)

In [11]:
test_x = tokenizer.texts_to_sequences(test_data['reviews'].values)

test_x = pad_sequences(test_x, maxlen=train_x.shape[1])

test_y = pd.get_dummies(test_data['labels']).values

test_x.shape

(5000, 844)

In [12]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = train_x.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 844, 128)          256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 844, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
batch_size = 32
model.fit(
    train_x, train_y, 
    epochs = 15, 
    batch_size = batch_size, 
    verbose = 2,
    validation_data=(valid_x, valid_y)
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 18522 samples, validate on 5000 samples
Epoch 1/15
 - 464s - loss: 0.4586 - accuracy: 0.7810 - val_loss: 0.3526 - val_accuracy: 0.8554
Epoch 2/15
 - 458s - loss: 0.3685 - accuracy: 0.8451 - val_loss: 0.3584 - val_accuracy: 0.8462
Epoch 3/15
 - 461s - loss: 0.3274 - accuracy: 0.8658 - val_loss: 0.3526 - val_accuracy: 0.8496
Epoch 4/15
 - 460s - loss: 0.2939 - accuracy: 0.8816 - val_loss: 0.3551 - val_accuracy: 0.8554
Epoch 5/15
 - 462s - loss: 0.2694 - accuracy: 0.8935 - val_loss: 0.3501 - val_accuracy: 0.8540
Epoch 6/15
 - 469s - loss: 0.2562 - accuracy: 0.8963 - val_loss: 0.3623 - val_accuracy: 0.8604
Epoch 7/15
 - 464s - loss: 0.2412 - accuracy: 0.9034 - val_loss: 0.3630 - val_accuracy: 0.8578
Epoch 8/15
 - 463s - loss: 0.2274 - accuracy: 0.9109 - val_loss: 0.3807 - val_accuracy: 0.8648
Epoch 9/15
 - 471s - loss: 0.2155 - accuracy: 0.9142 - val_loss: 0.3923 - val_accuracy: 0.8598
Epoch 10/15
 - 470s - loss: 0.1910 - accuracy: 0.9265 - val_loss: 0.4076 - val_accuracy: 0.8514


<keras.callbacks.callbacks.History at 0x7fb177e06b10>

In [14]:
model.evaluate(test_x, test_y, verbose = 2, batch_size = batch_size)

[0.46003970744609834, 0.850600004196167]

In [15]:
model_json = model.to_json()

In [16]:
with open("output/model.json", "w") as json_file:
    json_file.write(model_json)

In [17]:
model.save_weights("output/model.h5")