In [1]:
from __future__ import print_function
import os
import cv2
import pandas as pd
import math
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# CSV inputs for the two data splits, plus the file the Keras model is
# reloaded from / checkpointed to.
train_file, test_file = "train_data_lstm.csv", "test_data_lstm.csv"
MODEL_NAME = "trained_model_lstm.hdf5"

def load_data(file, direc="", sep=",", header=True):
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    file : str
        File name (or relative path) of the file to read.
    direc : str, optional
        Directory joined in front of ``file``. The default empty string
        leaves ``file`` unchanged.
    sep : str, optional
        Field delimiter forwarded to ``pandas.read_csv``.
    header : bool, optional
        When truthy, header handling is left to the ``read_csv`` default.
        When falsy, ``header=None`` is passed so every row is treated as data.

    Returns
    -------
    pandas.DataFrame
        The parsed table. ``index_col=False`` is always passed, so no column
        of the file is used as the index.
    """
    read_options = {"sep": sep, "index_col": False}
    if not header:
        # No header row in the file: let pandas number the columns itself.
        read_options["header"] = None
    return pd.read_csv(os.path.join(direc, file), **read_options)
    

In [2]:
# Read the training CSV into a DataFrame (load_data defaults: comma-separated, header row present).
train_data = load_data(train_file)

In [3]:
# Preview the first rows of the training frame as a quick sanity check of the parse.
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Sentiment
0,99187,2368,90203,51475,24966,62260,16205,63590,53094,76217,...,68811,14935,55817,79230,42445,61963,68850,76442,6285,0
1,33203,64713,28182,7070,24333,64043,85544,86493,26421,21093,...,96864,69006,31851,97929,4235,98062,73259,79230,35573,1
2,0,0,0,0,0,0,0,0,0,0,...,79825,74092,56950,47999,1183,85880,40416,20724,40419,0
3,48814,76438,11971,13458,90869,16009,29884,35267,26070,81912,...,92543,80160,75957,85808,85880,85125,19165,3730,70162,0
4,38407,83075,30942,69630,64068,100154,20913,94802,14091,37770,...,8899,79230,73259,30514,20193,61963,18991,45805,97918,0


In [4]:
# Read the test CSV into a DataFrame (load_data defaults: comma-separated, header row present).
test_data = load_data(test_file)

In [5]:
# Preview the first rows of the test frame as a quick sanity check of the parse.
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Sentiment
0,0,0,0,0,0,0,0,0,0,0,...,100025,8964,32398,53540,79686,60609,37182,49600,31491,0
1,0,0,0,0,0,39969,71564,28984,1317,28653,...,31775,17864,79693,37716,96071,33358,5215,56950,98226,1
2,33203,73295,45741,40901,24333,49592,22821,19556,7360,30900,...,18835,1318,992,76239,85880,18790,31024,96036,6940,1
3,33203,57395,100745,53532,85914,25041,92199,18712,63679,78424,...,84150,71781,7972,15360,44303,55006,53540,85880,58,1
4,98112,22958,14091,60192,82544,42867,62002,68726,30441,66207,...,96071,39503,55006,99839,36245,85880,63518,3239,54646,0


In [6]:
def _split_labels(frame, label_col="Sentiment"):
    """Separate the label column from the remaining feature columns.

    Returns a ``(labels, features)`` pair: ``labels`` is a fresh int16 NumPy
    array built from ``frame[label_col]``; ``features`` is a new DataFrame
    with that column dropped. ``frame`` itself is not modified.
    """
    # astype() copies by default, so the labels never alias the frame's data.
    labels = frame[label_col].values.astype(np.int16)
    features = frame.drop(label_col, axis=1)
    return labels, features


train_labels, train_features = _split_labels(train_data)
test_labels, test_features = _split_labels(test_data)

In [7]:
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model

# Mini-batch size handed to model.fit later on.
batch_size = 32
# Every sequence is padded/truncated to the number of feature columns.
maxlen = train_features.shape[1]

# Run both splits through pad_sequences with the same target length.
x_train, x_test = (
    sequence.pad_sequences(frame.values, maxlen=maxlen)
    for frame in (train_features, test_features)
)
y_train, y_test = np.array(train_labels), np.array(test_labels)

# Report the array shapes in the order x_train, x_test, y_train, y_test.
for _name, _arr in (("x_train", x_train), ("x_test", x_test),
                    ("y_train", y_train), ("y_test", y_test)):
    print(_name + ' shape:', _arr.shape)

Using TensorFlow backend.


x_train shape: (25000, 100)
x_test shape: (25000, 100)
y_train shape: (25000,)
y_test shape: (25000,)


In [8]:
# Largest value present in each split's feature matrix.
train_max_features = train_features.values.max()
test_max_features = test_features.values.max()

# Twice the largest value seen in either split, floored at 0; used as the
# Embedding layer's input_dim.
# NOTE(review): Keras documents input_dim as "maximum integer index + 1", so
# doubling looks larger than required -- confirm before shrinking it.
max_features = 2 * max(0, train_max_features, test_max_features)

In [None]:
# Resume from an existing checkpoint when one is on disk; otherwise build a
# fresh network below.
model = None
if os.path.exists(MODEL_NAME):
    try:
        model = load_model(MODEL_NAME)
        print("Loaded saved model: " + MODEL_NAME)
    except Exception as load_error:
        # Still best-effort (fall back to a new model), but catch Exception
        # rather than using a bare `except:` so kernel interrupts propagate
        # and the reason for the failed load is visible.
        print("Could not load " + MODEL_NAME + ": " + str(load_error))

if model is None:
    print("Creating new model: " + MODEL_NAME)
    # Embedding -> bidirectional LSTM -> dropout -> single sigmoid unit
    # (binary classifier over the padded integer sequences).
    # NOTE(review): every dropout rate here is 0.9, i.e. 90% of units are
    # dropped during training -- confirm this is intentional and not a
    # keep-probability.
    model = Sequential()
    model.add(Embedding(max_features, 256, input_length=maxlen))
    model.add(Bidirectional(LSTM(128, dropout=0.9, recurrent_dropout=0.9)))
    model.add(Dropout(0.9))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

# Save the full model to MODEL_NAME whenever validation accuracy improves.
# NOTE(review): the metric is logged as 'val_acc' in older Keras releases and
# as 'val_accuracy' in newer ones -- confirm `monitor` matches the installed
# version, otherwise no checkpoint is written.
checkpoint = ModelCheckpoint(MODEL_NAME, monitor='val_acc', verbose=1,
                             save_best_only=True, save_weights_only=False,
                             mode='max', period=1)

print('Train...')
# validation_data is passed as the documented (x, y) tuple; a list is not
# unpacked the same way by every Keras release.
# NOTE(review): the test split doubles as the validation set, so the "best"
# checkpoint is selected on test data -- consider a separate hold-out split.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(x_test, y_test),
          callbacks=[checkpoint])

Creating new model: trained_model_lstm.hdf5
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5