In [1]:
import os
import cv2
import pandas as pd
import math
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# CSV files read through load_data() for the training and test splits.
train_file = "train_data_lstm.csv"
test_file = "test_data_lstm.csv"
# Path handed to ModelCheckpoint; the best-scoring model is written here during training.
MODEL_NAME = "trained_model_lstm.hdf5"

def load_data(file, direc="", sep=",", header=True):
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    file : str
        Name of the file to read.
    direc : str, optional
        Directory joined in front of ``file`` (empty string by default).
    sep : str, optional
        Field delimiter forwarded to ``pd.read_csv``.
    header : bool, optional
        When truthy, ``pd.read_csv`` keeps its default header handling;
        when falsy, ``header=None`` is passed so no row is used as header.

    Returns
    -------
    pandas.DataFrame
        The parsed file; ``index_col=False`` is always passed to the reader.
    """
    read_kwargs = {"sep": sep, "index_col": False}
    if not header:
        # Only override the reader's header handling for header-less files.
        read_kwargs["header"] = None
    return pd.read_csv(os.path.join(direc, file), **read_kwargs)
    

In [2]:
# Read the training CSV (default directory, comma separator, header row) into a DataFrame.
train_data = load_data(train_file)

In [3]:
# Preview the first rows of the training frame to sanity-check the columns.
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Sentiment
0,12030,25331,77550,29530,100950,100266,60307,94367,60162,58112,...,6715,68067,19381,95382,70521,79657,8895,33267,91198,0
1,77551,66326,26872,8372,21910,38042,94564,89805,41134,60162,...,42198,15660,78398,70521,9278,29662,56979,100133,33267,1
2,0,0,0,0,0,0,0,0,0,0,...,51596,12303,54934,18591,34435,81321,79657,33267,91198,0
3,0,0,0,0,0,0,0,0,0,0,...,73259,4943,34444,79657,52669,26476,67364,55571,24613,1
4,24351,52701,38067,26810,52608,5363,51639,78674,56036,34304,...,15105,54934,55379,28025,91457,81321,79657,100133,91198,1


In [4]:
# Read the test CSV (default directory, comma separator, header row) into a DataFrame.
test_data = load_data(test_file)

In [5]:
# Preview the first rows of the test frame to sanity-check the columns.
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Sentiment
0,31523,97993,83719,7854,51372,90328,2127,96036,82008,58112,...,7175,2208,18377,9167,8502,6821,42698,69134,56965,1
1,0,0,0,0,0,0,0,0,0,0,...,96858,30795,64810,184,1088,2378,73295,57381,78622,1
2,0,0,0,0,0,0,0,0,0,0,...,37710,74317,44001,83587,81362,59322,66714,89905,55571,0
3,23440,34874,84045,84637,100777,97429,55026,52710,64123,15702,...,35137,15105,60279,50629,5412,37918,81321,25372,43233,0
4,0,0,0,0,0,0,0,0,0,0,...,10550,67007,53005,38067,70377,50514,24180,57381,89286,0


In [6]:
# Inputs: every column except the "Sentiment" target.
train_features = train_data.drop(labels="Sentiment", axis="columns")
test_features = test_data.drop(labels="Sentiment", axis="columns")

# Targets: the "Sentiment" column, copied out of the frame and cast to int16.
train_labels = np.int16(train_data.loc[:, "Sentiment"].values.copy())
test_labels = np.int16(test_data.loc[:, "Sentiment"].values.copy())

In [11]:
from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Settings reused by the padding step here and by the training cell.
batch_size = 32
maxlen = train_features.shape[1]  # sequence length = number of feature columns

# Label arrays are independent of the padded inputs, so build them up front.
y_train = np.array(train_labels, copy=True)
y_test = np.array(test_labels, copy=True)

# Bring every row of both splits to exactly `maxlen` entries; the test split
# is sized to the training width.
x_train = sequence.pad_sequences(train_features.values, maxlen=maxlen)
x_test = sequence.pad_sequences(test_features.values, maxlen=maxlen)

# Report the resulting array shapes (inputs first, then labels).
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (25000, 100)
x_test shape: (25000, 100)
y_train shape: (25000,)
y_test shape: (25000,)


In [12]:
# Largest token id present in each split.
train_max_features = np.max(train_features.values)
test_max_features = np.max(test_features.values)

# Vocabulary size for the Embedding layer. Its input_dim must be strictly
# greater than the largest index it will see, i.e. (max index + 1).
# The previous `2 * max` doubled the embedding table without any benefit and
# evaluated to 0 (an invalid size) when the largest id was 0.
# The 0 floor mirrors the original starting value of max_features.
max_features = int(max(0, train_max_features, test_max_features)) + 1

In [13]:
# Bidirectional-LSTM binary classifier over integer token sequences:
# embedding -> BiLSTM -> dropout -> single sigmoid unit.
model = Sequential()
model.add(Embedding(max_features, 400, input_length=maxlen))
model.add(Bidirectional(LSTM(256)))
model.add(Dropout(0.8))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Save the full model (not just weights) to MODEL_NAME each time the
# validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(MODEL_NAME, monitor='val_acc', verbose=1,
                             save_best_only=True, save_weights_only=False,
                             mode='max', period=1)

print('Train...')
# validation_data is documented as a tuple (x_val, y_val); a list is not
# accepted consistently across Keras versions, so pass a tuple.
# NOTE(review): the test split doubles as the validation set here, so the
# checkpoint is selected on test data and val_acc is not an unbiased
# estimate of generalisation - consider holding out part of the training set.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=(x_test, y_test),
          callbacks=[checkpoint])

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/4

Epoch 00001: val_acc improved from -inf to 0.85724, saving model to trained_model_lstm.hdf5
Epoch 2/4

Epoch 00002: val_acc did not improve
Epoch 3/4
 1568/25000 [>.............................] - ETA: 5:00 - loss: 0.0721 - acc: 0.9847

KeyboardInterrupt: 