In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import pickle 
import os

keras = tf.keras
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import LSTM, CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler

Using TensorFlow backend.


In [2]:
tf.__version__

'1.12.0'

# Import Data

In [3]:
def load_directory_data(directory):
    data={}
    data["sentence"] = []
    data["sentiment"] = []
    for file_path in os.listdir(directory):
        with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
            data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
    return pd.DataFrame.from_dict(data)

def load_dataset(directory):
    pos_df = load_directory_data(directory + "/pos")
    neg_df = load_directory_data(directory + "/neg")
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

In [4]:
train_df = load_dataset("data/aclImdb/train/")
test_df = load_dataset("data/aclImdb/test/")

In [5]:
x_train = train_df['sentence']
y_train = train_df['polarity']
x_test = test_df['sentence']
y_test = test_df['polarity']

In [6]:
# tokenizer = text.Tokenizer()
# tokenizer.fit_on_texts(list(x_train) + list(x_test))

# with open('tokenizer_imdb.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
with open('tokenizer_imdb.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [8]:
max_length = max([len(s.split()) for s in (x_train + x_test)])
vocab_size = len(tokenizer.word_index) + 1

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)

# Load Embeddings

In [37]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            pass
    return embedding_matrix

In [38]:
EMBEDDING_FILES = [
    'embeddings/crawl-300d-2M.vec',
    'embeddings/glove.840B.300d.txt'
]

In [11]:
# embedding_matrix = np.concatenate(
#     [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)
#  np.savetxt('embeddings_concat.txt', embedding_matrix, fmt='%d')

In [39]:
embedding_matrix = np.loadtxt('embeddings/embeddings_concat.txt', dtype=int)

# Define Model

In [13]:
BATCH_SIZE = 64
LSTM_UNITS = 128
DENSE_UNITS = LSTM_UNITS * 4

def build_model(embedding_matrix):
    words = Input(shape=(max_length,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, 
                           return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, 
                           return_sequences=True))(x)

    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    hidden = add([hidden, Dense(DENSE_UNITS, activation='relu')(hidden)])
    hidden = Dropout(0.2)(hidden)
    hidden = add([hidden, Dense(DENSE_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)
    
    model = Model(inputs=words, outputs=result)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

# Train Model

In [15]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

tbCallBack = keras.callbacks.TensorBoard(log_dir='/media/eigenstir/1TBSecondary/tbgraphs', histogram_freq=0, write_graph=True, write_images=True)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
lrs = LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch))

In [17]:
EPOCHS = 4
NUM_MODELS = 2
weights = []

for model_idx in range(NUM_MODELS):
    model = build_model(embedding_matrix)
    for global_epoch in range(EPOCHS):
        model.fit(
        x_train,
        y_train,
        validation_data = (x_test, y_test),
        batch_size=BATCH_SIZE,
        epochs=1,
        callbacks=[tbCallBack,
                   es, lrs,
                  ModelCheckpoint('pre-deploy_models/imdbNonEager_bestModel{}.h5'.format(str(model_idx)), 
                     monitor='val_acc', 
                     mode='max', 
                     verbose=1, 
                     save_best_only=True)])
        model.save_weights('pre-deploy_models/imdbNonEager{}.h5'.format(str(model_idx)))
        weights.append(2 ** global_epoch)

        

Train on 25000 samples, validate on 25000 samples
Epoch 1/1

Epoch 00001: val_acc improved from -inf to 0.67392, saving model to pre-deploy_models/imdbNonEager_bestModel0.h5
Train on 25000 samples, validate on 25000 samples
Epoch 1/1

Epoch 00001: val_acc improved from -inf to 0.71732, saving model to pre-deploy_models/imdbNonEager_bestModel0.h5
Train on 25000 samples, validate on 25000 samples
Epoch 1/1

Epoch 00001: val_acc improved from -inf to 0.72488, saving model to pre-deploy_models/imdbNonEager_bestModel0.h5
Train on 25000 samples, validate on 25000 samples
Epoch 1/1

Epoch 00001: val_acc improved from -inf to 0.72852, saving model to pre-deploy_models/imdbNonEager_bestModel0.h5
Train on 25000 samples, validate on 25000 samples
Epoch 1/1

Epoch 00001: val_acc improved from -inf to 0.69072, saving model to pre-deploy_models/imdbNonEager_bestModel1.h5
Train on 25000 samples, validate on 25000 samples
Epoch 1/1

Epoch 00001: val_acc improved from -inf to 0.72212, saving model to p

# Simple LSTM

In [25]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

tbCallBack = keras.callbacks.TensorBoard(log_dir='/media/eigenstir/1TBSecondary/tbgraphs', histogram_freq=0, write_graph=True, write_images=True)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
lrs = LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch))
mc = ModelCheckpoint('pre-deploy_models/imdbNonEager_LSTMbestModel.h5', 
                     monitor='val_acc', mode='max', 
                     verbose=1, 
                     save_best_only=True)

In [26]:
BATCH_SIZE = 64
LSTM_UNITS = 256
DENSE_UNITS = LSTM_UNITS * 4

def build_model(embedding_matrix):
    words = Input(shape=(max_length,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, 
                           return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(DENSE_UNITS, activation='relu')(x)
    result = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs=words, outputs=result)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [27]:
model.fit(
        x_train,
        y_train,
        validation_data = (x_test, y_test),
        batch_size=BATCH_SIZE,
        epochs=30,
        callbacks=[tbCallBack, es, lrs, mc])
model.save_weights('pre-deploy_models/imdbNonEager_LSTM.h5')

Train on 25000 samples, validate on 25000 samples
Epoch 1/30

Epoch 00001: val_acc improved from -inf to 0.76660, saving model to pre-deploy_models/imdbNonEager_LSTMbestModel.h5
Epoch 2/30

Epoch 00002: val_acc improved from 0.76660 to 0.76800, saving model to pre-deploy_models/imdbNonEager_LSTMbestModel.h5
Epoch 3/30

Epoch 00003: val_acc did not improve from 0.76800
Epoch 4/30

Epoch 00004: val_acc did not improve from 0.76800
Epoch 5/30

Epoch 00005: val_acc did not improve from 0.76800
Epoch 6/30

Epoch 00006: val_acc did not improve from 0.76800
Epoch 7/30

Epoch 00007: val_acc did not improve from 0.76800
Epoch 8/30

Epoch 00008: val_acc did not improve from 0.76800
Epoch 00008: early stopping


In [None]:
# model.save_weights('pre-deploy_models/imdbNonEager.h5')

# With Pulled Data

In [4]:
train_df = load_dataset("data/aclImdb/train/")
test_df = load_dataset("data/aclImdb/test/")

In [5]:
li = []

for filename in os.listdir('data/mycollection'):
    df = pd.read_csv('data/mycollection/{}'.format(filename), sep='\t')
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

In [11]:
from sklearn.model_selection import train_test_split

x_train_frame, x_test_frame, y_train_frame, y_test_frame = train_test_split(frame['1'], frame['target'], test_size=0.1, random_state=42)

In [30]:
x_train = train_df['sentence']
y_train = train_df['polarity']
x_test = test_df['sentence']
y_test = test_df['polarity']

In [31]:
len(x_train)

25000

In [32]:
x_train = pd.concat([x_train,x_train_frame])
y_train = pd.concat([y_train,y_train_frame])
x_test = pd.concat([x_test,x_test_frame])
y_test = pd.concat([y_test, y_test_frame])

In [34]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))

with open('tokenizer_imdb_merged.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
max_length = max([len(s.split()) for s in (x_train + x_test)])
vocab_size = len(tokenizer.word_index) + 1

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)

## Simple LSTM

In [40]:
BATCH_SIZE = 64
LSTM_UNITS = 256
DENSE_UNITS = LSTM_UNITS * 4

def build_model(embedding_matrix):
    words = Input(shape=(max_length,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, 
                           return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(DENSE_UNITS, activation='relu')(x)
    result = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs=words, outputs=result)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [41]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

tbCallBack = keras.callbacks.TensorBoard(log_dir='/media/eigenstir/1TBSecondary/tbgraphs', histogram_freq=0, write_graph=True, write_images=True)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
lrs = LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch))
mc = ModelCheckpoint('pre-deploy_models/imdbNonEager_merged_LSTMbestModel.h5', 
                     monitor='val_acc', mode='max', 
                     verbose=1, 
                     save_best_only=True)

In [None]:
EPOCHS=30
model = build_model(embedding_matrix)

for global_epoch in range(EPOCHS):
    model.fit(
            x_train,
            y_train,
            validation_data = (x_test, y_test),
            batch_size=BATCH_SIZE,
            epochs=1,
            callbacks=[tbCallBack, es, lrs, mc])
model.save_weights('pre-deploy_models/imdbNonEager_merged_LSTM.h5')

Train on 25145 samples, validate on 25017 samples
Epoch 1/1

Epoch 00001: val_acc improved from -inf to 0.70380, saving model to pre-deploy_models/imdbNonEager_merged_LSTMbestModel.h5
Train on 25145 samples, validate on 25017 samples
Epoch 1/1

Epoch 00001: val_acc improved from 0.70380 to 0.75033, saving model to pre-deploy_models/imdbNonEager_merged_LSTMbestModel.h5
Train on 25145 samples, validate on 25017 samples
Epoch 1/1

Epoch 00001: val_acc improved from 0.75033 to 0.76136, saving model to pre-deploy_models/imdbNonEager_merged_LSTMbestModel.h5
Train on 25145 samples, validate on 25017 samples
Epoch 1/1

In [None]:
# for model_idx in range(NUM_MODELS):
#     model = build_model(embedding_matrix, y_aux_train.shape[-1])
#     for global_epoch in range(EPOCHS):
#         model.fit(
#             x_train,
#             [y_train, y_aux_train],
#             batch_size=BATCH_SIZE,
#             epochs=1,
#             callbacks=[
#                 LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch))
#             ]
#         )