In [1]:
# Install the required libraries first if they are missing (uncomment the line below).
# %pip install tensorflow keras pandas numpy

import os
import re
import sys
import argparse

import keras.callbacks
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Input, Dropout, MaxPooling1D, Conv1D, GlobalMaxPool1D
from keras.layers import LSTM, Lambda, concatenate, TimeDistributed, Bidirectional


In [2]:
maxlen = 512
max_sentences = 15
filter_length = [5, 3, 3]
nb_filter = [196, 196, 256]
pool_length = 2
char_embedding = 40
validation_split = 0.2

# Choose model variant
has_dense = True  # Set to False for the version without fully connected layers


In [3]:
class RecordLossHistory(keras.callbacks.Callback):
    def __init__(self):
        super(RecordLossHistory, self).__init__()
        self.accuracies = []
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))
        self.accuracies.append(logs.get('accuracy'))


In [4]:
def sentence_encoder_wo_dense(layer):
    for i in range(len(nb_filter)):
        layer = Conv1D(filters=nb_filter[i], kernel_size=filter_length[i], padding='valid', activation='relu',
                       kernel_initializer='glorot_normal', strides=1)(layer)
        layer = Dropout(0.1)(layer)
        layer = MaxPooling1D(pool_size=pool_length)(layer)

    bi_lstm_sent = Bidirectional(LSTM(128, return_sequences=False, dropout=0.1, recurrent_dropout=0.1))(layer)
    sentence_encode = Dropout(0.2)(bi_lstm_sent)
    encoder = Model(inputs=input_sentence, outputs=sentence_encode)
    encoder.summary()

    encoded = TimeDistributed(encoder)(document)
    bi_lstm_doc = Bidirectional(LSTM(128, return_sequences=False, dropout=0.1, recurrent_dropout=0.1))(encoded)
    output = Dropout(0.2)(bi_lstm_doc)
    output = Dense(128, activation='relu')(output)
    output = Dropout(0.2)(output)
    output = Dense(1, activation='sigmoid')(output)
    return output


In [5]:
def char_block(in_layer, filters, filter_len, subsample, pool_len):
    block = in_layer
    for i in range(len(filters)):
        block = Conv1D(filters=filters[i], kernel_size=filter_len[i], padding='valid', activation='tanh',
                       strides=subsample[i])(block)
        if pool_len[i]:
            block = MaxPooling1D(pool_size=pool_len[i])(block)
    block = GlobalMaxPool1D()(block)
    block = Dense(128, activation='relu')(block)
    return block


def sentence_encoder_with_dense(layer):
    block_2 = char_block(layer, filters=(128, 256), filter_len=(5, 5), subsample=(1, 1), pool_len=(2, 2))
    block_3 = char_block(layer, filters=(192, 320), filter_len=(7, 5), subsample=(1, 1), pool_len=(2, 2))

    sentence_encode = concatenate([block_2, block_3], axis=-1)
    encoder = Model(inputs=input_sentence, outputs=sentence_encode)
    encoder.summary()

    encoded = TimeDistributed(encoder)(document)

    lstm_h = 92
    lstm_layer1 = LSTM(lstm_h, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(encoded)
    lstm_layer2 = LSTM(lstm_h, return_sequences=False, dropout=0.1, recurrent_dropout=0.1)(lstm_layer1)

    output = Dense(1, activation='sigmoid')(lstm_layer2)
    return output


In [6]:
def binarize(x, sz=71):
    return tf.cast(tf.one_hot(x, sz, on_value=1, off_value=0, axis=-1), tf.float32)


def binarize_outshape(in_shape):
    return (in_shape[0], in_shape[1], 71)


def remove_html(str_a):
    p = re.compile(r'<.*?>')
    return p.sub('', str_a)


def replace_non_ascii(str_a):
    return re.sub(r'[^\x00-\x7f]', r'', str_a)


In [7]:
input_data = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

txt = ''
reviews = []
sentiments = []
num_sent = []

for rev, sentiment in zip(input_data.review, input_data.sentiment):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', replace_non_ascii(remove_html(rev)))
    sentences = [sent.lower() for sent in sentences]
    reviews.append(sentences)
    sentiments.append(sentiment)

for rev in reviews:
    num_sent.append(len(rev))
    for s in rev:
        txt += s

chars = set(txt)
max_features = len(chars) + 1
print('Total # of  chars in dataset:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))


Total # of  chars in dataset: 71


In [8]:
X = np.ones((len(reviews), max_sentences, maxlen), dtype=np.int64) * -1
y = np.array(sentiments)

for i, doc in enumerate(reviews):
    for j, sentence in enumerate(doc):
        if j < max_sentences:
            for t, char in enumerate(sentence[-maxlen:]):
                X[i, j, (maxlen - 1 - t)] = char_indices.get(char, 0)

# shuffle and split
ids = np.arange(len(X))
np.random.shuffle(ids)
X = X[ids]
y = y[ids]

nb_validation_samples = int(validation_split * X.shape[0])
X_train = X[:-nb_validation_samples]
y_train = y[:-nb_validation_samples]
X_val = X[-nb_validation_samples:]
y_val = y[-nb_validation_samples:]


In [9]:
document = Input(shape=(max_sentences, maxlen), dtype='int64')
input_sentence = Input(shape=(maxlen,), dtype='int64')

embedded_layer = Lambda(binarize, output_shape=binarize_outshape)(input_sentence)

if has_dense:
    print('running model with fully connected layers')
    model = Model(inputs=document, outputs=sentence_encoder_with_dense(embedded_layer))
else:
    print('running model without fully connected layers')
    model = Model(inputs=document, outputs=sentence_encoder_wo_dense(embedded_layer))

model.summary()


running model with fully connected layers


In [None]:
checkpoint_path = None  # You can set path if you have a checkpoint to resume from
if checkpoint_path and os.path.exists(checkpoint_path):
    print("Loading checkpoint:", checkpoint_path)
    model.load_weights(checkpoint_path)

file_name = "sentence_model"
ckpt_cb = keras.callbacks.ModelCheckpoint('checkpoints/' + file_name + '.{epoch:02d}-{val_loss:.2f}.keras',
                                          monitor='val_loss', save_best_only=True, mode='min')

earlystop_cb = keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, verbose=1)

loss_history = RecordLossHistory()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=100,
    epochs=10,
    shuffle=True,
    callbacks=[earlystop_cb, ckpt_cb, loss_history]
)


Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m646s[0m 3s/step - accuracy: 0.5444 - loss: 0.6771 - val_accuracy: 0.7946 - val_loss: 0.4523
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m588s[0m 3s/step - accuracy: 0.8043 - loss: 0.4240 - val_accuracy: 0.8380 - val_loss: 0.3820
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m594s[0m 3s/step - accuracy: 0.8804 - loss: 0.2906 - val_accuracy: 0.8328 - val_loss: 0.3847
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m593s[0m 3s/step - accuracy: 0.9195 - loss: 0.2073 - val_accuracy: 0.8570 - val_loss: 0.3421
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m592s[0m 3s/step - accuracy: 0.9467 - loss: 0.1422 - val_accuracy: 0.8648 - val_loss: 0.3521
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m613s[0m 3s/step - accuracy: 0.9575 - loss: 0.1200 - val_accuracy: 0.8660 - val_loss: 0.4631
Epoch 7/10
[1m200/200