In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
import pandas as pd
import re
import keras.layers as layers

from collections import Counter
from collections import defaultdict
from keras import backend as K
from keras.callbacks import TensorBoard
from keras.layers import Input, Embedding, BatchNormalization, LSTM, Dense, Concatenate
from keras.models import Model

from keras.preprocessing.text import Tokenizer,  text_to_word_sequence

from keras.utils import plot_model
import os

import html
from functools import reduce

Using TensorFlow backend.


In [2]:
# Marker tokens used when building the per-document text prefix.
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

# Working directories, listed in the order in which they are created.
DATA_PATH = 'data/'                # root data directory
INPUT_PATH = 'data/kaggle-imdb/'   # source input files path
LM_PATH = 'data/model/lm'          # path containing tuned models for specific data

# Create every directory up front; already-existing directories are left alone.
for _directory in (DATA_PATH, INPUT_PATH, LM_PATH):
    os.makedirs(_directory, exist_ok=True)

In [3]:
# Vocabulary-size cap.
# NOTE(review): no visible cell reads this global -- buildVocabulary applies
# its own limit of 60000 -- confirm whether this value is still needed.
max_vocab=200000

In [4]:
def buildVocabulary(texts, max_vocab=60000, min_freq=2):
    """Build the index <-> token lookup tables for a corpus.

    Args:
        texts: list of document strings; tokenised with ``getTokens``.
        max_vocab: number of distinct corpus tokens considered, taken in
            order of decreasing frequency.
        min_freq: a token is kept only if it occurs strictly more than
            ``min_freq`` times.

    Returns:
        ``(itos, stoi)``. ``itos`` is the list of tokens with
        ``itos[0] == '_pad_'`` and ``itos[1] == '_unk_'`` followed by the
        retained corpus tokens. ``stoi`` is a ``defaultdict`` mapping
        token -> index that yields the ``'_unk_'`` index for any token that
        is not in the vocabulary.
    """
    freq = Counter(getTokens(texts))

    itos = ['_pad_', '_unk_']
    itos.extend(token for token, count in freq.most_common(max_vocab) if count > min_freq)

    # Out-of-vocabulary tokens must resolve to '_unk_'. Falling back to 0
    # would alias them to '_pad_', the index reserved for padding.
    unk_index = itos.index('_unk_')
    stoi = defaultdict(lambda: unk_index, {token: index for index, token in enumerate(itos)})
    return (itos, stoi)

In [5]:
# The three splits are read as header-less, tab-separated files under INPUT_PATH.
_tsv_options = dict(sep='\t', header=None)
df_trn = pd.read_csv(INPUT_PATH + '/train.tsv', **_tsv_options)
df_val = pd.read_csv(INPUT_PATH + '/validation.tsv', **_tsv_options)
df_test = pd.read_csv(INPUT_PATH + '/test.tsv', **_tsv_options)

In [6]:
# df_trn = pd.read_csv(INPUT_PATH + '/train.csv',  header=None)
# df_val = pd.read_csv(INPUT_PATH + '/validation.csv',  header=None)
# df_test = pd.read_csv(INPUT_PATH + '/test_submit.csv', header=None)

In [7]:
chunksize = 24000
re1 = re.compile(r'  +')  # a run of two or more spaces

# (old, new) substitutions performed by fixup, applied strictly in this order.
_FIXUP_SUBSTITUTIONS = (
    ('#39;', "'"),
    ('amp;', '&'),
    ('#146;', "'"),
    ('nbsp;', ' '),
    ('#36;', '$'),
    ('\\n', "\n"),
    ('quot;', "'"),
    ('<br />', "\n"),
    ('\\"', '"'),
    ('<unk>', 'u_n'),
    (' @.@ ', '.'),
    (' @-@ ', '-'),
    ('\\', ' \\ '),
)


def fixup(x):
    """Normalise one raw text string.

    Applies the literal substitutions in ``_FIXUP_SUBSTITUTIONS`` in order,
    unescapes HTML entities, collapses every run of two or more spaces into
    a single space and lower-cases the result.

    Args:
        x: the text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    for old, new in _FIXUP_SUBSTITUTIONS:
        x = x.replace(old, new)
    unescaped = html.unescape(x)
    collapsed = re1.sub(' ', unescaped)
    return collapsed.lower()


def get_texts_and_labels(df, n_lbls=1):
    """Split a frame into cleaned texts and integer labels.

    Args:
        df: frame whose first ``n_lbls`` columns (by position) hold the
            labels and whose column labelled ``n_lbls`` holds the text.
        n_lbls: number of leading label columns.

    Returns:
        ``(texts, labels)``: ``texts`` is a list of strings, each prefixed
        with ``'\\n{BOS} {FLD} 1 '`` and passed through ``fixup``;
        ``labels`` is a list with one int64 array of length ``n_lbls`` per row.
    """
    prefix = f'\n{BOS} {FLD} 1 '
    label_columns = df.iloc[:, range(n_lbls)]
    labels = label_columns.values.astype(np.int64)
    raw_texts = df[n_lbls].astype(str)
    texts = [fixup(text) for text in prefix + raw_texts]
    return texts, list(labels)

def get_texts(df, colNo):
    """Return the cleaned texts of one column.

    Args:
        df: source frame.
        colNo: label of the column that holds the text.

    Returns:
        A list of strings, each prefixed with ``'\\n{BOS} {FLD} 1 '`` and
        passed through ``fixup``.
    """
    prefix = f'\n{BOS} {FLD} 1 '
    column = df[colNo].astype(str)
    return [fixup(text) for text in prefix + column]

def getTokens(texts):
    """Tokenise a list of strings as a single token stream.

    The strings are joined with single spaces and the joined text is split
    into tokens by ``text_to_word_sequence``.
    """
    joined_text = " ".join(texts)
    return text_to_word_sequence(joined_text)

In [8]:
# Clean the train / validation texts and pull out their labels; the test
# frame contributes only its text column (column 1).
train_texts, train_labels = get_texts_and_labels(df_trn)
val_texts, val_labels = get_texts_and_labels(df_val)
test_texts = get_texts(df_test, 1)

# One vocabulary built over the texts of all three splits.
itos, stoi = buildVocabulary(train_texts + val_texts + test_texts)

In [9]:
MAX_WORDS = 100
def getPaddedTokens(texts):
    """Convert texts into a fixed-width matrix of vocabulary ids.

    Each text is tokenised with ``getTokens``, truncated to ``MAX_WORDS``
    tokens and looked up in ``vocabulary``. Shorter rows are right-padded
    with 0, the index of ``'_pad_'`` and the value that
    ``Embedding(mask_zero=True)`` masks.

    Args:
        texts: sized iterable of strings.

    Returns:
        An int64 array of shape ``(len(texts), MAX_WORDS)``.
    """
    # Start from an all-padding integer matrix, so rows with no tokens stay
    # integer-typed.
    padded = np.zeros((len(texts), MAX_WORDS), dtype=np.int64)
    for row, text in enumerate(texts):
        token_ids = [vocabulary[token] for token in getTokens([text])[:MAX_WORDS]]
        if token_ids:
            padded[row, :len(token_ids)] = token_ids
    return padded

In [10]:
# Expose the token -> index mapping under the name that getPaddedTokens and
# the Embedding layer's input_dim read.
vocabulary = stoi

In [11]:
# Reduce TensorFlow logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Instantiate the elmo model
# (loaded from TF-Hub with trainable=False).
elmo_module = hub.Module("https://tfhub.dev/google/elmo/1", trainable=False)

# Initialize session
# and register it as the session used by the Keras backend.
sess = tf.Session()
K.set_session(sess)

# NOTE(review): this pins the Keras learning phase to 1 for everything built
# afterwards; presumably dropout / BatchNormalization then stay in training
# mode during validation and predict_generator as well -- confirm intended.
K.set_learning_phase(1)

# Run the global-variable and lookup-table initializers in this session.
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [12]:
# mini-batch generator
def generateELMOText(shuffled_data, batch_num, batch_size, data_size):
    """Build both model inputs for mini-batch ``batch_num``.

    Args:
        shuffled_data: sliceable sequence of sentence strings.
        batch_num: zero-based index of the batch.
        batch_size: nominal number of sentences per batch.
        data_size: total number of sentences; caps the last batch.

    Returns:
        ``(X_voc, X_elmo)``: ``X_voc`` is whatever ``getPaddedTokens``
        returns for the batch; ``X_elmo`` is an array with one string per
        sentence, holding exactly ``MAX_WORDS`` space-separated words
        (truncated, or right-padded with the literal word ``"NaN"``).
    """
    first = batch_num * batch_size
    last = min(first + batch_size, data_size)
    batch = shuffled_data[first:last]

    X_voc = getPaddedTokens(batch)

    padded_sentences = []
    for sentence in batch:
        words = sentence.split()[:MAX_WORDS]
        # Pad with filler words so that every sentence has MAX_WORDS words.
        words.extend(["NaN"] * (MAX_WORDS - len(words)))
        padded_sentences.append(" ".join(words))
    return (X_voc, np.array(padded_sentences))

def generateELMOLabels(shuffled_labels, batch_num, batch_size, data_size):
    """Return the labels that belong to mini-batch ``batch_num``.

    The slice covers ``batch_size`` items starting at
    ``batch_num * batch_size`` and never extends past ``data_size``.
    """
    first = batch_num * batch_size
    last = min(first + batch_size, data_size)
    return shuffled_labels[first:last]

        
def batch_iter(data, labels, batch_size, shuffle=True):
    """Create an endless mini-batch generator over texts and labels.

    Args:
        data: sequence of sentence strings (list, array or pandas Series).
        labels: sequence of labels, aligned with ``data`` by position.
        batch_size: number of samples per batch; the last batch of an epoch
            may be smaller.
        shuffle: when true, a fresh random permutation is drawn at the start
            of every epoch.

    Returns:
        ``(num_batches_per_epoch, generator)``. The generator yields
        ``([X_voc, X_elmo], y)`` and restarts after each epoch. For empty
        input the batch count is 0 and the generator yields nothing.

    Raises:
        ValueError: if ``data`` and ``labels`` differ in length.
    """
    # Plain arrays make all indexing below positional. Indexing a pandas
    # Series with an integer array is a label lookup, which only matches
    # positions when the index happens to be 0..n-1.
    data = np.asarray(data)
    labels = np.asarray(labels)
    data_size = len(data)
    if len(labels) != data_size:
        raise ValueError(
            'data and labels must have the same length, got %d and %d'
            % (data_size, len(labels)))

    # Ceiling division; evaluates to 0 for empty input.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size

    def data_generator():
        # With zero batches the loop is skipped instead of spinning forever.
        while num_batches_per_epoch:
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(data_size)
                shuffled_data = data[shuffle_indices]
                shuffled_labels = labels[shuffle_indices]
            else:
                shuffled_data = data
                shuffled_labels = labels

            for batch_num in range(num_batches_per_epoch):
                (X_voc, X_elmo) = generateELMOText(shuffled_data, batch_num, batch_size, data_size)
                X = [X_voc, X_elmo]
                y = generateELMOLabels(shuffled_labels, batch_num, batch_size, data_size)
                yield X, y
    return num_batches_per_epoch, data_generator()

In [13]:
# mini-batch generator
def batch_test_iter(data, batch_size, shuffle=True):
    """Create an endless mini-batch generator over unlabeled texts.

    Args:
        data: sequence of sentence strings (list, array or pandas Series).
        batch_size: number of samples per batch; the last batch of an epoch
            may be smaller.
        shuffle: when true, a fresh random permutation is drawn at the start
            of every epoch. Pass ``False`` when the yielded batches must
            stay aligned with the row order of ``data``.

    Returns:
        ``(num_batches_per_epoch, generator)``. The generator yields
        ``[X_voc, X_elmo]`` and restarts after each epoch. For empty input
        the batch count is 0 and the generator yields nothing.
    """
    # Plain arrays make all indexing below positional. Indexing a pandas
    # Series with an integer array is a label lookup, which only matches
    # positions when the index happens to be 0..n-1.
    data = np.asarray(data)
    data_size = len(data)

    # Ceiling division; evaluates to 0 for empty input.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size

    def data_test_generator():
        # With zero batches the loop is skipped instead of spinning forever.
        while num_batches_per_epoch:
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(data_size)
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data

            for batch_num in range(num_batches_per_epoch):
                (X_voc, X_elmo) = generateELMOText(shuffled_data, batch_num, batch_size, data_size)
                X = [X_voc, X_elmo]
                yield X
    return num_batches_per_epoch, data_test_generator()

In [14]:
def make_elmo_embedding(x):
    """Embed a batch of sentences with the TF-Hub ELMo module.

    Args:
        x: string tensor from the ELMo input layer; assumed to arrive as
            (batch, 1), one space-joined sentence per row -- TODO confirm.

    Returns:
        The ``"elmo"`` entry of the module's ``"default"`` signature output.
    """
    # Squeeze axis 1 only. A bare tf.squeeze drops every size-1 axis, so a
    # batch holding a single sentence would lose its batch dimension too.
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
    embeddings = elmo_module(sentences, signature="default", as_dict=True)["elmo"]
    return embeddings

In [15]:
# elmo embedding dimension
elmo_dim = 1024

# Input Layers
word_input = Input(shape=(None, ), dtype='int32')  # (batch_size, sent_length) vocabulary ids
elmo_input = Input(shape=(None, ), dtype=tf.string)  # string sentences, embedded by make_elmo_embedding

# Hidden Layers
# Trainable 128-d word embedding; id 0 is treated as padding (mask_zero=True).
word_embedding = Embedding(input_dim=len(vocabulary), output_dim=128, mask_zero=True)(word_input)
# ELMo vectors from the TF-Hub module, wrapped in a Lambda layer.
elmo_embedding = layers.Lambda(make_elmo_embedding, output_shape=(None, elmo_dim))(elmo_input)
# Join both per-token representations, normalise, then encode with an LSTM.
word_embedding = Concatenate()([word_embedding, elmo_embedding])
word_embedding = BatchNormalization()(word_embedding)
x = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(word_embedding)

# Output Layer
# Single sigmoid unit for binary classification.
predict = Dense(units=1, activation='sigmoid')(x)


model = Model(inputs=[word_input, elmo_input], outputs=predict)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 128)    7206400     input_1[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, None, 1024)   0           input_2[0][0]                    
__________________________________________________________________________________________________
concatenat

In [16]:
# Mini-batch size used by the training, validation and test generators.
batch_size = 32
# Column 1 is passed as the text and column 0 as the label; shuffling is off.
# NOTE(review): these are the raw frame columns, not the fixup-cleaned
# train_texts / val_texts the vocabulary was built from -- confirm intended.
train_steps, train_batches = batch_iter(df_trn[1], df_trn[0], batch_size,shuffle=False)
val_steps, val_batches = batch_iter(df_val[1], df_val[0], batch_size,shuffle=False)

In [17]:
# Display the number of training mini-batches per epoch.
train_steps

563

In [None]:
# TensorBoard callback writing its logs to ./log (histograms disabled).
logfile_path = './log'
tb_cb = TensorBoard(log_dir=logfile_path, histogram_freq=0)

# Train for 5 epochs from the endless generators; the step counts tell Keras
# how many batches make up one training / validation pass.
history = model.fit_generator(train_batches, train_steps,
                              epochs=5, 
                              validation_data=val_batches,
                              validation_steps=val_steps,
                              callbacks=[tb_cb])

In [19]:
# Predict on the test texts (column 1). shuffle=False keeps the batches in
# the row order of df_test.
test_steps, test_batches = batch_test_iter(df_test[1], batch_size, shuffle=False)
preds = model.predict_generator(test_batches, steps=test_steps)

In [20]:
def generateSubmission(df, predictValues, csvFile, threshold=0.5):
    """Write a two-column (``id``, ``label``) submission CSV.

    Args:
        df: frame whose column ``0`` holds the ids, one row per prediction.
        predictValues: predicted scores, one per row of ``df``; any
            array-like of shape ``(n,)`` or ``(n, 1)``.
        csvFile: path of the CSV file to write (no index column).
        threshold: a score strictly greater than this becomes label 1,
            anything else becomes label 0.

    Raises:
        ValueError: if the number of scores differs from the number of rows.
    """
    # Flatten so that (n,) and (n, 1) inputs are handled the same way.
    scores = np.asarray(predictValues).reshape(-1)
    ids = np.asarray(df[0])
    if len(scores) != len(ids):
        raise ValueError(
            'expected %d predictions, got %d' % (len(ids), len(scores)))

    result_df = pd.DataFrame(
        {'id': ids, 'label': np.where(scores > threshold, 1, 0)},
        columns=['id', 'label'])
    result_df.to_csv(csvFile, index=False)

In [22]:
# Threshold the test predictions and write them to submission3.csv.
generateSubmission(df_test, preds, 'submission3.csv')

In [None]:
# Save a diagram of the model to elmo.png.
plot_model(model, to_file='elmo.png')