In [1]:
"""Sanity check for whether integrating LDA topic features into the model works.

To keep the check easy to interpret, this notebook starts from the simplest model.
"""
import os
import pickle
from collections import Counter

import numpy as np
import gensim
from gensim.corpora import Dictionary
# NOTE: `utils` is deliberately imported before keras, as in the original cell,
# in case it configures the Keras backend as an import side effect.
from utils import lda_helper, data_helper, model_helper

import keras
from keras.layers import (BatchNormalization, Bidirectional, Dense, Dropout,
                          GRU, Input, LSTM)
from keras.models import Model

from sklearn.metrics import classification_report

# Initial run configuration: start from the project's base configuration and
# override MAX_SENTS to 10 for this experiment.
# (The BatchNormalization import that used to trail this block now lives with
# the other keras.layers imports at the top of the cell.)
config = data_helper.BaseConfiguration()
config.MAX_SENTS = 10

Using TensorFlow backend.
  'If you are using Attention model, plz use Theano as the backend of keras. Because the dot function of Tensorflow backend does not work.')


In [2]:
print('preparing dataset')
# ---- Prepare dataset --------------------------------------------------------
dataset_path = './preprocessed_data/dataset.pkl'
# SECURITY NOTE: pickle.load can execute arbitrary code; only load files that
# were produced by this project's own preprocessing step.
# The context manager guarantees the file handle is closed after loading.
with open(dataset_path, 'rb') as dataset_file:
    data_whole = pickle.load(dataset_file)

labels = data_whole['labels']
codes = data_whole['codes']

processed_dataset = data_whole['dataset_processed']
processed_contexts = data_whole['context_processed']
processed_labels = data_whole['labels_preprocessed']
code_targets = data_whole['codes_preprocessed']
train_indices = data_whole['train_indices']
valid_indices = data_whole['valid_indices']
test_indices = data_whole['test_indices']
keras_tokenizer = data_whole['keras_tokenizer']
labels_tokenizer = data_whole['labels_tokenizer']

# Kept for reference — presumably how the cached LDA index arrays loaded below
# were originally produced (TODO confirm before deleting):
# lda_tokenizer = Dictionary.load('./preprocessed_data/lda/lda_dict.pkl')
# lda_model = lda_helper.load_lda('./preprocessed_data/lda/lda.model')
# ctt_lda_idx = lda_helper.doc2idx(lda_tokenizer, data_whole['dataset_raw'], config.seq_max_len)
# ctx_lda_idx = lda_helper.doc2idx(lda_tokenizer, data_whole['context_raw'], config.seq_max_len)
# convert doc_raw to bags of words
# doc_topics_ctt = lda_helper.doc2topics(lda_tokenizer, lda_model, data_whole['dataset_raw'])
ctt_lda_idx = data_whole['ctt_lda_idx']
ctx_lda_idx = data_whole['ctx_lda_idx']


def split_by_indices(array):
    """Return the (train, valid, test) selections of `array`.

    The selections are taken with the `train_indices`, `valid_indices` and
    `test_indices` loaded from the dataset pickle above, so every input and
    target is partitioned the same way.
    """
    return array[train_indices], array[valid_indices], array[test_indices]


# Model inputs.
x_train, x_valid, x_test = split_by_indices(processed_dataset)
ctt_lda_train, ctt_lda_valid, ctt_lda_test = split_by_indices(ctt_lda_idx)
ctx_train, ctx_valid, ctx_test = split_by_indices(processed_contexts)
ctx_lda_train, ctx_lda_valid, ctx_lda_test = split_by_indices(ctx_lda_idx)
prev10_train, prev10_valid, prev10_test = split_by_indices(processed_labels)

# Targets.
code_train, code_valid, code_test = split_by_indices(code_targets)

print('Train Length: {}'.format(len(train_indices)))
print('Valid Length: {}'.format(len(valid_indices)))
print('Test Length: {}'.format(len(test_indices)))
print('Overall MI Code Distribution: {}'.format(Counter(codes)))
print('Training MI Code Distribution: {}'.format(Counter(codes[train_indices])))
print('Validation MI Code Distribution: {}'.format(Counter(codes[valid_indices])))
print('Testing MI Code Distribution: {}'.format(Counter(codes[test_indices])))

preparing dataset
Train Length: 17952
Valid Length: 2244
Test Length: 2244
Overall MI Code Distribution: Counter({0: 10739, 1: 7796, -1: 3906})
Training MI Code Distribution: Counter({0: 8575, 1: 6246, -1: 3131})
Validation MI Code Distribution: Counter({0: 1069, 1: 787, -1: 388})
Testing MI Code Distribution: Counter({0: 1095, 1: 762, -1: 387})


In [3]:
# ---- Construct embedding layers ---------------------------------------------
# Each weight matrix is cached as a .npy file under ./weights so the expensive
# model loading only happens on the first run.
print('Initialize weights')

WEIGHTS_DIR = './weights'
W2V_WEIGHTS_PATH = './weights/w2v.npy'
LDA_WEIGHTS_PATH = './weights/lda.npy'
CODE_WEIGHTS_PATH = './weights/code_prevs.npy'

LDA_DIM = 20          # second argument of lda_helper.init_weight — presumably the number of topics (TODO confirm)
CODE_EMBED_DIM = 50   # width of the label2vec vectors stored per label
PREV_CODE_LEN = 10    # sequence length passed to the previous-code embedding

# np.save does not create missing directories, so make sure the cache dir exists.
if not os.path.isdir(WEIGHTS_DIR):
    os.makedirs(WEIGHTS_DIR)

# Word embeddings (shared initial weights for both text inputs).
if not os.path.exists(W2V_WEIGHTS_PATH):
    # load pretrained embeddings
    w2v_path = './preprocessed_data/w2v_corpus/google.bin'
    if 'glove' in w2v_path:
        w2v_model = data_helper.load_glove(w2v_path)
    else:
        w2v_model = data_helper.load_word2vec(w2v_path)
    config.embedding_size = len(w2v_model['the'])
    # initialize weights
    embd_weights = model_helper.init_weights(w2v_model, keras_tokenizer, config.embedding_size)
    np.save(W2V_WEIGHTS_PATH, embd_weights)
else:
    embd_weights = np.load(W2V_WEIGHTS_PATH)
    # Keep the config consistent with the cached matrix; previously
    # embedding_size was only updated on a cache miss.
    config.embedding_size = embd_weights.shape[1]

# LDA topic-word weights.
if not os.path.exists(LDA_WEIGHTS_PATH):
    lda_weights = lda_helper.init_weight('./preprocessed_data/lda/lda.model', LDA_DIM)
    np.save(LDA_WEIGHTS_PATH, lda_weights)
else:
    lda_weights = np.load(LDA_WEIGHTS_PATH)

# Embeddings for the previous MI codes (label2vec).
if not os.path.exists(CODE_WEIGHTS_PATH):
    label2vec_model = gensim.models.Word2Vec.load('./preprocessed_data/w2v_corpus/w2v_codes_50.txt')
    code_weights = np.zeros((len(labels_tokenizer.word_index) + 1, CODE_EMBED_DIM))
    for label_tmp, i in labels_tokenizer.word_index.items():
        # labels missing from the label2vec model keep their all-zero row
        if label_tmp in label2vec_model.wv:
            code_weights[i] = label2vec_model.wv.get_vector(label_tmp)
    np.save(CODE_WEIGHTS_PATH, code_weights)
else:
    code_weights = np.load(CODE_WEIGHTS_PATH)

ctt_embedding = model_helper.build_embedding(embd_weights, config.seq_max_len, name='ctt_embed')
# initial weights for ctx embeddings
ctx_embedding = model_helper.build_embedding(embd_weights, config.seq_max_len, name='ctx_embed')

# The cached/loaded `lda_weights` from above are reused here; the original cell
# re-ran lda_helper.init_weight unconditionally, which defeated the cache.
lda_ctt_embed = lda_helper.build_embedding(lda_weights, config.seq_max_len, name='ctt_lda_embed')
lda_ctx_embed = lda_helper.build_embedding(lda_weights, config.seq_max_len, name='ctx_lda_embed')

# BUG FIX: the code embedding is indexed by labels_tokenizer ids, so it must be
# initialised from `code_weights` (label vectors), not from the word-embedding
# matrix `embd_weights`.
code_embedding = model_helper.build_embedding(code_weights, PREV_CODE_LEN, name='code_embed')

In [None]:
# ---- Hyper-parameters -------------------------------------------------------
NUM_CLASSES = 3
# NOTE(review): `lstm_num` and `dp_rate` were referenced but never defined in
# this notebook (NameError on a fresh kernel). The values below mirror the
# 100-unit / 0.2-dropout settings of the text branches — confirm against the
# intended experiment setup.
lstm_num = 100
dp_rate = 0.2


def bi_gru(units, name):
    """Return a named bidirectional GRU encoder with this notebook's shared settings.

    Args:
        units: number of GRU units per direction.
        name: Keras layer name given to the Bidirectional wrapper.
    """
    # NOTE(review): recurrent_activation='tanh' is kept from the original cell;
    # it differs from the Keras default gate activation — confirm it is intended.
    return Bidirectional(
        GRU(units, recurrent_dropout=0.1, kernel_initializer="glorot_uniform",
            recurrent_activation='tanh'),
        name=name)


# ---- CTT branch -------------------------------------------------------------
# word-level encoder for ctt
ctt_input = Input(shape=(config.seq_max_len,), dtype='int32', name='ctt_input')
ctt_embed = ctt_embedding(ctt_input)
ctt_lstm = bi_gru(100, 'ctt_bilstm')(ctt_embed)
ctt_dropout = Dropout(0.2)(ctt_lstm)

# topic-word (LDA) encoder for ctt
lda_input_ctt = Input(shape=(config.seq_max_len,), dtype='int32', name='ctt_lda_input')
lda_embed_ctt = lda_ctt_embed(lda_input_ctt)
lda_embed_norm_ctt = BatchNormalization()(lda_embed_ctt)
lda_lstm_ctt = bi_gru(20, 'lda_bilstm_ctt')(lda_embed_norm_ctt)
lda_dropout_ctt = Dropout(0.2)(lda_lstm_ctt)

merged_vector_ctt = keras.layers.concatenate([ctt_dropout, lda_dropout_ctt], axis=-1)
merged_dense_ctt = Dense(100, activation='relu',
                         kernel_initializer="glorot_uniform", name='merged_dense_ctt')(merged_vector_ctt)

# ---- CTX branch -------------------------------------------------------------
# word-level encoder for ctx
ctx_input = Input(shape=(config.seq_max_len,), dtype='int32', name='ctx_input')
ctx_embed = ctx_embedding(ctx_input)
ctx_lstm = bi_gru(100, 'ctx_bilstm')(ctx_embed)
ctx_dropout = Dropout(0.2)(ctx_lstm)

# topic-word (LDA) encoder for ctx
lda_input_ctx = Input(shape=(config.seq_max_len,), dtype='int32', name='ctx_lda_input')
lda_embed_ctx = lda_ctx_embed(lda_input_ctx)
lda_embed_norm_ctx = BatchNormalization()(lda_embed_ctx)
lda_lstm_ctx = bi_gru(20, 'lda_bilstm_ctx')(lda_embed_norm_ctx)
lda_dropout_ctx = Dropout(0.2)(lda_lstm_ctx)

merged_vector_ctx = keras.layers.concatenate([ctx_dropout, lda_dropout_ctx], axis=-1)
merged_dense_ctx = Dense(100, activation='relu',
                         kernel_initializer="glorot_uniform", name='merged_dense_ctx')(merged_vector_ctx)

# ---- Previous-codes branch --------------------------------------------------
code_input = Input(shape=(10,), dtype='int32', name='code10_input')
code_embed = code_embedding(code_input)
code_lstm = Bidirectional(LSTM(lstm_num, dropout=dp_rate), name='code_lstm')(code_embed)

# ---- Merge CTT, CTX and codes -----------------------------------------------
final_merged = keras.layers.concatenate([merged_dense_ctx, merged_dense_ctt, code_lstm], axis=-1)
last_drop = Dropout(0.1)(final_merged)
# softmax (not sigmoid): the classes are mutually exclusive and the loss is
# categorical_crossentropy, which expects a probability distribution.
predictions = Dense(NUM_CLASSES, activation='softmax', name='final_output')(last_drop)

test_model = Model(inputs=[ctt_input, lda_input_ctt, ctx_input, lda_input_ctx, code_input], outputs=predictions)
# official document shows RMSprop is a better choice for recurrent neural network
test_model.compile(loss='categorical_crossentropy', optimizer='rmsprop',
                   metrics=['accuracy'])
# summary() prints by itself and returns None, so it must not be wrapped in print().
test_model.summary()

# Keras expects class_weight to be a {class_index: weight} dict; the string
# 'auto' is not a supported value. Build "balanced" weights explicitly:
# n_samples / (n_classes * class_count).
# Assumes code_train is one-hot encoded with NUM_CLASSES columns (it is
# consumed via argmax and categorical_crossentropy elsewhere in this cell).
train_class_ids = np.argmax(code_train, axis=1)
train_class_counts = np.bincount(train_class_ids, minlength=NUM_CLASSES)
class_weight = {
    class_id: len(train_class_ids) / float(NUM_CLASSES * max(int(count), 1))
    for class_id, count in enumerate(train_class_counts)
}

hist = test_model.fit([x_train, ctt_lda_train, ctx_train, ctx_lda_train, prev10_train], code_train,
                      epochs=10,
                      batch_size=64,
                      validation_data=([x_valid, ctt_lda_valid, ctx_valid, ctx_lda_valid, prev10_valid], code_valid),
                      class_weight=class_weight)

# ---- Evaluate on the held-out test split ------------------------------------
y_pred = test_model.predict([x_test, ctt_lda_test, ctx_test, ctx_lda_test, prev10_test])
y_true_ids = np.argmax(code_test, axis=1)
y_pred_ids = np.argmax(y_pred, axis=1)
# classification_report takes (y_true, y_pred); the original call passed them
# in the opposite order, which swaps precision and recall in the report.
report = classification_report(y_true_ids, y_pred_ids)
print(report)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
ctx_lda_input (InputLayer)       (None, 100)           0                                            
____________________________________________________________________________________________________
ctt_lda_input (InputLayer)       (None, 100)           0                                            
____________________________________________________________________________________________________
ctx_input (InputLayer)           (None, 100)           0                                            
____________________________________________________________________________________________________
ctx_lda_embed (Embedding)        (None, 100, 20)       167400      ctx_lda_input[0][0]              
___________________________________________________________________________________________