In [1]:
import numpy

from keras.layers import InputSpec, Layer, Input, Dense, merge
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
from keras.layers import Bidirectional, GRU, LSTM
from keras.layers.noise import GaussianNoise
from keras.layers.advanced_activations import ELU
import keras.backend as K
from keras.models import Sequential, Model, model_from_json
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import Merge


def build_model(vectors, shape, settings):
    '''Compile the model.'''
    max_length, nr_hidden, nr_class = shape
    # Declare inputs.
    ids1 = Input(shape=(max_length,), dtype='int32', name='words1')
    ids2 = Input(shape=(max_length,), dtype='int32', name='words2')

    # Construct operations, which we'll chain together.
    embed = _StaticEmbedding(vectors, max_length, nr_hidden, dropout=0.2, nr_tune=5000)
    if settings['gru_encode']:
        encode = _BiRNNEncoding(max_length, nr_hidden, dropout=settings['dropout'])
    attend = _Attention(max_length, nr_hidden, dropout=settings['dropout'])
    align = _SoftAlignment(max_length, nr_hidden)
    compare = _Comparison(max_length, nr_hidden, dropout=settings['dropout'])
    entail = _Entailment(nr_hidden, nr_class, dropout=settings['dropout'])

    # Declare the model as a computational graph.
    sent1 = embed(ids1) # Shape: (i, n)
    sent2 = embed(ids2) # Shape: (j, n)

    if settings['gru_encode']:
        sent1 = encode(sent1)
        sent2 = encode(sent2)

    attention = attend(sent1, sent2)  # Shape: (i, j)

    align1 = align(sent2, attention)
    align2 = align(sent1, attention, transpose=True)

    feats1 = compare(sent1, align1)
    feats2 = compare(sent2, align2)

    scores = entail(feats1, feats2)

    # Now that we have the input/output, we can construct the Model object...
    model = Model(input=[ids1, ids2], output=[scores])

    # ...Compile it...
    model.compile(
        optimizer=Adam(lr=settings['lr']),
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    # ...And return it for training.
    return model


class _StaticEmbedding(object):
    def __init__(self, vectors, max_length, nr_out, nr_tune=1000, dropout=0.0):
        self.nr_out = nr_out
        self.max_length = max_length
        self.embed = Embedding(
                        vectors.shape[0],
                        vectors.shape[1],
                        input_length=max_length,
                        weights=[vectors],
                        name='embed',
                        trainable=False)
        self.tune = Embedding(
                        nr_tune,
                        nr_out,
                        input_length=max_length,
                        weights=None,
                        name='tune',
                        trainable=True,
                        dropout=dropout)
        self.mod_ids = Lambda(lambda sent: sent % (nr_tune-1)+1,
                              output_shape=(self.max_length,))

        self.project = TimeDistributed(
                            Dense(
                                nr_out,
                                activation=None,
                                bias=False,
                                name='project'))

    def __call__(self, sentence):
        def get_output_shape(shapes):
            print(shapes)
            return shapes[0]
        mod_sent = self.mod_ids(sentence)
        tuning = self.tune(mod_sent)
        #tuning = merge([tuning, mod_sent],
        #    mode=lambda AB: AB[0] * (K.clip(K.cast(AB[1], 'float32'), 0, 1)),
        #    output_shape=(self.max_length, self.nr_out))
        pretrained = self.project(self.embed(sentence))
        vectors = merge([pretrained, tuning], mode='sum')
        return vectors


class _BiRNNEncoding(object):
    def __init__(self, max_length, nr_out, dropout=0.0):
        self.model = Sequential()
        self.model.add(Bidirectional(LSTM(nr_out, return_sequences=True,
                                         dropout_W=dropout, dropout_U=dropout),
                                         input_shape=(max_length, nr_out)))
        self.model.add(TimeDistributed(Dense(nr_out, activation='relu', init='he_normal')))
        self.model.add(TimeDistributed(Dropout(0.2)))

    def __call__(self, sentence):
        return self.model(sentence)


class _Attention(object):
    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=0.0, activation='relu'):
        self.max_length = max_length
        self.model = Sequential()
        self.model.add(Dropout(dropout, input_shape=(nr_hidden,)))
        self.model.add(
            Dense(nr_hidden, name='attend1',
                init='he_normal', W_regularizer=l2(L2),
                input_shape=(nr_hidden,), activation='relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='attend2',
            init='he_normal', W_regularizer=l2(L2), activation='relu'))
        self.model = TimeDistributed(self.model)

    def __call__(self, sent1, sent2):
        def _outer(AB):
            att_ji = K.batch_dot(AB[1], K.permute_dimensions(AB[0], (0, 2, 1)))
            return K.permute_dimensions(att_ji,(0, 2, 1))
        return merge(
                [self.model(sent1), self.model(sent2)],
                mode=_outer,
                output_shape=(self.max_length, self.max_length))


class _SoftAlignment(object):
    def __init__(self, max_length, nr_hidden):
        self.max_length = max_length
        self.nr_hidden = nr_hidden

    def __call__(self, sentence, attention, transpose=False):
        def _normalize_attention(attmat):
            att = attmat[0]
            mat = attmat[1]
            if transpose:
                att = K.permute_dimensions(att,(0, 2, 1))
            # 3d softmax
            e = K.exp(att - K.max(att, axis=-1, keepdims=True))
            s = K.sum(e, axis=-1, keepdims=True)
            sm_att = e / s
            return K.batch_dot(sm_att, mat)
        return merge([attention, sentence], mode=_normalize_attention,
                      output_shape=(self.max_length, self.nr_hidden)) # Shape: (i, n)


class _Comparison(object):
    def __init__(self, words, nr_hidden, L2=0.0, dropout=0.0):
        self.words = words
        self.model = Sequential()
        self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
        self.model.add(Dense(nr_hidden, name='compare1',
            init='he_normal', W_regularizer=l2(L2)))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='compare2',
                        W_regularizer=l2(L2), init='he_normal'))
        self.model.add(Activation('relu'))
        self.model = TimeDistributed(self.model)

    def __call__(self, sent, align, **kwargs):
        result = self.model(merge([sent, align], mode='concat')) # Shape: (i, n)
#         avged = GlobalAveragePooling1D()(result, mask=self.words)
#         maxed = GlobalMaxPooling1D()(result, mask=self.words)
        avged = GlobalAveragePooling1D()(result)
        maxed = GlobalMaxPooling1D()(result)
        merged = merge([avged, maxed])
        result = BatchNormalization()(merged)
        return result


class _Entailment(object):
    def __init__(self, nr_hidden, nr_out, dropout=0.0, L2=0.0):
        self.model = Sequential()
        self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
        self.model.add(Dense(nr_hidden, name='entail1',
            init='he_normal', W_regularizer=l2(L2)))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='entail2',
            init='he_normal', W_regularizer=l2(L2)))
        self.model.add(Activation('relu'))
        self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
                        W_regularizer=l2(L2), init='zero'))

    def __call__(self, feats1, feats2):
        features = merge([feats1, feats2], mode='concat')
        return self.model(features)


class _GlobalSumPooling1D(Layer):
    '''Global sum pooling operation for temporal data.
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    '''
    def __init__(self, **kwargs):
        super(_GlobalSumPooling1D, self).__init__(**kwargs)
        self.input_spec = [InputSpec(ndim=3)]

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[2])

    def call(self, x, mask=None):
        if mask is not None:
            return K.sum(x * K.clip(mask, 0, 1), axis=1)
        else:
            return K.sum(x, axis=1)


def test_build_model():
    vectors = numpy.ndarray((100, 8), dtype='float32')
    shape = (10, 16, 3)
    settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True}
    model = build_model(vectors, shape, settings)


def test_fit_model():

    def _generate_X(nr_example, length, nr_vector):
        X1 = numpy.ndarray((nr_example, length), dtype='int32')
        X1 *= X1 < nr_vector
        X1 *= 0 <= X1
        X2 = numpy.ndarray((nr_example, length), dtype='int32')
        X2 *= X2 < nr_vector
        X2 *= 0 <= X2
        return [X1, X2]

    def _generate_Y(nr_example, nr_class):
        ys = numpy.zeros((nr_example, nr_class), dtype='int32')
        for i in range(nr_example):
            ys[i, i % nr_class] = 1
        return ys

    vectors = numpy.ndarray((100, 8), dtype='float32')
    shape = (10, 16, 3)
    settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True}
    model = build_model(vectors, shape, settings)

    train_X = _generate_X(20, shape[0], vectors.shape[0])
    train_Y = _generate_Y(20, shape[2])
    dev_X = _generate_X(15, shape[0], vectors.shape[0])
    dev_Y = _generate_Y(15, shape[2])
    print(train_X.shape)
    print(train_Y.shape)
    model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5,
              batch_size=4)


__all__ = [build_model]

Using TensorFlow backend.


In [2]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
import six.moves.cPickle
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
########################################
## set directories and parameters
########################################
BASE_DIR = './'
EMBEDDING_FILE = '/root/qa/data/GoogleNews-vectors-negative300.bin'

MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1


########################################
## index word vectors
########################################
print('Indexing word vectors')

word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
        binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

########################################
## process texts in datasets
########################################
print('Processing text dataset')


# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()
    #     print( 'text  1 ',text)
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    # text = text.lower().split()
    # Return a list of words
    return (text)
 
############prepare  training/validation data  /only use mme
train_processed_query_dataset_path = '/root/qa/data/faq_query_pairs609.csv'
train_processed_query_df = pd.read_csv(train_processed_query_dataset_path)
print("processed train data set shape",train_processed_query_df.shape)
train_texts_mme_1 = list(train_processed_query_df.loc[:,'query'].map(lambda x: text_to_wordlist(x,remove_stopwords=False, stem_words=True) ))
train_texts_mme_2 = list(train_processed_query_df.loc[:,'faq'].map(lambda x: text_to_wordlist(x,remove_stopwords=False, stem_words=True) ))
train_labels_mme   = list(train_processed_query_df.loc[:,'match'])
###########add the pair of (query1,query1)  sentence as positive samples
new_positive = list(set(train_texts_mme_1))+ list(set(train_texts_mme_2))
print('the number of set of faq and query as positive ',len(new_positive))
train_texts_mme_add_1 = train_texts_mme_1 + new_positive
train_texts_mme_add_2 = train_texts_mme_2 + new_positive
train_labels_mme_add  = train_labels_mme  + [1]* len(new_positive)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_texts_mme_1 + train_texts_mme_2 )
tokenizer_fname = 'tokenizer1'
six.moves.cPickle.dump(tokenizer, open(os.path.join('./result/', tokenizer_fname), "wb"))
# tokenizer = six.moves.cPickle.load(open(os.path.join('./model/', tokenizer_fname), 'rb'))
train_sequences_mme_add_1 = tokenizer.texts_to_sequences(train_texts_mme_add_1)
train_sequences_mme_add_2 = tokenizer.texts_to_sequences(train_texts_mme_add_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))
train_data_mme_add_1 = pad_sequences(train_sequences_mme_add_1, maxlen=MAX_SEQUENCE_LENGTH)
train_data_mme_add_2 = pad_sequences(train_sequences_mme_add_2, maxlen=MAX_SEQUENCE_LENGTH)
train_labels_mme_add = np.array(train_labels_mme_add)
print('Shape of train_data tensor:', train_data_mme_add_1.shape)
print('Shape of train_label tensor:', train_labels_mme_add.shape)


########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

# tokenizer = six.moves.cPickle.load(open(os.path.join('./model/', tokenizer_fname), 'rb'))
# word_index = tokenizer.word_index
nb_words = min(MAX_NB_WORDS, len(word_index))+1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
embedding_matrix_path = './result/embedding_matrix.npy'
np.save(embedding_matrix_path,embedding_matrix)

# ######################################
# sample train/validation data
# #######################################
np.random.seed(1234)
perm_mme_add = np.random.permutation(len(train_data_mme_add_1))
idx_train_mme_add = perm_mme_add[:int(len(train_data_mme_add_1)*(1-VALIDATION_SPLIT))]
idx_val_mme_add =  perm_mme_add[int(len(train_data_mme_add_1)*(1-VALIDATION_SPLIT)):]
print('idx_train len', len(idx_train_mme_add))
print('idx_val  len',len(idx_val_mme_add))

data_train_mme_add_1  = np.vstack((train_data_mme_add_1[idx_train_mme_add], train_data_mme_add_2[idx_train_mme_add]))
data_train_mme_add_2  = np.vstack((train_data_mme_add_2[idx_train_mme_add], train_data_mme_add_1[idx_train_mme_add]))
labels_train_mme_add  = np.concatenate((train_labels_mme_add[idx_train_mme_add], train_labels_mme_add[idx_train_mme_add]))



print("data_train_mme_add_1 lenth", data_train_mme_add_1.shape[0])
data_val_mme_add_1  = np.vstack((train_data_mme_add_1[idx_val_mme_add], train_data_mme_add_2[idx_val_mme_add]))
data_val_mme_add_2  = np.vstack((train_data_mme_add_2[idx_val_mme_add], train_data_mme_add_1[idx_val_mme_add]))
labels_val_mme_add  = np.concatenate((train_labels_mme_add[idx_val_mme_add], train_labels_mme_add[idx_val_mme_add]))
print("labels_val_mme_add length", labels_val_mme_add.shape[0])
weight_val = np.ones(len(labels_val_mme_add))


Indexing word vectors
Found 3000000 word vectors of word2vec
Processing text dataset
processed train data set shape (154686, 5)
the number of set of faq and query as positive  861
Found 634 unique tokens
Shape of train_data tensor: (155547, 30)
Shape of train_label tensor: (155547,)
Preparing embedding matrix
Null word embeddings: 128
idx_train len 139992
idx_val  len 15555
data_train_mme_add_1 lenth 279984
labels_val_mme_add length 31110


In [3]:
vectors = embedding_matrix
shape = (MAX_SEQUENCE_LENGTH, 126, 2)
# max_length, nr_hidden, nr_class = shape
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True}
model = build_model(vectors, shape, settings)
model.compile(loss='categorical_crossentropy',
              optimizer='nadam',
              metrics=['acc'])
STAMP_MME='decomposible_attention'
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
bst_model_path = STAMP_MME + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)
from keras.utils.np_utils import to_categorical
hist = model.fit([data_train_mme_add_1, data_train_mme_add_2],  to_categorical(labels_train_mme_add), \
                 validation_data=([data_val_mme_add_1, data_val_mme_add_2], to_categorical(labels_val_mme_add) ), \
                 epochs=200, batch_size=2048, shuffle=True, \
                callbacks=[early_stopping, model_checkpoint])
# train_X = [data_train_mme_add_1, data_train_mme_add_2]
# train_Y = to_categorical(labels_train_mme_add)
# dev_X = [data_val_mme_add_1, data_val_mme_add_2]
# dev_Y = to_categorical(labels_val_mme_add)

# model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5,
#               batch_size=4)
TEST_DATA_FILE2 = '/root/qa/data/faq_query_pairs74.csv'
pre_processed_query_df = pd.read_csv(TEST_DATA_FILE2)
pre_texts_1 = list(pre_processed_query_df.loc[:,'query'].map(lambda x: text_to_wordlist(x,remove_stopwords=True, stem_words=True) ))
pre_texts_2 = list(pre_processed_query_df.loc[:,'faq'].map(lambda x: text_to_wordlist(x,remove_stopwords=True, stem_words=True) ))

# print(test_texts_1)
pre_sequences_1 = tokenizer.texts_to_sequences(pre_texts_1)
pre_sequences_2 = tokenizer.texts_to_sequences(pre_texts_2)
print(pre_sequences_1[0])


pre_data_1 = pad_sequences(pre_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
pre_data_2 = pad_sequences(pre_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', pre_data_1.shape)
############compute socre for every pair of faq and query
import time
t1 = time.time()
preds1 = model.predict([pre_data_1, pre_data_2], batch_size=8192, verbose=1)
preds2= model.predict([pre_data_2, pre_data_1], batch_size=8192, verbose=1)


  name=name)


Train on 279984 samples, validate on 31110 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
[33, 13, 52, 58, 73]
Shape of data tensor: (18796, 30)


In [4]:
print(preds1[1:5,:])

[[  8.96519065e-01   1.03480943e-01]
 [  8.95811260e-01   1.04188658e-01]
 [  9.99914646e-01   8.53987731e-05]
 [  9.99911904e-01   8.81194646e-05]]


In [10]:
preds =preds1[:,1]
pre_processed_query_df.loc[:,'score'] =preds
pre_processed_query_df.to_csv("./result/result_query_df_train_mme600.csv")

anser_index = pre_processed_query_df.groupby('query').apply(lambda subf: subf.loc[:,'score'].argmax())
anser_index2 = list(anser_index)
currectnum = pre_processed_query_df.loc[list(anser_index),'match'].sum()
t2 = time.time()
print(list(anser_index))
print('currectnum------------------------',currectnum)
print(pre_processed_query_df.loc[list(anser_index),:])
print("time of prediction for each query",(t2-t1)/len(anser_index2))

[16829, 11112, 16541, 250, 10518, 17020, 749, 15651, 13145, 14474, 13119, 8558, 326, 12922, 13170, 9913, 7488, 13122, 13120, 10519, 926, 1261, 1595, 16560, 7394, 1421, 13148, 1095, 348, 345, 13360, 10940, 11383, 18782, 10499, 701, 16520, 8933, 17022, 1263, 7396, 17422, 6467, 13658, 6102, 249, 615, 6111, 4473, 13587, 12893, 12690, 13340, 16212, 16214, 347, 131, 17021, 13124, 1262, 10932, 18786, 7685, 7393, 7392, 6359, 10977, 352, 3607, 3713, 7621, 16521, 16213, 13558]
currectnum------------------------ 21.0
       Unnamed: 0  faq_id                                                faq  \
16829       16829   228.0  Is it possible to start using the WBSe again i...   
11112       11112   151.0                MME Guide for Commercial Directors    
16541       16541   224.0  Error saving forecast - An error occurred in S...   
250           250     4.0  How do I manage which resources receive the Su...   
10518       10518   143.0                   Profit and Cost Center Activity    
17020   