In [1]:
import pickle
from gensim.models import word2vec
import random
import re
import numpy as np

In [2]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D, Lambda, Add, Dot
from keras.models import Model
from keras import backend as K
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [3]:
def readFile(fname):
    """Load and return the object pickled in the file ``fname``.

    Args:
        fname: Path of a file that contains one pickled object.

    Returns:
        The object produced by ``pickle.load``.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Whatever ``pickle.load`` raises for malformed content
            (e.g. ``pickle.UnpicklingError`` or ``EOFError``).
    """
    # SECURITY: unpickling can execute arbitrary code; only use this on
    # files that come from a trusted source.
    # The context manager closes the handle even when pickle.load() raises.
    with open(fname, "rb") as f:
        return pickle.load(f)

In [4]:
def avgLength(q_doc, p_doc, n_doc):
    """Print the mean item length of three parallel collections.

    For every index of ``q_doc`` the lengths of ``q_doc[i]``, ``p_doc[i]``
    and ``n_doc[i]`` are accumulated; each total is then divided by
    ``len(q_doc)`` and printed on its own line (question, p, n order).

    Args:
        q_doc: Indexable collection of sized items; its length drives the loop
            and is the divisor for all three averages.
        p_doc: Indexable collection with at least ``len(q_doc)`` sized items.
        n_doc: Indexable collection with at least ``len(q_doc)`` sized items.

    Returns:
        None. The three averages are only printed.

    Raises:
        ZeroDivisionError: If ``q_doc`` is empty.
    """
    count = len(q_doc)
    totals = [0, 0, 0]

    for idx in range(count):
        for slot, docs in enumerate((q_doc, p_doc, n_doc)):
            totals[slot] += len(docs[idx])

    for total in totals:
        print(total / count)


In [5]:
def tokenize(text):
    """Return a generator over every maximal run of ASCII letters in ``text``.

    Digits, punctuation, whitespace and non-ASCII characters act as
    separators and are never part of a token. Case is preserved.
    """
    # The regex scan is set up immediately (so a non-string argument fails
    # right here); the matches themselves are consumed lazily.
    matches = re.finditer('[a-zA-Z]+', text)
    return (found.group(0) for found in matches)

def makeTrainFeatuers(trec_train, vec_model, word_index):
    """Turn every training example into three lists of word indices.

    Each element ``e`` of ``trec_train`` is read as ``e[0]`` (a text) and
    ``e[1]``, a mapping whose ``'p'`` and ``'n'`` entries are indexable
    collections of texts (presumably positive and negative answers -- confirm
    against the data file). One text is drawn at random from each of the two
    collections, all three texts are tokenized, and every token found in
    ``vec_model.wv.vocab`` is replaced by ``word_index[token]``; other tokens
    are dropped.

    Args:
        trec_train: Iterable of examples as described above.
        vec_model: Object exposing ``wv.vocab`` (membership test on tokens).
        word_index: Mapping from token to integer index.

    Returns:
        Tuple ``(q_docs, p_docs, n_docs)`` of lists, one index list per example.

    Raises:
        ValueError: If a ``'p'`` or ``'n'`` collection is empty (from ``randrange``).
        KeyError: If a token is in the vocabulary but missing from ``word_index``.
    """

    def pick_one(candidates):
        # One randrange() draw per candidate list, 'p' before 'n'.
        return candidates[random.randrange(len(candidates))]

    def encode(tokens):
        # Keep only in-vocabulary tokens and map them to their indices.
        return [word_index[tok] for tok in tokens if tok in vec_model.wv.vocab]

    q_docs, p_docs, n_docs = [], [], []

    for example in trec_train:
        question_tokens = list(tokenize(example[0]))
        p_tokens = list(tokenize(pick_one(example[1]['p'])))
        n_tokens = list(tokenize(pick_one(example[1]['n'])))

        q_docs.append(encode(question_tokens))
        p_docs.append(encode(p_tokens))
        n_docs.append(encode(n_tokens))

    return q_docs, p_docs, n_docs


In [6]:
def pad(q_docs, p_docs, n_docs, max_length=40):
    """Pad (or truncate) three collections of index sequences to a fixed length.

    Args:
        q_docs: Sequences of word indices for the questions.
        p_docs: Sequences of word indices for the 'p' texts.
        n_docs: Sequences of word indices for the 'n' texts.
        max_length: Length of every output row. Defaults to 40, the value
            that was previously hard-coded; it must match the input length
            the model is built with.

    Returns:
        Tuple ``(q_pad, p_pad, n_pad)`` with the results of ``pad_sequences``
        for the three inputs, in that order.
    """
    # padding='post' appends the fill value after the real tokens.
    # NOTE(review): `truncating` is left at the Keras default, which per the
    # Keras docs is 'pre' (over-long sequences lose their beginning) -- confirm
    # this is the intended behavior.
    q_pad, p_pad, n_pad = (
        pad_sequences(docs, maxlen=max_length, padding='post')
        for docs in (q_docs, p_docs, n_docs)
    )

    return q_pad, p_pad, n_pad

In [7]:
def buildModel(vec_model, word_to_index=None, max_len=40, embedding_dim=250, margin=0.1):
    """Build the three-input (question, 'p' text, 'n' text) Bi-LSTM ranking model.

    All three inputs share one embedding layer (initialised from the word2vec
    vectors) and one bidirectional LSTM. Each encoded sequence is max-pooled
    over time, then the cosine similarity between the question vector and each
    of the two other vectors is computed.

    Args:
        vec_model: Trained word2vec model; ``wv.vocab`` gives the vocabulary
            size and ``wv[word]`` the vector of a word.
        word_to_index: Mapping word -> row index in the embedding matrix.
            When omitted, the notebook-level ``word_index`` is used, which is
            what this function always did before the parameter existed.
        max_len: Length of every input sequence; must equal the padding
            length used for the data (default 40).
        embedding_dim: Dimension of the word2vec vectors (default 250).
        margin: Hinge margin added to ``cos(q, n) - cos(q, p)`` (default 0.1).

    Returns:
        A Keras ``Model`` with inputs ``[q, p, n]`` and outputs
        ``[hinge, cosine_p, cosine_n]``.

    Raises:
        NameError: If ``word_to_index`` is omitted and no global ``word_index`` exists.
        KeyError: If a word of the mapping has no vector in ``vec_model.wv``.
    """
    if word_to_index is None:
        # Backward-compatible fallback to the notebook global.
        word_to_index = word_index

    # One extra row beyond the vocabulary; rows never assigned below stay zero.
    # NOTE(review): presumably index 0 is reserved for padding -- verify that
    # the mapping's indices start at 1.
    vocab_size = len(vec_model.wv.vocab) + 1

    # The architecture mostly follows the answer-selection GitHub reference.
    # Input layer
    input_q = Input((max_len,))
    input_p = Input((max_len,))
    input_n = Input((max_len,))

    # Embedding layer, initialised with the trained word2vec vectors.
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_to_index.items():
        embedding_matrix[i] = vec_model.wv[word]
    embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len)

    emb_q = embedding_layer(input_q)
    emb_p = embedding_layer(input_p)
    emb_n = embedding_layer(input_n)

    # Bi-lstm layer
    shared_lstm = Bidirectional(
        LSTM(300, return_sequences=True)  # return_sequences: emit the hidden state of every time step
    )
    encode_q = shared_lstm(emb_q)
    encode_p = shared_lstm(emb_p)
    encode_n = shared_lstm(emb_n)

    # Max pooling layer (over the time axis)
    vec_q = GlobalMaxPooling1D()(encode_q)
    vec_p = GlobalMaxPooling1D()(encode_p)
    vec_n = GlobalMaxPooling1D()(encode_n)

    # Dot with normalize=True yields the cosine similarity of the two vectors.
    cosine_p = Dot(axes=1, normalize=True)([vec_q, vec_p])
    cosine_n = Dot(axes=1, normalize=True)([vec_q, vec_n])

    # From here on the loss follows the hinge loss of the reference paper:
    # max(0, margin + cos(q, n) - cos(q, p)).
    neg = Lambda(lambda x: -x)
    sub = Add()([cosine_n, neg(cosine_p)])

    def hinge(x, margin):
        # Zero once cos(q, p) exceeds cos(q, n) by at least `margin`; any
        # larger gap is treated as equally good.
        return K.maximum(K.zeros_like(x), margin + x)

    # The margin is passed through `arguments` so it is stored in the layer config.
    hinge1 = Lambda(hinge, arguments={'margin': margin})
    result = hinge1(sub)

    model = Model(inputs=[input_q, input_p, input_n], outputs=[result, cosine_p, cosine_n])

    return model

In [8]:
# Load the pickled training examples, the pickled word -> index dictionary
# and the trained word2vec model.
# NOTE(review): readFile unpickles its input; only load trusted files.
trec_train = readFile("trec_train.pkl")
word_index = readFile("word_index_dic.pkl")
vec_model = word2vec.Word2Vec.load("word2vec.model")

# Convert every example to word-index lists, then pad them to a fixed length.
# NOTE(review): makeTrainFeatuers draws with `random` and no seed is set in
# this notebook, so the sampled 'p'/'n' texts differ between runs.
q_docs, p_docs, n_docs = makeTrainFeatuers(trec_train, vec_model, word_index)
q_pad, p_pad, n_pad = pad(q_docs, p_docs, n_docs)

In [9]:
# Print the mean number of kept tokens per question, 'p' text and 'n' text
# (computed on the unpadded index lists).
avgLength(q_docs, p_docs, n_docs)

3.8098106712564546
16.195352839931154
13.333046471600689


In [10]:
# Build the three-output network and compile it.
model = buildModel(vec_model)
model.compile(
    # loss_weights=[1, 0, 0]: only the first output (the hinge value) drives
    # training; weights for the two cosine outputs may be added in the future.
    # NOTE(review): with the all-zero target label1, 'mean_squared_error'
    # minimises the squared hinge value -- confirm a squared rather than a
    # plain hinge loss is intended.
    optimizer='adam', loss='mean_squared_error', loss_weights=[1, 0, 0]
)

In [11]:
# Print the layer-by-layer overview (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 40, 250)      3898000     input_1[0][0]                    
                                                                 input_2[0][0]                    
          

In [12]:
# Targets for the three model outputs, one row per training example:
#   label1 -> hinge output, target 0
#   label2 -> cosine of question and correct text, best value 1
#   label3 -> cosine of question and wrong text, best value -1
label1, label2, label3 = (
    np.full((len(q_pad), 1), target) for target in (0.0, 1.0, -1.0)
)

In [13]:
# Checkpoint callback: after each epoch, write the model to `filepath` only
# when the monitored training loss reached a new minimum.
# save_weights_only=False stores the full model, not just the weights.
filepath = "weights_best_2.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, save_weights_only=False, mode='min')
callbacks_list = [checkpoint]

In [14]:
# Train for 10 epochs with batches of 256; inputs and targets are passed in
# the same order as the model's inputs and outputs.
model.fit([q_pad, p_pad, n_pad], [label1, label2, label3], callbacks=callbacks_list, epochs=10, batch_size=256, verbose=1)

Epoch 1/10

Epoch 00001: loss improved from inf to 0.00690, saving model to weights_best_2.hdf5
Epoch 2/10

Epoch 00002: loss improved from 0.00690 to 0.00442, saving model to weights_best_2.hdf5
Epoch 3/10

Epoch 00003: loss improved from 0.00442 to 0.00283, saving model to weights_best_2.hdf5
Epoch 4/10

Epoch 00004: loss improved from 0.00283 to 0.00154, saving model to weights_best_2.hdf5
Epoch 5/10

Epoch 00005: loss improved from 0.00154 to 0.00071, saving model to weights_best_2.hdf5
Epoch 6/10

Epoch 00006: loss improved from 0.00071 to 0.00028, saving model to weights_best_2.hdf5
Epoch 7/10

Epoch 00007: loss improved from 0.00028 to 0.00011, saving model to weights_best_2.hdf5
Epoch 8/10

Epoch 00008: loss improved from 0.00011 to 0.00004, saving model to weights_best_2.hdf5
Epoch 9/10

Epoch 00009: loss improved from 0.00004 to 0.00002, saving model to weights_best_2.hdf5
Epoch 10/10

Epoch 00010: loss improved from 0.00002 to 0.00001, saving model to weights_best_2.hdf5


<keras.callbacks.History at 0x7fc570e57048>

In [15]:
# Load the best checkpoint first, then save the model.
# Only the architecture is written to the JSON file; the weights stay in
# the .hdf5 checkpoint.
model.load_weights(filepath="weights_best_2.hdf5" )
model_json = model.to_json()
with open("model_margin_0.1_epo_10_2.json", "w") as json_file:
    json_file.write(model_json)