In [1]:
import keras
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam

from keras.preprocessing.text import Tokenizer
import keras.preprocessing.sequence as S
from keras.utils import to_categorical
from keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
 
import json
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
 
# Sequence length used as `maxlen` when padding tokenized queries.
query_max_len = 5
# Sequence length used as `maxlen` when padding tokenized documents.
doc_max_len = 1600
# Mini-batch size passed to `model.fit`.
batch_size = 512
# Tokenizer `num_words` limit and input dimension of both Embedding layers.
vocab_size = 133165
# Initial learning rate handed to the Adam optimizer.
INIT_LR = 1e-2
# Number of training epochs; also the divisor for the optimizer's decay (INIT_LR / EPOCHS).
EPOCHS = 60

In [4]:
class SentimentLSTM:
    """Two-input (query, document) bidirectional-LSTM two-class classifier.

    The instance owns a Keras ``Tokenizer`` that is fitted while the corpus is
    loaded, and holds the compiled Keras model once ``train`` has run.
    """

    def __init__(self):
        self.tokenizer = Tokenizer(num_words=vocab_size)
        self.stop_words = []
        self.model = None

    @staticmethod
    def _space_separate(text):
        """Return ``text`` with a single space inserted between its characters.

        Missing CSV cells arrive from pandas as NaN (a float); they are mapped
        to the empty string instead of crashing ``str.join``. Non-string
        scalars (e.g. a cell pandas parsed as a number) are stringified first.
        """
        if pd.isna(text):
            return ""
        return " ".join(str(text))

    def load_cuted_corpus(self, dir, input):
        """Read the corpus CSV, shuffle its rows and fit the tokenizer.

        The file ``dir + '/' + input`` is read without a header row. Column 1
        is taken as the query text, column 2 as the document text and column 3
        as the integer label; column 0 is ignored.

        Every character of each text is separated by a space before it is
        handed to the tokenizer, so each character becomes its own
        whitespace-delimited token (presumably character-level tokenization --
        confirm against the corpus format).

        Returns:
            tuple: ``(query_texts, doc_texts, labels)`` as three parallel
            lists in the same shuffled row order.

        Raises:
            ValueError: if a label cell cannot be converted with ``int``.
        """
        data = pd.read_csv(dir + '/' + input, header=None)
        data = np.array(data)
        idxs = list(range(len(data)))
        # Shuffles with NumPy's global RNG; seed it beforehand for a reproducible split.
        np.random.shuffle(idxs)

        query_texts = []
        doc_texts = []
        labels = []
        for i in idxs:
            fields = data[i]
            rate = int(fields[3])
            query_texts.append(self._space_separate(fields[1]))
            doc_texts.append(self._space_separate(fields[2]))
            labels.append(rate)

        # One shared vocabulary is built over both queries and documents.
        self.tokenizer.fit_on_texts(query_texts)
        self.tokenizer.fit_on_texts(doc_texts)
        return query_texts, doc_texts, labels

    def load_data(self, train_size=55000):
        """Load ``corpus/data.csv`` and return padded train/test arrays.

        Args:
            train_size: number of (already shuffled) rows placed in the
                training split; every remaining row goes to the test split.

        Returns:
            tuple: ``((query_train, doc_train, y_train),
            (query_test, doc_test, y_test))`` where the labels are one-hot
            encoded over two classes.
        """
        x_1, x_2, y = self.load_cuted_corpus('corpus', 'data.csv')
        x_1 = self.tokenizer.texts_to_sequences(x_1)
        x_2 = self.tokenizer.texts_to_sequences(x_2)
        x_1 = S.pad_sequences(x_1, maxlen=query_max_len)
        x_2 = S.pad_sequences(x_2, maxlen=doc_max_len)
        y = to_categorical(y, num_classes=2)

        train_split = (x_1[:train_size], x_2[:train_size], y[:train_size])
        test_split = (x_1[train_size:], x_2[train_size:], y[train_size:])
        return train_split, test_split

    def train(self):
        """Build the model, fit it on the training split, evaluate and plot."""
        print('building model ...')
        self.model = SentimentLSTM.build_model()

        print('loading data ...')
        (query_train, doc_train, rate_train), (query_test, doc_test, rate_test) = self.load_data()

        print('training model ...')
        history = self.model.fit([query_train, doc_train], rate_train, batch_size=batch_size, epochs=EPOCHS)
        # Nothing is persisted here; call self.model.save_weights(path) if the
        # weights are meant to be reloaded through load_trained_model.
        score = self.model.evaluate([query_test, doc_test], rate_test)
        print(score)
        # draw_acc_loss returns None, so it is called directly rather than printed.
        SentimentLSTM.draw_acc_loss(history)

    def load_trained_model(self, path):
        """Build a fresh model and load weights from ``path`` into it.

        The model is returned to the caller; ``self.model`` is not modified.
        """
        model = SentimentLSTM.build_model()
        model.load_weights(path)
        return model

    @staticmethod
    def draw_acc_loss(history):
        """Plot per-epoch training accuracy and training loss from ``history``.

        ``history`` is the object returned by ``model.fit``. Both curves come
        from the training run itself (no validation data is passed to
        ``fit``), so both are labelled as training metrics.
        """
        acc = history.history['accuracy']
        loss = history.history['loss']
        epochs = range(1, len(acc) + 1)
        plt.title('Accuracy and Loss')
        plt.plot(epochs, acc, 'red', label='Training acc')
        plt.plot(epochs, loss, 'blue', label='Training loss')
        plt.legend()
        plt.show()

    @staticmethod
    def build_model():
        """Build and compile the two-branch query/document model.

        Each input is embedded separately and encoded by its own bidirectional
        LSTM; the two encodings are concatenated and passed through two dense
        layers before a 2-unit output layer.

        Returns:
            The compiled Keras ``Model`` taking ``[query, doc]`` as inputs.
        """
        query_input = keras.Input(shape=(None,), name="query")
        doc_input = keras.Input(shape=(None,), name="doc")
        query_features = Embedding(vocab_size, 32)(query_input)
        doc_features = Embedding(vocab_size, 64)(doc_input)
        query_features = Bidirectional(LSTM(64))(query_features)
        doc_features = Bidirectional(LSTM(128))(doc_features)
        merged = keras.layers.concatenate([query_features, doc_features])
        merged = Dense(128, activation='relu')(merged)
        merged = Dense(64, activation='relu')(merged)
        # Labels are one-hot over two mutually exclusive classes (see
        # load_data), so the output is a softmax trained with categorical
        # cross-entropy. Layer shapes are unchanged, so existing weight files
        # remain loadable.
        p = Dense(2, activation='softmax')(merged)

        model = Model([query_input, doc_input], p)
        model.compile(
            loss='categorical_crossentropy',
            # `learning_rate` replaces the deprecated `lr` alias.
            optimizer=Adam(learning_rate=INIT_LR, decay=INIT_LR / EPOCHS),
            metrics=['accuracy'],
        )
        # summary() prints the table itself and returns None.
        model.summary()

        return model


In [None]:
def main():
    """Create a SentimentLSTM and run its end-to-end training routine."""
    SentimentLSTM().train()


if __name__ == "__main__":
    main()

building model ...
Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
query (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
doc (InputLayer)                [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 32)     4261280     query[0][0]                      
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 64)     8522560     doc[0][0]                        
____________________________________________________________________

Epoch 55/60