In [25]:
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
import tensorflow as tf 
from keras.engine.topology import Layer
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization, Add
from keras.layers import Conv1D, MaxPooling1D, Embedding, Concatenate, AveragePooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam, RMSprop, SGD
from keras.models import Model
from keras import regularizers
import keras.backend as K
import numpy as np
from keras.datasets import imdb
import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Re-execute the local `preprocessing` module so that edits made to
# preprocessing.py after the kernel started are picked up without a restart.
import importlib
importlib.reload(preprocessing)

<module 'preprocessing' from 'D:\\jupyter-workdir\\nlp\\bg_fusion\\preprocessing.py'>

In [3]:
import os.path as osp
import pickle

# Build the vocabulary counter once and cache its `words_list` on disk;
# later runs reload the cached list instead of re-scanning the corpus.
words_list_cache_path = "E:/kaggle/avito/imdb_testset/aclImdb_v1/words_counter_list"

wordCounter = preprocessing.WordCounter()
if osp.isfile(words_list_cache_path):
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # that this notebook produced itself.
    with open(words_list_cache_path, "rb") as cache_file:
        wordCounter.words_list = pickle.load(cache_file)
else:
    wordCounter.fit(["E:/kaggle/avito/imdb_testset/aclImdb_v1/train_test_neg.txt", "E:/kaggle/avito/imdb_testset/aclImdb_v1/train_test_pos.txt"])
    # Use a context manager so the cache is flushed and closed before it is
    # ever read back (the bare open() handle was previously never closed).
    with open(words_list_cache_path, "wb") as cache_file:
        pickle.dump(wordCounter.words_list, cache_file)

In [4]:
if False:
    # Disabled experiment: derive pre-trained embeddings from a gensim
    # word2vec model, training it on first use and reloading it afterwards.
    import gensim
    word2vec_path = "E:/kaggle/avito/imdb_testset/gensim_models/imdb_word2vec"
    if osp.isfile(word2vec_path):
        model = gensim.models.Word2Vec.load(word2vec_path)
    else:
        sentences = gensim.models.word2vec.PathLineSentences("E:/kaggle/avito/imdb_testset/aclImdb_v1/train_test/")
        model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=1, workers=8)
        model.save(word2vec_path)
    emdedings = wordCounter.get_pretrain_embedding(model, num_words=10000)

In [5]:
# Report how many entries the (fitted or cached) vocabulary list holds.
print("words count", len(wordCounter.words_list))

words count 181924


In [6]:
def getdata(num_words=None, data_dir="E:/kaggle/avito/imdb_testset/aclImdb_v1", seed=0):
    """Load, label and shuffle the IMDB train/test splits.

    Each of the four text files (train/test x pos/neg) is converted by the
    module-level ``wordCounter.transform``. Positive samples are labelled 1.0,
    negative samples 0.0, and each split is shuffled with a seeded RNG so the
    result is reproducible.

    Args:
        num_words: forwarded to ``wordCounter.transform`` as ``max_words``.
        data_dir: directory containing ``train_pos.txt``, ``train_neg.txt``,
            ``test_pos.txt`` and ``test_neg.txt``.
        seed: seed of the ``np.random.RandomState`` used for both shuffles.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` where each ``X`` is a 1-D
        ``object`` array holding one transformed sample per element and each
        ``y`` is a float array of 0/1 labels in the same order.
    """
    def load_split(file_name):
        # Samples have different lengths, so build the object array slot by
        # slot: np.array() on a ragged list raises on NumPy >= 1.24.
        samples = list(wordCounter.transform(["%s/%s" % (data_dir, file_name)], max_words=num_words))
        packed = np.empty(len(samples), dtype=object)
        for slot, sample in enumerate(samples):
            packed[slot] = sample
        return packed

    def labelled_split(pos_file, neg_file):
        X_pos = load_split(pos_file)
        X_neg = load_split(neg_file)
        X = np.concatenate([X_pos, X_neg])
        y = np.concatenate([np.ones(X_pos.shape[0]), np.zeros(X_neg.shape[0])])
        return X, y

    state = np.random.RandomState(seed)
    X_train, y_train = labelled_split("train_pos.txt", "train_neg.txt")
    X_test, y_test = labelled_split("test_pos.txt", "test_neg.txt")
    # Draw the train permutation first, then the test permutation, so the
    # shuffles match the historical ordering for a given seed.
    train_permut = state.permutation(X_train.shape[0])
    test_permut = state.permutation(X_test.shape[0])
    return (X_train[train_permut], y_train[train_permut]), (X_test[test_permut], y_test[test_permut])

In [7]:
# Disabled alternative: load the Keras-packaged IMDB dataset instead of the local text files.
# (X_train, y_train), (X_test, y_test) = imdb.load_data(path="E:/paper/stackingmodel/imdb/imdb.npz", num_words=10000)
# Load the shuffled train/test splits; num_words is forwarded by getdata to wordCounter.transform as max_words.
(X_train, y_train), (X_test, y_test) = getdata(num_words=30000)

In [8]:
# Shortest sample length across both splits.
min(len(sample) for split in (X_train, X_test) for sample in split)

8

In [9]:
class SpatialPyramidPooling1D(Layer):
    """Pyramid pooling over the time axis of a ``(batch, steps, channels)`` tensor.

    For every entry ``n`` of ``pool_list`` the time axis is cut into ``n``
    contiguous regions and each region is reduced (max or mean) per channel.
    All pooled vectors are concatenated, so the output has the fixed shape
    ``(batch, sum(pool_list) * channels)`` regardless of the number of steps.

    Args:
        pool_list: 1-D sequence of region counts, e.g. ``[1, 2, 4]``.
        mode: ``"max"`` or ``"avg"``.
    """

    def __init__(self, pool_list, mode = "max", **kwargs):
        self.pool_list = np.array(pool_list)
        self.mode = mode
        assert self.pool_list.ndim == 1, "pool_list ndim must be 1"
        assert self.mode in ["max", "avg"], "mode must be either max or avg"
        # Plain Python int so output shapes stay free of NumPy scalar types.
        self.num_outputs = int(self.pool_list.sum())
        super(SpatialPyramidPooling1D, self).__init__(**kwargs)

    def build(self, input_shape):
        # Number of channels per timestep; needed for the static output shape.
        self.gram_size = input_shape[2]
        super(SpatialPyramidPooling1D, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.num_outputs * self.gram_size)

    def get_config(self):
        # Return a plain list (not an ndarray) so the config survives JSON
        # serialization and can be fed straight back into __init__.
        config = {'pool_list': self.pool_list.tolist(), "mode": self.mode}
        base_config = super(SpatialPyramidPooling1D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def call(self, x):
        # The step count is only known at run time, so use the dynamic shape.
        seq_len = K.shape(x)[1]
        seq_len_float = K.cast(seq_len, 'float32')
        reduce_fn = K.max if self.mode == "max" else K.mean
        outputs = []
        for num_pool_regions in self.pool_list:
            num_pool_regions = int(num_pool_regions)
            region_length = seq_len_float / num_pool_regions
            for ix in range(num_pool_regions):
                x1 = K.cast(K.round(ix * region_length), 'int32')
                x2 = K.cast(K.round(ix * region_length + region_length), 'int32')
                # Guard for inputs with fewer steps than regions: rounding can
                # then produce an empty slice. Clamp so every region covers at
                # least one timestep. This is a no-op when steps >= regions.
                x1 = K.minimum(x1, seq_len - 1)
                x2 = K.maximum(x2, x1 + 1)
                x_crop = x[:, x1:x2, :]
                outputs.append(reduce_fn(x_crop, axis=1))
        # Concatenate along the last axis: (batch, num_outputs * channels).
        return K.concatenate(outputs)

In [10]:
# Does not support dynamic input sizes: the step count must be statically known.
class KMaxPooling1D(Layer):
    """Pyramid pooling implemented with ``tf.nn.max_pool`` / ``tf.nn.avg_pool``.

    For every entry ``n`` of ``pool_list`` the input is pooled with a window
    and stride of ``round(steps / n)`` along the time axis, and the results
    are flattened and concatenated to ``(batch, sum(pool_list) * channels)``.

    Args:
        pool_list: 1-D sequence of region counts, e.g. ``[1, 2, 4]``.
        mode: ``"max"`` or ``"avg"``.

    Raises:
        ValueError: from ``call`` when the step count or channel count of the
            input is not statically known.
    """

    def __init__(self, pool_list, mode = "max", **kwargs):
        self.pool_list = np.array(pool_list)
        self.mode = mode
        assert self.pool_list.ndim == 1, "pool_list ndim must be 1"
        assert self.mode in ["max", "avg"], "mode must be either max or avg"
        # Plain Python int so output shapes stay free of NumPy scalar types.
        self.num_outputs = int(self.pool_list.sum())
        super(KMaxPooling1D, self).__init__(**kwargs)

    def build(self, input_shape):
        # Number of channels per timestep; needed for the static output shape.
        self.gram_size = input_shape[2]
        super(KMaxPooling1D, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.num_outputs * self.gram_size)

    def get_config(self):
        # Return a plain list (not an ndarray) so the config survives JSON
        # serialization and can be fed straight back into __init__.
        config = {'pool_list': self.pool_list.tolist(), "mode": self.mode}
        base_config = super(KMaxPooling1D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def call(self, x):
        input_shape_list = x.get_shape().as_list()
        seq_len, embedding_size = input_shape_list[1], input_shape_list[2]
        if seq_len is None or embedding_size is None:
            # Window sizes are computed in Python, so both must be static.
            raise ValueError("KMaxPooling1D requires a statically known input size, got shape %s" % (input_shape_list,))
        # Add a trailing unit axis so the 2-D pooling ops can be applied.
        x = tf.reshape(x, [-1, seq_len, embedding_size, 1])
        pool_fn = tf.nn.max_pool if self.mode == "max" else tf.nn.avg_pool
        outputs = []
        for num_pool_regions in self.pool_list:
            num_pool_regions = int(num_pool_regions)
            ph = int(np.round(float(seq_len) / num_pool_regions))
            sh = ph
            # NOTE(review): the window spans `embedding_size` along the channel
            # axis with stride 1 and SAME padding, so neighbouring channels are
            # pooled together; per-channel pooling would use a width of 1.
            # Left unchanged to preserve existing numerics - confirm intent.
            pool_result = pool_fn(x,
                                  ksize=[1, ph, embedding_size, 1],
                                  strides=[1, sh, 1, 1],
                                  padding='SAME')
            # NOTE(review): SAME padding yields ceil(steps / ph) windows, which
            # equals num_pool_regions only for suitable step counts; otherwise
            # this reshape changes the batch dimension. Verify against inputs.
            outputs.append(tf.reshape(pool_result, [-1, num_pool_regions * embedding_size]))
        outputs = K.concatenate(outputs, axis=1)
        return outputs

In [50]:
class SentenceGenerator(object):
    """Endless batch iterator pairing padded sample chunks with their labels.

    Samples are handed to ``preprocessing.AutoPaddingInMemorySamplePool``;
    labels are reordered with the pool's ``sorted_indices`` so that the index
    list the pool exposes after each chunk (``chunk_indices_list``) can be
    used directly to look up the matching labels.

    Args:
        X: samples passed to the pool.
        y: NumPy array of labels aligned with ``X``.
        batch_size: chunk size requested from the pool.
        bins_count: forwarded to the pool.
        mode: ``"train"`` selects the pool's ``"random"`` mode, anything else
            its ``"specific"`` mode. In ``"test"`` mode the label indices of
            every yielded batch are recorded for ``get_test_y``.
        onehot: when true, labels are one-hot encoded with ``OneHotEncoder``.
    """

    def __init__(self, X, y, batch_size = 125, bins_count=100, mode="train", onehot = False):
        self.batch_size = batch_size
        padding_mode = "random" if mode == "train" else "specific"
        self.pool = preprocessing.AutoPaddingInMemorySamplePool(X, chunk_size=batch_size, bins_count=bins_count, mode=padding_mode)
        self.y = y[self.pool.sorted_indices]
        self.mode = mode
        self.y_indices_record = []
        # Number of labels contributed by each recorded batch (test mode).
        self._batch_sizes = []
        self.onehot = onehot
        if self.onehot:
            onehot_encoder = OneHotEncoder()
            self.y = self.y.reshape((-1, 1))
            self.y = onehot_encoder.fit_transform(self.y).toarray()

    def __iter__(self):
        """Return self so the object satisfies the iterator protocol."""
        return self

    def iter(self):
        """Backward-compatible alias of ``__iter__``."""
        return self

    def reset(self):
        """Rewind the pool and forget the recorded test-label indices."""
        self.pool.reset()
        self.y_indices_record = []
        self._batch_sizes = []

    def __next__(self):
        """Return ``(batch_samples, batch_labels)`` for the pool's next chunk."""
        batch_samples = self.pool.__next__()
        # Read the index list once so the record and the labels always agree.
        chunk_indices = self.pool.chunk_indices_list
        if self.mode == "test":
            self.y_indices_record.extend(chunk_indices)
            self._batch_sizes.append(len(chunk_indices))
        return batch_samples, self.y[chunk_indices]

    def get_test_y(self, steps):
        """Return the labels of the first ``steps`` batches yielded since ``reset``.

        The actual size of each recorded batch is used instead of
        ``steps * batch_size``, so chunks shorter than ``batch_size`` do not
        pull in labels from batches that were prefetched beyond ``steps``.
        """
        label_count = sum(self._batch_sizes[:steps])
        return self.y[self.y_indices_record[:label_count]]
# Build one generator per data split; every split shares the one-hot label
# setting and a batch size of 64. The first 22000 training samples form the
# training subset and the remainder the validation subset.
onehot = True
SentGener_train_all = SentenceGenerator(
    X=X_train, y=y_train,
    batch_size=64, mode="train", onehot=onehot)
SentGener_train = SentenceGenerator(
    X=X_train[:22000], y=y_train[:22000],
    batch_size=64, mode="train", onehot=onehot)
SentGener_val = SentenceGenerator(
    X=X_train[22000:], y=y_train[22000:],
    batch_size=64, bins_count=10, mode="test", onehot=onehot)
SentGener_test = SentenceGenerator(
    X=X_test, y=y_test,
    batch_size=64, mode="test", onehot=onehot)

In [47]:
# Pool diagnostics: min_gap for the train/test pools, the step count of each
# pool, and the smallest entry of the training pool's bins_lens.
print(SentGener_train.pool.min_gap, SentGener_test.pool.min_gap, SentGener_train_all.pool.steps, SentGener_train.pool.steps, SentGener_test.pool.steps, min(SentGener_train.pool.bins_lens))

220 250 400 400 400 46


In [13]:
# Early stopping on val_loss (patience 5) plus a checkpoint that only keeps
# the best model seen so far.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, mode="min"),
    ModelCheckpoint(filepath="E:/kaggle/avito/imdb_testset/tf_model/spp_net_imdb.hdf5", save_best_only=True),
]

In [60]:
def get_spp_embeding_model(pool_list, num_words, mode = "avg", word_size = 100, embedings = None):
    """Build the deeper convolutional SPP text classifier.

    Architecture: embedding -> dropout -> Conv1D(512) -> MaxPooling1D(3)
    -> Conv1D(256) -> Conv1D(128) -> SpatialPyramidPooling1D -> two dense
    layers with dropout -> a single sigmoid unit, compiled with Adam and
    mean-squared-error loss.

    Args:
        pool_list: region counts forwarded to ``SpatialPyramidPooling1D``.
        num_words: vocabulary size of the embedding layer.
        mode: pooling mode forwarded to ``SpatialPyramidPooling1D``.
        word_size: embedding dimension.
        embedings: optional pre-trained embedding matrix; when given the
            embedding layer is initialised with it and frozen.

    Returns:
        The compiled Keras ``Model``.

    NOTE(review): the output is one sigmoid unit, while the generators in
    this notebook are created with ``onehot=True`` (two label columns);
    confirm the label format before training this model.
    """
    word_indices = Input(shape=[None], name="word_indices")
    # Test the `embedings` parameter (previously a misspelled global name
    # was checked here, which raised NameError or ignored the argument).
    if embedings is None:
        word_embedding = Embedding(num_words, word_size)(word_indices)
    else:
        word_embedding = Embedding(num_words, word_size, weights=[embedings], trainable=False)(word_indices)
    x_flow = Dropout(0.1)(word_embedding)
    x_flow = Conv1D(512, 3, padding='same', activation='relu', strides=1)(x_flow)
    x_flow = MaxPooling1D(3)(x_flow)
    x_flow = Conv1D(256, 3, padding='same', activation='relu', strides=1)(x_flow)
    x_flow = Conv1D(128, 3, padding='same', activation='relu', strides=1)(x_flow)
    # Pyramid pooling turns the variable-length sequence into a fixed vector.
    x_flow = SpatialPyramidPooling1D(pool_list=pool_list, mode=mode)(x_flow)
    x_flow = Dropout(0.1)(x_flow)
    x_flow = Dense(512, activation='relu')(x_flow)
    x_flow = Dropout(0.1)(x_flow)
    x_flow = Dense(256, activation='relu')(x_flow)
    y_output = Dense(1, activation='sigmoid')(x_flow)
    optimizer = Adam(lr=1e-3)
    model = Model(inputs=[word_indices], outputs=y_output)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
    return model

def get_spp_embeding_model_shallow(pool_list, num_words, mode = "avg", word_size = 100, embedings = None):
    """Build the shallow multi-width convolutional SPP text classifier.

    Four parallel ``Conv1D(128)`` branches with kernel widths 2, 3, 4 and 5
    read the embedded sequence; their outputs are concatenated along axis 1,
    reduced by ``SpatialPyramidPooling1D`` and classified by a 128-unit dense
    layer followed by a 2-way softmax. The model is compiled with Adam
    (lr=1e-3) and categorical cross-entropy.

    Args:
        pool_list: region counts forwarded to ``SpatialPyramidPooling1D``.
        num_words: vocabulary size of the embedding layer.
        mode: pooling mode forwarded to ``SpatialPyramidPooling1D``.
        word_size: embedding dimension.
        embedings: optional pre-trained embedding matrix; when given the
            embedding layer is initialised with it and frozen.

    Returns:
        The compiled Keras ``Model``.
    """
    word_indices = Input(shape=[None], name="word_indices")
    if embedings is not None:
        embedding_layer = Embedding(num_words, word_size, weights=[embedings], trainable=False)
    else:
        embedding_layer = Embedding(num_words, word_size)
    word_embedding = embedding_layer(word_indices)
    # One convolution branch per kernel width, created in ascending order.
    # (A second per-branch stage of width-2/width-3 convolutions merged with
    # Add() was tried here and is currently disabled.)
    conv_branches = [
        Conv1D(128, kernel_width, padding='same', activation='relu')(word_embedding)
        for kernel_width in (2, 3, 4, 5)
    ]
    merged = Concatenate(axis=1)(conv_branches)
    pooled = SpatialPyramidPooling1D(pool_list=pool_list, mode=mode)(merged)
    hidden = Dense(128, activation='relu')(pooled)
    y_output = Dense(2, activation='softmax')(hidden)
    optimizer = Adam(lr=1e-3)
    model = Model(inputs=[word_indices], outputs=y_output)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [61]:
# embed = spp_model.layers[1]
# K.get_session().run(embed.embeddings)
# embedings = K.get_session().run(spp_model.layers[1].embeddings)
# np.save("E:/kaggle/avito/imdb_testset/imdb_embedding_size_300_words_30000", embedings)
# embedings = np.load("E:/kaggle/avito/imdb_testset/imdb_embedding_size_300_words_30000.npy")

In [62]:
# Build the shallow SPP classifier with randomly initialised, trainable
# 300-dimensional embeddings (embedings=None) and print its layer summary.
spp_model = get_spp_embeding_model_shallow(pool_list=[1, 2, 4], num_words=30000, word_size=300, mode="max", embedings=None)
spp_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_indices (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, None, 300)    9000000     word_indices[0][0]               
__________________________________________________________________________________________________
conv1d_61 (Conv1D)              (None, None, 128)    76928       embedding_7[0][0]                
__________________________________________________________________________________________________
conv1d_62 (Conv1D)              (None, None, 128)    115328      embedding_7[0][0]                
__________________________________________________________________________________________________
conv1d_63 

In [63]:
# Train for 10 rounds of 5 epochs on the full training set and report test
# accuracy after every round.
# NOTE(review): `callbacks` is never passed to fit_generator below, so these
# resets (and early stopping / checkpointing) currently have no effect.
callbacks[0].best = np.inf
callbacks[1].best = np.inf
for i in range(10):
    print("==========round-%s" % i)
    spp_model.fit_generator(SentGener_train_all, steps_per_epoch=SentGener_train_all.pool.steps, epochs=5, shuffle=True, verbose=1)
    # Rewind the test generator so prediction starts from its first batch.
    SentGener_test.reset()
    test_y_hat = spp_model.predict_generator(SentGener_test, steps=SentGener_test.pool.steps, verbose=1)
    # NOTE(review): comparing against SentGener_test.y assumes predictions come
    # back in the same order as the generator's reordered labels - confirm.
    # The 1e-5 offset presumably nudges exact 0.5 scores up before rounding.
    acc = accuracy_score(SentGener_test.y, np.round(test_y_hat + 1e-5))
    print("==========round-%s, accuracy: %s" % (i, acc))
# spp_model.load_weights("E:/kaggle/avito/imdb_testset/tf_model/spp_net_imdb.hdf5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
# Rewind the test generator and predict over all of its steps.
SentGener_test.reset()
test_y_hat = spp_model.predict_generator(SentGener_test, steps=SentGener_test.pool.steps, verbose=1)



In [56]:
# Accuracy of the rounded predictions against the generator's label array.
# NOTE(review): assumes prediction order matches SentGener_test.y - confirm.
accuracy_score(SentGener_test.y, np.round(test_y_hat + 1e-5))

0.88996