In [4]:
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
import tensorflow as tf 
from keras.engine.topology import Layer
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam, RMSprop
from keras.models import Model
import keras.backend as K
import numpy as np
from keras.datasets import imdb
import preprocessing
from sklearn.metrics import accuracy_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
# Reload the already-imported local `preprocessing` module so that edits made
# to its source file are picked up by this kernel session.
import importlib
importlib.reload(preprocessing)

<module 'preprocessing' from 'D:\\jupyter-workdir\\nlp\\bg_fusion\\preprocessing.py'>

In [7]:
import os.path as osp
import pickle

# Location of the cached `words_list` (pickled) so the corpus is fitted only once.
WORDS_LIST_PATH = "E:/kaggle/avito/imdb_testset/aclImdb_v1/words_counter_list"

wordCounter = preprocessing.WordCounter()
if not osp.isfile(WORDS_LIST_PATH):
    # No cache yet: fit on the combined negative/positive corpus files and
    # persist the resulting word list for later sessions.
    wordCounter.fit(["E:/kaggle/avito/imdb_testset/aclImdb_v1/train_test_neg.txt", "E:/kaggle/avito/imdb_testset/aclImdb_v1/train_test_pos.txt"])
    # `with` guarantees the handle is flushed and closed even if dumping fails.
    with open(WORDS_LIST_PATH, "wb") as cache_file:
        pickle.dump(wordCounter.words_list, cache_file)
else:
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # cache file that was produced locally by the branch above.
    with open(WORDS_LIST_PATH, "rb") as cache_file:
        wordCounter.words_list = pickle.load(cache_file)

In [8]:
# Report how many entries the fitted/loaded word list contains.
print("words count", len(wordCounter.words_list))

words count 181924


In [9]:
def getdata(num_words=None):
    """Build shuffled train/test splits from the four IMDB text files.

    Every file is encoded with the module-level ``wordCounter``
    (``transform([...], max_words=num_words)``). Samples coming from a
    ``*_pos`` file get label 1, samples from a ``*_neg`` file get label 0.
    Positive samples are placed before negative ones and each split is then
    shuffled with a ``RandomState(0)`` generator, so the order is repeatable.

    Args:
        num_words: forwarded to ``wordCounter.transform`` as ``max_words``.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` as NumPy arrays.
    """
    def encode(path):
        # One corpus file -> array of encoded samples.
        return np.array(wordCounter.transform([path], max_words=num_words))

    rng = np.random.RandomState(0)

    train_pos = encode("E:/kaggle/avito/imdb_testset/aclImdb_v1/train_pos.txt")
    train_neg = encode("E:/kaggle/avito/imdb_testset/aclImdb_v1/train_neg.txt")
    train_labels = np.concatenate([np.ones(train_pos.shape[0]),
                                   np.zeros(train_neg.shape[0])])

    test_pos = encode("E:/kaggle/avito/imdb_testset/aclImdb_v1/test_pos.txt")
    test_neg = encode("E:/kaggle/avito/imdb_testset/aclImdb_v1/test_neg.txt")
    test_labels = np.concatenate([np.ones(test_pos.shape[0]),
                                  np.zeros(test_neg.shape[0])])

    train_samples = np.concatenate([train_pos, train_neg])
    test_samples = np.concatenate([test_pos, test_neg])

    # Draw the train permutation first, then the test one (same RNG stream).
    train_order = rng.permutation(train_samples.shape[0])
    test_order = rng.permutation(test_samples.shape[0])

    train_split = (train_samples[train_order], train_labels[train_order])
    test_split = (test_samples[test_order], test_labels[test_order])
    return train_split, test_split

In [32]:
# Alternative data source kept for reference (Keras' pre-encoded IMDB archive):
# (X_train, y_train), (X_test, y_test) = imdb.load_data(path="E:/paper/stackingmodel/imdb/imdb.npz", num_words=10000)
# Encode the raw text files with the local word counter; num_words is passed
# through to wordCounter.transform as max_words.
(X_train, y_train), (X_test, y_test) = getdata(num_words=10000)

In [11]:
# Shortest encoded sample across the train and test splits.
min(len(sample) for split in (X_train, X_test) for sample in split)

8

In [12]:
class SpatialPyramidPooling1D(Layer):
    """Spatial pyramid pooling along axis 1 of a 3-D ``(batch, steps, channels)`` tensor.

    For every entry ``n`` of ``pool_list`` the step axis is cut into ``n``
    regions of (roughly) equal length and each region is reduced with max or
    mean pooling. The pooled vectors of all regions are concatenated, so the
    output has ``sum(pool_list) * channels`` features regardless of the
    (possibly dynamic) number of steps.

    Args:
        pool_list: 1-D sequence of ints, number of regions per pyramid level.
        mode: ``"max"`` or ``"avg"``.
        **kwargs: forwarded to the Keras ``Layer`` constructor.
    """
    def __init__(self, pool_list, mode = "max", **kwargs):
        self.pool_list = np.array(pool_list)
        self.mode = mode
        assert self.pool_list.ndim == 1, "pool_list ndim must be 1"
        assert self.mode in ["max", "avg"], "mode must be either max or avg"
        # Total number of pooled regions over all pyramid levels.
        self.num_outputs = sum(pool_list)
        super(SpatialPyramidPooling1D, self).__init__(**kwargs)

    def build(self, input_shape):
        # Channel count of the incoming feature map; fixes the output width.
        self.gram_size = input_shape[2]
        super(SpatialPyramidPooling1D, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.num_outputs * self.gram_size)

    def get_config(self):
        # Export pool_list as a plain list: a NumPy array is not JSON
        # serialisable, while a list round-trips through __init__ unchanged.
        config = {'pool_list': self.pool_list.tolist(), "mode": self.mode}
        base_config = super(SpatialPyramidPooling1D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def call(self, x):
        input_shape = K.shape(x)
        num_steps = input_shape[1]
        # Fractional region length for every pyramid level.
        gram_length = [K.cast(num_steps, 'float32') / i for i in self.pool_list]
        reduce_fn = K.max if self.mode == "max" else K.mean
        outputs = []
        for pool_index, num_pool_regions in enumerate(self.pool_list):
            for ix in range(num_pool_regions):
                x1 = ix * gram_length[pool_index]
                x2 = ix * gram_length[pool_index] + gram_length[pool_index]
                x1 = K.cast(K.round(x1), 'int32')
                x2 = K.cast(K.round(x2), 'int32')
                # When there are fewer steps than regions the rounded bounds
                # can coincide, which would pool over an empty slice (-inf for
                # max, NaN for mean). Force every region to cover at least one
                # existing step; regular inputs are unaffected by the clamp.
                x1 = K.minimum(x1, num_steps - 1)
                x2 = K.maximum(x2, x1 + 1)
                x_crop = x[:, x1:x2, :]
                outputs.append(reduce_fn(x_crop, axis=1))
        outputs = K.concatenate(outputs)
        return outputs

In [13]:
# Does not support dynamic input sizes (the static input shape is required).
class KMaxPooling1D(Layer):
    """Pyramid pooling implemented with ``tf.nn.max_pool`` / ``tf.nn.avg_pool``.

    ``call`` reads the static shape of its input, so the step dimension must
    be known when the graph is built. For each entry ``n`` of ``pool_list``
    the input is pooled with a window and stride of ``round(steps / n)`` along
    axis 1, the result is flattened to ``n * channels`` features, and the
    per-level results are concatenated.

    Args:
        pool_list: 1-D sequence of ints, number of regions per pyramid level.
        mode: ``"max"`` or ``"avg"``.
        **kwargs: forwarded to the Keras ``Layer`` constructor.
    """
    def __init__(self, pool_list, mode = "max", **kwargs):
        self.pool_list = np.array(pool_list)
        self.mode = mode
        assert self.pool_list.ndim == 1, "pool_list ndim must be 1"
        assert self.mode in ["max", "avg"], "mode must be either max or avg"
        # Total number of pooled regions over all pyramid levels.
        self.num_outputs = sum(pool_list)
        super(KMaxPooling1D, self).__init__(**kwargs)

    def build(self, input_shape):
        # Channel count of the incoming feature map; fixes the output width.
        self.gram_size = input_shape[2]
        super(KMaxPooling1D, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.num_outputs * self.gram_size)

    def get_config(self):
        # Export pool_list as a plain list: a NumPy array is not JSON
        # serialisable, while a list round-trips through __init__ unchanged.
        config = {'pool_list': self.pool_list.tolist(), "mode": self.mode}
        base_config = super(KMaxPooling1D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def call(self, x):
        input_shape = x.get_shape()
        input_shape_list = input_shape.as_list()
        # Add a trailing axis of size 1 so the 2-D pooling ops can be used.
        x = tf.reshape(x, [-1, input_shape[1], input_shape[2], tf.constant(1)])
        outputs = []
        gram_length = [input_shape_list[1] / i for i in self.pool_list]
        embedding_size = input_shape_list[2]
        for pool_index, num_pool_regions in enumerate(self.pool_list):
            # Window height and stride along the step axis for this level.
            ph = np.round(gram_length[pool_index]).astype(np.int32)
            sh = ph
            # NOTE(review): the window also spans `embedding_size` positions
            # along axis 2 (stride 1, SAME padding), so each output value is
            # pooled over neighbouring channels as well - confirm intended.
            if self.mode == "max":
                pool_result = tf.nn.max_pool(x,
                                             ksize=[1, ph, embedding_size, 1],
                                             strides=[1, sh, 1, 1],
                                             padding='SAME')
            elif self.mode == "avg":
                pool_result = tf.nn.avg_pool(x,
                                             ksize=[1, ph, embedding_size, 1],
                                             strides=[1, sh, 1, 1],
                                             padding='SAME')
            # NOTE(review): this reshape assumes the pooled height equals
            # pool_list[pool_index]; with SAME padding the height is
            # ceil(steps / sh), which may differ when steps is not a multiple
            # of the region count - TODO confirm.
            outputs.append(tf.reshape(pool_result, [-1, tf.constant(self.pool_list[pool_index]) * input_shape[2]]))
        outputs = K.concatenate(outputs, axis=1)
        return outputs

In [33]:
class SentenceGenerator(object):
    """Batch generator that pairs padded samples from a sample pool with labels.

    Samples are served by ``preprocessing.AutoPaddingInMemorySamplePool``;
    labels are reordered once with the pool's ``sorted_indices`` so that the
    pool's ``chunk_indices_list`` can index them directly for every batch.

    Args:
        X: samples handed to the pool.
        y: NumPy array of labels aligned with ``X``.
        batch_size: forwarded to the pool as ``chunk_size``.
        bins_count: forwarded to the pool.
        mode: ``"train"`` selects the pool's ``"random"`` padding mode, any
            other value selects ``"specific"``. In ``"test"`` mode the label
            indices of every served batch are recorded.
    """
    def __init__(self, X, y, batch_size = 125, bins_count=100, mode="train"):
        self.batch_size = batch_size
        padding_mode = "random" if mode == "train" else "specific"
        self.pool = preprocessing.AutoPaddingInMemorySamplePool(X, chunk_size=batch_size, bins_count=bins_count, mode=padding_mode)
        # Labels in the pool's sorted order.
        self.y = y[self.pool.sorted_indices]
        self.mode = mode
        # Label indices of the batches served so far (filled in test mode only).
        self.y_indices_record = []

    def __iter__(self):
        """Return self so the object satisfies the iterator protocol."""
        return self

    def iter(self):
        """Backward-compatible alias of ``__iter__``."""
        return self.__iter__()

    def reset(self):
        """Reset the underlying pool and forget the recorded label indices."""
        self.pool.reset()
        self.y_indices_record = []

    def __next__(self):
        """Return the next ``(batch_samples, batch_labels)`` pair."""
        batch_samples = self.pool.__next__()
        if self.mode == "test":
            self.y_indices_record.extend(self.pool.chunk_indices_list)
        return batch_samples, self.y[self.pool.chunk_indices_list]

    # Python 2 spelling of the iterator protocol.
    next = __next__

    def get_test_y(self, steps):
        """Return the labels of the first ``steps * batch_size`` served samples."""
        return self.y[self.y_indices_record[: steps * self.batch_size]]
# Split X_train at index 22000: the first part feeds the training generator
# (default mode="train"), the remainder is held out for validation.
SentGener_train = SentenceGenerator(X_train[:22000], y_train[:22000])
SentGener_val = SentenceGenerator(X_train[22000:], y_train[22000:], mode="test", bins_count=10)
# The test generator runs in "test" mode so served label indices are recorded.
SentGener_test = SentenceGenerator(X_test, y_test, mode="test")

In [34]:
# Inspect pool attributes: min_gap and steps of the train/test pools and the
# smallest entry of the training pool's bins_lens.
print(SentGener_train.pool.min_gap, SentGener_test.pool.min_gap, SentGener_train.pool.steps, SentGener_test.pool.steps, min(SentGener_train.pool.bins_lens))

220 250 200 200 46


In [None]:
# Stop once val_loss has not improved for 5 epochs and keep only the best
# model seen so far on disk.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, mode="min"),
    ModelCheckpoint(
        "E:/kaggle/avito/imdb_testset/tf_model/spp_net_imdb.hdf5",
        save_best_only=True,
    ),
]

In [70]:
def get_spp_embeding_model(pool_list, num_words, mode = "avg", word_size = 100):
    """Build and compile the variable-length CNN + spatial-pyramid-pooling classifier.

    Pipeline: embedding -> dropout -> Conv1D(512) -> MaxPooling1D(3) ->
    Conv1D(256) -> Conv1D(128) -> SpatialPyramidPooling1D -> two
    dropout/dense stages (512 and 256 units) -> sigmoid output.

    Args:
        pool_list: pyramid levels passed to ``SpatialPyramidPooling1D``.
        num_words: input dimension of the embedding layer.
        mode: pooling mode passed to ``SpatialPyramidPooling1D``.
        word_size: embedding dimension.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, Adam (lr=1e-3)
        and the accuracy metric.
    """
    # The step dimension is left as None so inputs of any length are accepted.
    word_indices = Input(shape=[None], name="word_indices")
    features = Embedding(num_words, word_size)(word_indices)
    features = Dropout(0.1)(features)

    # Convolutional feature extractor.
    features = Conv1D(512, 3, strides=1, padding='same', activation='relu')(features)
    features = MaxPooling1D(3, padding='valid')(features)
    for n_filters in (256, 128):
        features = Conv1D(n_filters, 3, strides=1, padding='same', activation='relu')(features)

    # Collapse the variable-length step axis into a fixed-size vector.
    features = SpatialPyramidPooling1D(pool_list=pool_list, mode=mode)(features)

    # Fully connected head, each dense layer preceded by light dropout.
    for n_units in (512, 256):
        features = Dropout(0.1)(features)
        features = Dense(n_units, activation='relu')(features)
    y_output = Dense(1, activation='sigmoid')(features)

    optimizer = Adam(lr=1e-3)
    # Alternative optimizer tried earlier:
    #     SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
    model = Model(inputs=[word_indices], outputs=y_output)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

def get_kmaxpooling_embedding_model(pool_list, num_words, mode = "avg", word_size = 100, input_length = 1256):
    """Build and compile the fixed-length CNN + ``KMaxPooling1D`` classifier.

    Pipeline: embedding -> dropout -> Conv1D(256) -> MaxPooling1D(3) ->
    KMaxPooling1D -> Dense(word_size) -> sigmoid output.

    Args:
        pool_list: pyramid levels passed to ``KMaxPooling1D``.
        num_words: input dimension of the embedding layer.
        mode: pooling mode passed to ``KMaxPooling1D``.
        word_size: embedding dimension, also the width of the hidden dense layer.
        input_length: static length of every input sequence. ``KMaxPooling1D``
            needs a static shape; the default keeps the previously hard-coded
            value of 1256.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, Adam (lr=1e-3)
        and the accuracy metric.
    """
    word_indices = Input(shape=[input_length], name="word_indices")
    word_embedding = Embedding(num_words, word_size)(word_indices)
    x_flow = Dropout(0.1)(word_embedding)
    x_flow = Conv1D(256, 3, padding='same', activation='relu', strides=1)(x_flow)
    x_flow = MaxPooling1D(3)(x_flow)
    x_flow = KMaxPooling1D(pool_list=pool_list, mode=mode)(x_flow)
    x_flow = Dense(word_size, activation='relu')(x_flow)
    y_output = Dense(1, activation='sigmoid')(x_flow)
    optimizer = Adam(lr=1e-3)
    model = Model(inputs=[word_indices], outputs=y_output)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [71]:
# embed = spp_model.layers[1]
# K.get_session().run(embed.embeddings)

In [72]:
# Build the SPP classifier with pyramid levels of 1, 2 and 4 regions, max
# pooling and 200-dimensional embeddings; num_words matches the value passed
# to getdata above.
spp_model = get_spp_embeding_model(pool_list=[1, 2, 4], num_words=10000, word_size=200, mode="max")
spp_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_indices (InputLayer)    (None, None)              0         
_________________________________________________________________
embedding_10 (Embedding)     (None, None, 200)         2000000   
_________________________________________________________________
dropout_26 (Dropout)         (None, None, 200)         0         
_________________________________________________________________
conv1d_26 (Conv1D)           (None, None, 512)         307712    
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, None, 512)         0         
_________________________________________________________________
conv1d_27 (Conv1D)           (None, None, 256)         393472    
_________________________________________________________________
conv1d_28 (Conv1D)           (None, None, 128)         98432     
__________

In [73]:
# One epoch is as many batches as the training pool provides. Take that count
# from the generator itself (as is done for validation below) instead of
# deriving it from the size of X_train, which also contains the held-out
# validation samples that SentGener_train never sees.
spp_model.fit_generator(SentGener_train,
                        steps_per_epoch=SentGener_train.pool.steps,
                        epochs=20, shuffle=True, verbose=1,
                        callbacks=callbacks,
                        validation_data=SentGener_val,
                        validation_steps=SentGener_val.pool.steps
                       )
# Restore the checkpoint written by ModelCheckpoint (save_best_only=True)
# rather than keeping the weights of the last epoch.
spp_model.load_weights("E:/kaggle/avito/imdb_testset/tf_model/spp_net_imdb.hdf5")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


In [74]:
# Reset the test generator (resets its pool and clears y_indices_record)
# before predicting, then run one pass of pool.steps batches.
SentGener_test.reset()
test_y_hat = spp_model.predict_generator(SentGener_test, steps=SentGener_test.pool.steps, verbose=1)



In [75]:
# Compare predictions with the labels in the order the batches were actually
# served: the generator records the label indices of every batch in test mode.
# The slice drops indices of batches that were prefetched but never predicted.
test_y_true = SentGener_test.get_test_y(SentGener_test.pool.steps)[:len(test_y_hat)]
# Adding 1e-5 before rounding makes a prediction of exactly 0.5 count as class 1.
accuracy_score(test_y_true, np.round(test_y_hat + 1e-5))

0.87208