## Download rnn_merged.zip & rnn_embed.zip from https://drive.google.com/drive/folders/1yO_W-m0fF_PludrnScdgyTGsPFoDsA6_?usp=sharing and unzip them into the same folder as this file

## Also download train_jpg.zip & test_jpg.zip from the competition website

In [1]:
import pandas as pd
import tensorflow as tf
from keras.preprocessing import text, sequence
import numpy as np
from keras.layers import Input, SpatialDropout1D,Dropout, GlobalAveragePooling1D, GlobalMaxPooling1D, \
                            CuDNNGRU, GRU, Bidirectional, LSTM, Dense, Embedding, concatenate, Embedding, \
                            Flatten, Activation, BatchNormalization, regularizers, Conv1D, Conv2D, MaxPooling2D
from keras.constraints import max_norm

from keras.initializers import Orthogonal
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, LambdaCallback, Callback, LearningRateScheduler
import keras.backend as K
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import os
import pickle
import gc; gc.enable()
import matplotlib 
import matplotlib.pyplot as plt
%matplotlib inline
import string
import nltk
from nltk.corpus import stopwords                
from nltk.stem.snowball import RussianStemmer
from scipy.stats import boxcox
import re
#from tqdm import tqdm

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Check GPU Availability

In [2]:
# Create a TF session configured to log the device each op is placed on.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
from tensorflow.python.client import device_lib
# Print every local device TensorFlow reports.
print(device_lib.list_local_devices())
# GPUs as seen by the Keras TensorFlow backend (private helper); its return
# value is what the notebook shows as this cell's output.
K.tensorflow_backend._get_available_gpus()

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15244745819299257413
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3174131302
locality {
  bus_id: 1
  links {
  }
}
incarnation: 4980011148626425272
physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


['/job:localhost/replica:0/task:0/device:GPU:0']

### Preprocess Training and Testing Data

In [3]:
# Global run configuration.
# batch_size: 32 or 64 is good (too huge for my PC), 128 is worse in the past experiments
seed, rnn_train_epochs = 411, 8
batch_size, cpu_count = 64, 4

In [4]:
# NOTE(security): pickle.load can execute arbitrary code stored in the file.
# rnn_merged.pkl is a downloaded artifact -- only load it from a trusted source.
with open('rnn_merged.pkl', 'rb') as merged_file:  # handle is closed on exit
    features = pickle.load(merged_file)
# Show which entries the bundle contains (cell output).
features.keys()

dict_keys(['categorical', 'y_train', 'train', 'test'])

In [5]:
# Pull the individual pieces out of the loaded bundle.
train, test = features['train'], features['test']

# Give every column whose name contains 'cat_features_user_id_category_name'
# a running numeric suffix; all other column names are kept unchanged.
renamed_cols = []
count = 0
for col in train.columns:
    if 'cat_features_user_id_category_name' not in col:
        renamed_cols.append(col)
        continue
    renamed_cols.append('cat_features_user_id_category_name_' + str(count))
    count += 1
# The same list is applied to both frames, so test must have train's column layout.
train.columns = renamed_cols
test.columns = renamed_cols

train_len = train.shape[0]
train_y = features['y_train']
categorical = features['categorical']
# Everything that is not declared categorical is treated as numerical.
numerical = [name for name in train.columns if name not in categorical]
# NOTE: from here on `features` is the ordered list of model input columns,
# no longer the dict loaded from the pickle.
features = numerical + categorical

In [6]:
# Display all column names of the training frame (cell output).
train.columns.tolist()

['price',
 'image_top_1',
 'item_seq_number',
 'activation_weekday',
 'avg_days_up_user',
 'avg_times_up_user',
 'n_user_items',
 'cat_features_user_id_category_name_0',
 'cat_features_user_id_category_name_1',
 'cat_features_user_id_category_name_2',
 'cat_features_user_id_category_name_3',
 'cat_features_user_id_category_name_4',
 'cat_features_user_id_category_name_5',
 'cat_features_user_id_category_name_6',
 'cat_features_user_id_category_name_7',
 'cat_features_user_id_category_name_8',
 'cat_features_user_id_category_name_9',
 'cat_features_user_id_category_name_10',
 'cat_features_user_id_category_name_11',
 'cat_features_user_id_category_name_12',
 'cat_features_user_id_category_name_13',
 'cat_features_user_id_category_name_14',
 'cat_features_user_id_category_name_15',
 'cat_features_user_id_category_name_16',
 'cat_features_user_id_category_name_17',
 'cat_features_user_id_category_name_18',
 'cat_features_user_id_category_name_19',
 'cat_features_user_id_category_name_20',

In [7]:
# remove features: text, image, other embeddings\feature engineerings
def _fm_cols(prefix, fields):
    """Column names of one FM feature group: item count, factors 0/1 per field, then biases."""
    factors = ['{}{}_fm_factor_{}'.format(prefix, field, k) for field in fields for k in (0, 1)]
    biases = ['{}{}_fm_bias'.format(prefix, field) for field in fields]
    return [prefix + 'item_num'] + factors + biases


# Candidate columns to drop, assembled group by group from their naming patterns.
# The resulting list has the same elements in the same order as a hand-written one.
remove_cols = (
    # user/category embedding columns 0..46
    ['cat_features_user_id_category_name_{}'.format(i) for i in range(47)]
    # TF-IDF SVD components of the two text fields
    + ['{}_tfidf_svd_{}'.format(field, k)
       for field in ('title', 'description')
       for k in range(1, 6)]
    # per-group mean aggregates
    + ['{}_mean_{}'.format(group, stat)
       for group in ('region', 'city', 'parent_category_name', 'category_name',
                     'user_type', 'param_1', 'param_2', 'param_3')
       for stat in ('price', 'image_top_1', 'item_seq_number',
                    'price_pred', 'price_pred_all', 'ridge_preds')]
    # irregular user_id / image_top_1 aggregates
    + [
        'user_id_nunique_parent_category_name',
        'user_id_nunique_category_name',
        'user_id_nunique_param_1',
        'user_id_nunique_param_2',
        'user_id_nunique_param_3',
        'user_id_nunique_activation_date',
        'user_id_activation_date_count_item_id',
        'image_top_1_nunique_item_id',
        'image_top_1_nunique_user_id',
        'image_top_1_nunique_category_name',
        'image_top_1_nunique_param_1',
        'image_top_1_nunique_item_seq_number',
        'image_top_1_mean_price_pred',
        'image_top_1_std_price_pred',
        'image_top_1_mean_item_seq_number',
        'user_id_mean_ridge_preds',
        'user_id_category_name_mean_ridge_preds',
        'user_id_image_top_1_mean_ridge_preds',
        'user_id_category_name_sum_ridge_preds',
    ]
    # factorization-machine feature groups
    + _fm_cols('cityxcatxusertype', ('city', 'category_name', 'user_type'))
    + _fm_cols('imgxcityxcat', ('image_top_1', 'city', 'category_name'))
    # this group is irregular (single user_type factor, no user_type bias)
    + ['imgxisqnxusertype' + suffix for suffix in (
        'item_num',
        'image_top_1_fm_factor_0',
        'image_top_1_fm_factor_1',
        'item_seq_number_fm_factor_0',
        'item_seq_number_fm_factor_1',
        'user_type_fm_factor_0',
        'image_top_1_fm_bias',
        'item_seq_number_fm_bias',
    )]
    # per-channel image intensity statistics
    + ['{}_intensity_{}'.format(channel, stat)
       for channel in ('b', 'g', 'gray', 'r')
       for stat in ('mean', 'median', 'std')]
    # NIMA score statistics
    + ['nasnet_nima_' + suffix for suffix in (
        'med', 'std', 'max', 'min', '1_quartile', '3_quartile',
        '13_quartile_diff', 'max_min_diff', 'non_max_mean',
        'max_non_max_mean_diff',
    )]
)

In [8]:
'''
train.drop(remove_cols, axis=1, inplace=True)
test.drop(remove_cols, axis=1, inplace=True)

for col in remove_cols:
    if col in categorical:
        categorical.remove(col)
    if col in numerical:
        numerical.remove(col)
        
features = numerical + categorical
'''

'\ntrain.drop(remove_cols, axis=1, inplace=True)\ntest.drop(remove_cols, axis=1, inplace=True)\n\nfor col in remove_cols:\n    if col in categorical:\n        categorical.remove(col)\n    if col in numerical:\n        numerical.remove(col)\n        \nfeatures = numerical + categorical\n'

In [9]:
# Attach the image id of every row; rows without an image get the 'no-image' marker.
# NOTE(review): the train ids are re-ordered by activation_date and re-indexed
# before being assigned, which presumably mirrors the row order used when
# rnn_merged.pkl was built -- TODO confirm, including the order of rows that
# share the same activation_date.
train_images = pd.read_csv(
    'train.csv',
    usecols=['activation_date', 'image'],
    parse_dates=['activation_date'],
)
train_images = train_images.sort_values('activation_date').reset_index(drop=True)
train.loc[:, 'image'] = train_images['image'].fillna('no-image')

# The test ids are taken in file order.
test_images = pd.read_csv('test.csv', usecols=['image'])
test.loc[:, 'image'] = test_images['image'].fillna('no-image')

In [10]:
# Description text settings: (presumably) vocabulary cap, padded sequence
# length and embedding width.
max_features, maxlen, embed_size = 500000, 150, 300

# Title text settings, same three roles as above.
title_max_features, title_maxlen, title_embed_size = 200000, 80, 100

In [11]:
# NOTE(security): pickle.load can execute arbitrary code stored in the file.
# rnn_embed.pkl is a downloaded artifact -- only load it from a trusted source.
with open('rnn_embed.pkl', 'rb') as embed_file:  # handle is closed on exit
    embed_info = pickle.load(embed_file)
# Show which entries the bundle contains (cell output).
embed_info.keys()

dict_keys(['desc_embed_info', 'title_embed_info'])

In [12]:
# Split the embedding bundle into its description part and its title part.
desc_embed_info, title_embed_info = (
    embed_info[key] for key in ('desc_embed_info', 'title_embed_info')
)

In [13]:
print('setup max info for embedding in categorical variables')
# Embedding table size per categorical column = largest code + 1.
# The maximum is taken over train AND test: sizing the table from train alone
# would leave any code that only occurs in the test set outside the embedding
# range at prediction time.
max_info = {col: max(train[col].max(), test[col].max()) + 1 for col in categorical}

setup max info for embedding in categorical variables


### Build RNN Model

In [14]:
def root_mean_squared_error(y_true, y_pred):
    """Backend RMSE: square root of the mean squared difference over all elements."""
    squared_error = K.square(y_true - y_pred)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

In [15]:
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
class Attention(Layer):
    """Attention-weighted sum over axis 1 of a 3D input.

    For every step a scalar score is computed as ``tanh(x . W + b)``; the scores
    are exponentiated, optionally masked, normalised to sum to 1 over the steps
    and used to take a weighted sum of the input along axis 1.

    Args:
        step_dim: Length of axis 1 of the input. The per-step scores are
            reshaped to ``(-1, step_dim)``, so it must match the input.
        W_regularizer, b_regularizer: Optional regularizers for the score
            weights and the per-step bias.
        W_constraint, b_constraint: Optional constraints for the same weights.
        bias: Whether to add a learned per-step bias to the scores.
        **kwargs: Forwarded to ``Layer.__init__``.

    Input shape: 3D, ``(input_shape[0], step_dim, features_dim)``.
    Output shape: ``(input_shape[0], features_dim)``.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the score weight vector and, if enabled, the per-step bias."""
        print(input_shape)
        assert len(input_shape) == 3

        # One weight per feature; `shape` is passed by keyword so the call does
        # not depend on the legacy positional argument order of add_weight.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias per step (axis 1 of the input).
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: the step axis is summed away, so no mask is propagated."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (-1, features_dim), project every step onto W, then fold
        # the scores back to (-1, step_dim).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked steps get zero weight before normalisation.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon guards against division by zero when every step is masked.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps axis 0 and the feature axis; the step axis is removed."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild this layer."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [16]:
def clip_rmse(true, prediction):
    """RMSE between ``true`` and ``prediction`` after clipping the prediction to [0, 1]."""
    clipped = np.clip(prediction, 0., 1.)
    mse = metrics.mean_squared_error(true, clipped)
    return np.sqrt(mse)
    
class NBatchEvalLogger(Callback):
    """Evaluate on a hold-out set every ``display`` batches and keep the best model.

    Args:
        display: Evaluate whenever the global batch counter is a multiple of this.
        val_X: Validation inputs passed to ``model.predict``.
        val_y: Validation targets compared against the clipped predictions.
        save_path: If given, the model is saved here whenever the validation
            loss improves (including the very first evaluation).
        save_start: No evaluation happens before this many batches have run.

    Attributes:
        step: Number of batches seen so far (across epochs).
        best_loss: Lowest clipped RMSE observed so far, or None before the
            first evaluation.
        record_count: Number of times the model has been saved.
    """

    def __init__(self, display, val_X, val_y, save_path=None, save_start=1000):
        # Let the base class set up the attributes Keras expects on a callback.
        super(NBatchEvalLogger, self).__init__()
        self.step = 0
        self.display = display
        self.val_X = val_X
        self.val_y = val_y
        self.best_loss = None
        self.save_path = save_path
        self.save_start = save_start
        self.record_count = 0

    def on_batch_end(self, batch, logs=None):
        """Count the batch and, when due, evaluate, checkpoint and report."""
        self.step += 1
        if self.step % self.display != 0 or self.step < self.save_start:
            return

        prediction = self.model.predict(self.val_X, batch_size=128, verbose=0)
        loss = clip_rmse(self.val_y, prediction)

        # The first evaluation counts as an improvement too; otherwise no
        # checkpoint would exist if that first score is never beaten.
        if self.best_loss is None or loss < self.best_loss:
            self.best_loss = loss
            if self.save_path is not None:
                self.model.save(self.save_path, overwrite=True)
                self.record_count += 1

        print('\rstep: {} val loss={:.5f}, best loss={:.5f}'.format(self.step, loss, self.best_loss))

In [17]:
import keras
from copy import deepcopy as cp
import os
from zipfile import ZipFile
import cv2
import numpy as np
import pandas as pd
from dask import bag, threaded
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.resnet50 import preprocess_input
import concurrent.futures
from multiprocessing.pool import ThreadPool

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of tabular and text model inputs.

    Each batch is a dict with one entry per column in the module-level
    ``features`` list plus the ``'desc'`` and ``'title'`` token matrices taken
    from ``desc_embed_info`` / ``title_embed_info``. Image loading helpers are
    provided but the image input is currently not added to the batch.

    Args:
        list_IDs: Row ids served by this generator; used with ``X.loc`` and to
            index the text matrices and ``y``.
        X: DataFrame holding the feature columns (and the ``'image'`` column).
        y: Targets, indexed with the batch ids. Only read when ``is_train``.
        img_arch: Path of the zip archive containing the images.
        img_path: Directory prefix of the image files inside the archive.
        batch_size: Number of rows per batch; the last batch may be smaller.
        shuffle: Whether to reshuffle the ids at the start of every epoch.
        is_train: If True batches are ``(X, y)``, otherwise only ``X``.
    """

    # Image ids that are skipped when loading; their slot stays all-zero,
    # exactly like rows marked 'no-image'.
    INVALID_IMG_IDS = frozenset([
        '4f029e2a00e892aa2cac27d98b52ef8b13d91471f613c8d3c38e3f29d4da0b0c',
        '8513a91e55670c709069b5f85e12a59095b802877715903abef16b7a6f306e58',
        '60d310a42e87cdf799afcd89dc1b11ae3fdc3d0233747ec7ef78d82c87002e83',
        'b98b291bd04c3d92165ca515e00468fd9756af9a8f1df42505deed1dcfb5d7ae',
    ])

    def __init__(self, list_IDs, X, y, img_arch, img_path, batch_size=32, shuffle=True, is_train=True):
        self.batch_size = batch_size
        self.X = X
        self.y = y
        self.list_IDs = list_IDs
        self.shuffle = shuffle
        self.img_path = img_path
        self.is_train = is_train
        # Build the initial (optionally shuffled) id order.
        self.on_epoch_end()
        self.zipped = ZipFile(img_arch)
        print('file names:\n', self.zipped.namelist()[1:10], '\n...')

        # Worker threads for image decoding; sized by the module-level cpu_count.
        self.pool = ThreadPool(cpu_count)

    def __len__(self):
        """Number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` of the current epoch."""
        start = index*self.batch_size
        end = min((index+1)*self.batch_size, len(self.indexes))
        indexes = self.indexes[start: end]

        return self.__data_generation(indexes)

    def on_epoch_end(self):
        """Reset the id order and reshuffle it when shuffling is enabled."""
        self.indexes = cp(list(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def close(self):
        """Release the worker pool and the image archive handle."""
        self.pool.close()
        self.pool.join()
        self.zipped.close()

    def load_img_from_zipped(self, img_id, i, imgs_holder):
        """Decode image ``img_id`` from the archive into ``imgs_holder[i]``.

        Missing ('no-image'), black-listed or unreadable images leave the slot
        untouched. Failures are reported on stdout and never raised.
        Always returns None; the result is written in place.
        """
        try:
            if img_id == 'no-image' or img_id in self.INVALID_IMG_IDS:
                return None
            exfile = self.zipped.read(self.img_path+img_id+'.jpg')
            arr = np.frombuffer(exfile, np.uint8)
            imz = cv2.imdecode(arr, flags=cv2.IMREAD_UNCHANGED)
            imz = cv2.resize(imz, (224,224), interpolation=cv2.INTER_AREA)
            imgs_holder[i] = img_to_array(imz)
        except Exception:
            # Best effort: a broken image must not abort the batch. Unlike a
            # bare `except`, this lets KeyboardInterrupt/SystemExit through.
            print(img_id, ' is invalid')

        return None

    def parallel_load_imgs(self, img_ids):
        """Load all ``img_ids`` with the thread pool and return the preprocessed batch."""
        imgs_holder = np.zeros((len(img_ids), 224, 224, 3))
        res = [self.pool.apply_async(self.load_img_from_zipped, (im_id, i, imgs_holder)) for i, im_id in enumerate(img_ids)]
        # Wait for every worker before the buffer is used.
        for r in res:
            r.get()

        imgs_holder = preprocess_input(imgs_holder) # adjust to mean of rgb to some value

        return imgs_holder

    def __data_generation(self, list_IDs_temp):
        """Assemble the input dict (and targets when training) for the given ids."""
        X = dict((col, self.X.loc[list_IDs_temp, col].values) for col in features)
        X['desc'] = desc_embed_info['text'][list_IDs_temp,:]
        X['title'] = title_embed_info['text'][list_IDs_temp,:]
        # Image input is disabled; re-enable together with the image branch of the model:
        #X['imgs'] = self.parallel_load_imgs(self.X.loc[list_IDs_temp, 'image'].values)

        if self.is_train:
            y = cp(self.y[list_IDs_temp])
            return X, y
        else:
            return X

In [18]:
# 'train_jpg.zip', 'data/competition_files/train_jpg/', 
# debug use
'''
zipped = ZipFile('train_jpg.zip')
print(zipped.namelist()[1:10])

img_id = '2809fd6afd6d3cae4dd4ad93a7f905a0db32292f4df4b3f19fa5492e08cbfd90'
target_size=(224,224)
try:
    exfile = zipped.read('data/competition_files/train_jpg/'+img_id+'.jpg')
    arr = np.frombuffer(exfile, np.uint8)
    imz = cv2.imdecode(arr, flags=cv2.IMREAD_UNCHANGED)
    imz = cv2.resize(imz, target_size, interpolation=cv2.INTER_AREA)
except:
    print(img_id, ' is invalid')
    imz = None
imz
'''

"\nzipped = ZipFile('train_jpg.zip')\nprint(zipped.namelist()[1:10])\n\nimg_id = '2809fd6afd6d3cae4dd4ad93a7f905a0db32292f4df4b3f19fa5492e08cbfd90'\ntarget_size=(224,224)\ntry:\n    exfile = zipped.read('data/competition_files/train_jpg/'+img_id+'.jpg')\n    arr = np.frombuffer(exfile, np.uint8)\n    imz = cv2.imdecode(arr, flags=cv2.IMREAD_UNCHANGED)\n    imz = cv2.resize(imz, target_size, interpolation=cv2.INTER_AREA)\nexcept:\n    print(img_id, ' is invalid')\n    imz = None\nimz\n"

In [19]:
def _text_branch(name, seq_len, embed_info, embed_dim, spatial_dropout, gru_units, conv_filters):
    """Build one text branch: frozen pretrained embedding -> BiGRU -> Conv1D -> avg+max pooling.

    Returns the branch's ``Input`` tensor and its pooled output tensor.
    """
    inp = Input(shape = (seq_len, ), name=name)
    emb = Embedding(embed_info['nb_words'], embed_dim, weights = [embed_info['emb_matrix']],
                    input_length = seq_len, trainable = False)(inp)
    emb = SpatialDropout1D(spatial_dropout)(emb)
    seq = Bidirectional(CuDNNGRU(gru_units, return_sequences = True))(emb)
    seq = Conv1D(conv_filters, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(seq)
    seq_avg = GlobalAveragePooling1D()(seq)
    seq_max = GlobalMaxPooling1D()(seq)
    pooled = concatenate([seq_avg, seq_max])
    return inp, Dropout(0.1)(pooled)


def build_model(categorical_features, numerical_features):
    """Build and compile the multi-input regression network.

    Branches: numerical columns (Dense), categorical columns (one Embedding
    each, then Dense), description text and title text (see ``_text_branch``).
    The branches are concatenated and mapped to a single sigmoid output.

    Reads the module-level ``max_info``, ``maxlen``, ``embed_size``,
    ``desc_embed_info``, ``title_maxlen``, ``title_embed_size`` and
    ``title_embed_info``.

    Args:
        categorical_features: Column names that get an embedding input each.
        numerical_features: Column names fed as plain scalar inputs.

    Returns:
        The compiled Keras ``Model`` (its summary is printed as a side effect).
    """
    # non-cat features: one scalar input per column
    non_cat_inputs = []
    for col in numerical_features:
        f = Input(shape=[1], name=col)
        non_cat_inputs.append(f)

    non_cat_ch = concatenate(non_cat_inputs)
    non_cat_ch = Dense(64, activation='relu')(non_cat_ch)

    # cat features: one embedding per column
    cat_inputs = []
    cat_embeds = []
    for col in categorical_features:
        f = Input(shape=[1], name=col)
        # max_info[col] is already "largest code + 1"; int() accepts both
        # numpy scalars and plain Python ints.
        embed_dim = int(max_info[col])
        # Low-cardinality columns get a 1-d embedding, everything else 10-d.
        reduced_dim = 10 if embed_dim > 10 else 1
        embed_f = Embedding(embed_dim, reduced_dim)(f)
        flatten_f = Flatten()(embed_f)
        cat_inputs.append(f)
        cat_embeds.append(flatten_f)

    cat_ch = concatenate(cat_embeds)
    cat_ch = Dense(64, activation='relu')(cat_ch)

    # text features: architecture of text to try here!!!
    # description
    text_inp, text_gru = _text_branch('desc', maxlen, desc_embed_info, embed_size,
                                      spatial_dropout=0.3, gru_units=128, conv_filters=64)
    # title
    title_inp, title_gru = _text_branch('title', title_maxlen, title_embed_info, title_embed_size,
                                        spatial_dropout=0.1, gru_units=32, conv_filters=16)

    # Disabled image branch, kept for reference.
    # reference: https://keras.io/getting-started/functional-api-guide/#more-examples, Visual question answering model
    # img_inp = Input(shape = (224, 224, 3 ), name='imgs')
    # img_ch = Conv2D(64, (3, 3), activation='relu', padding='same', W_constraint=max_norm(3))(img_inp)
    # img_ch = Conv2D(64, (3, 3), activation='relu')(img_ch)
    # img_ch = MaxPooling2D((2, 2))(img_ch)
    # #img_ch = Conv2D(128, (3, 3), activation='relu', padding='same', W_constraint=max_norm(3))(img_ch)
    # #img_ch = Conv2D(128, (3, 3), activation='relu')(img_ch)
    # #img_ch = MaxPooling2D((2, 2))(img_ch)
    # #img_ch = Conv2D(256, (3, 3), activation='relu', padding='same', W_constraint=max_norm(3))(img_ch)
    # #img_ch = Conv2D(256, (3, 3), activation='relu')(img_ch)
    # #img_ch = Conv2D(256, (3, 3), activation='relu')(img_ch)
    # #img_ch = MaxPooling2D((2, 2))(img_ch)
    # img_ch = Flatten()(img_ch)
    # img_ch = Dense(64, activation='relu')(img_ch)

    # merge each branch: non-cat, cat, text, img
    concat_main = [non_cat_ch, cat_ch, text_gru, title_gru]
    main = concatenate(concat_main)
    main = BatchNormalization()(main)
    main = Dropout(0.1)(main)
    main = BatchNormalization()(Dense(128, activation='relu')(main))
    out = Dense(1, activation = "sigmoid")(main)

    concat_input = non_cat_inputs+cat_inputs+[text_inp, title_inp]
    model = Model(concat_input, out)
    # NOTE(review): assigning `model.regularizers` looks like a Keras 1 idiom and
    # presumably has no effect in Keras 2 (no layer is given a regularizer) --
    # TODO confirm; use `kernel_regularizer=` on the layers if L2 is wanted.
    model.regularizers = [regularizers.l2(0.0001)]
    model.compile(optimizer = Adam(lr=0.001), loss = root_mean_squared_error,
                  metrics =[root_mean_squared_error])
    model.summary()
    return model

### Training

In [20]:
from sklearn.model_selection import KFold
import warnings; warnings.filterwarnings('ignore') 

In [21]:
# Row positions 0..n-1 for the train and the test frame.
train_indices = np.arange(train_len)
test_indices = np.arange(len(test))

In [22]:
from keras_tqdm import TQDMNotebookCallback
from ipywidgets import IntProgress

In [23]:
# Resume control for the training cell below:
#   start_fold <= 0 -> start from scratch and train every fold from fold 1
#   start_fold  > 0 -> reload pickled predictions and skip folds < start_fold
start_fold = 4
# Filename prefix of the pickled OOF predictions; any string works.
resume_file_prefix = '0619_rnn'

In [24]:
# K-fold training loop.
# For every fold: build a fresh model, train it with the batch generators,
# reload the checkpoint with the best val_loss, then
#   * write that fold's validation predictions into `ret` (out-of-fold preds)
#   * add that fold's test predictions to `ret_test` (averaged after the loop)
import os
import pickle

# Single source of truth for the number of folds: used both for the split
# and for averaging the accumulated test predictions.
n_folds = 5

# Make sure the checkpoint directory exists before the first save.
os.makedirs('rnn_weights', exist_ok=True)

if start_fold > 0:
    # Resume: reload the predictions accumulated by the folds that already ran.
    # NOTE: pickle is unsafe on untrusted files; only load files this notebook wrote.
    with open(resume_file_prefix+'_oof_val_pred', 'rb') as f:
        ret = pickle.load(f)
    with open(resume_file_prefix+'_oof_test_pred', 'rb') as f:
        ret_test = pickle.load(f)
    print(ret)
    print(ret_test)
else:
    ret = np.zeros((train.shape[0],))
    ret_test = np.zeros((test.shape[0],))

splitter = KFold(n_folds, shuffle=True, random_state=seed)
for fold, (tr_ix, val_ix) in enumerate(splitter.split(train_indices), start=1):
    # When resuming, skip the folds whose predictions were loaded above.
    # (With start_fold <= 0 this is never true, so every fold is trained.)
    if fold < start_fold:
        continue

    model = build_model(categorical, numerical)
    file_path = "rnn_weights/model_final_fold_{}.hdf5".format(fold)

    # customized batch loader
    training_generator = DataGenerator(tr_ix, train, train_y,
                                       'train_jpg.zip', 'data/competition_files/train_jpg/',
                                       batch_size=batch_size, shuffle=True)
    validation_generator = DataGenerator(val_ix, train, train_y,
                                         'train_jpg.zip', 'data/competition_files/train_jpg/',
                                         batch_size=batch_size, shuffle=False)

    # Step decay: the learning rate is multiplied by 0.2 every 5 epochs.
    lr_schd = LearningRateScheduler(lambda epoch: 0.001*(0.2**(epoch//5)), verbose=1)
    # Keep only the weights of the epoch with the lowest validation loss.
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", mode = "min", save_best_only = True, verbose = 1)
    history = model.fit_generator(generator=training_generator,
                                  validation_data=validation_generator,
                                  use_multiprocessing=False,
                                  workers=1,
                                  epochs=rnn_train_epochs,
                                  verbose = 0,
                                  callbacks = [lr_schd, check_point, TQDMNotebookCallback(leave_inner=True, leave_outer=True)])

    # Predict val + test oofs
    model.load_weights(file_path) # load weight with best validation score

    # Rebuild the validation generator without targets (is_train=False) for prediction.
    del validation_generator
    validation_generator = DataGenerator(val_ix, train, None,
                                         'train_jpg.zip', 'data/competition_files/train_jpg/',
                                         batch_size=batch_size, shuffle=False, is_train=False)
    test_generator = DataGenerator(test_indices, test, None,
                                   'test_jpg.zip', 'data/competition_files/test_jpg/',
                                   batch_size=batch_size, shuffle=False, is_train=False)

    ret[val_ix] = model.predict_generator(validation_generator, use_multiprocessing=False, workers=1).reshape((len(val_ix),))
    ret_test += model.predict_generator(test_generator, use_multiprocessing=False, workers=1).reshape((ret_test.shape[0],))

    # Drop the per-fold objects before the next fold is built.
    del model, history, training_generator, validation_generator, test_generator; gc.collect()

# ret_test holds the sum of the per-fold test predictions; turn it into the mean.
ret_test /= n_folds

[0.         0.         0.03891562 ... 0.19570075 0.         0.18555354]
[0.68953376 0.27252602 0.57872966 ... 0.12527748 1.48004061 0.29635632]
Instructions for updating:
Use the retry module or similar alternatives.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
desc (InputLayer)               (None, 150)          0                                            
__________________________________________________________________________________________________
title (InputLayer)              (None, 80)           0                                            
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 150, 300)     150000000   desc[0][0]                       
_______

__________________________________________________________________________________________________
cat_features_user_id_category_n (None, 1)            0                                            
__________________________________________________________________________________________________
cat_features_user_id_category_n (None, 1)            0                                            
__________________________________________________________________________________________________
cat_features_user_id_category_n (None, 1)            0                                            
__________________________________________________________________________________________________
cat_features_user_id_category_n (None, 1)            0                                            
__________________________________________________________________________________________________
cat_features_user_id_category_n (None, 1)            0                                            
__________

__________________________________________________________________________________________________
image_top_1_nunique_item_id (In (None, 1)            0                                            
__________________________________________________________________________________________________
image_top_1_nunique_user_id (In (None, 1)            0                                            
__________________________________________________________________________________________________
image_top_1_nunique_category_na (None, 1)            0                                            
__________________________________________________________________________________________________
image_top_1_nunique_param_1 (In (None, 1)            0                                            
__________________________________________________________________________________________________
image_top_1_nunique_item_seq_nu (None, 1)            0                                            
__________

__________________________________________________________________________________________________
flatten_7 (Flatten)             (None, 10)           0           embedding_7[0][0]                
__________________________________________________________________________________________________
flatten_8 (Flatten)             (None, 10)           0           embedding_8[0][0]                
__________________________________________________________________________________________________
flatten_9 (Flatten)             (None, 10)           0           embedding_9[0][0]                
__________________________________________________________________________________________________
flatten_10 (Flatten)            (None, 10)           0           embedding_10[0][0]               
__________________________________________________________________________________________________
flatten_11 (Flatten)            (None, 10)           0           embedding_11[0][0]               
__________

                                                                 latitude[0][0]                   
                                                                 longitude[0][0]                  
                                                                 cityxcatxusertypeitem_num[0][0]  
                                                                 cityxcatxusertypecity_fm_factor_0
                                                                 cityxcatxusertypecity_fm_factor_1
                                                                 cityxcatxusertypecategory_name_fm
                                                                 cityxcatxusertypecategory_name_fm
                                                                 cityxcatxusertypeuser_type_fm_fac
                                                                 cityxcatxusertypeuser_type_fm_fac
                                                                 cityxcatxusertypecity_fm_bias[0][
          

HBox(children=(IntProgress(value=0, description='Training', max=8), HTML(value='')))


Epoch 00001: LearningRateScheduler reducing learning rate to 0.001.


HBox(children=(IntProgress(value=0, description='Epoch 0', max=18793), HTML(value='')))


Epoch 00001: val_loss improved from inf to 0.22979, saving model to rnn_weights/model_final_fold_4.hdf5

Epoch 00002: LearningRateScheduler reducing learning rate to 0.001.


HBox(children=(IntProgress(value=0, description='Epoch 1', max=18793), HTML(value='')))


Epoch 00002: val_loss improved from 0.22979 to 0.21667, saving model to rnn_weights/model_final_fold_4.hdf5

Epoch 00003: LearningRateScheduler reducing learning rate to 0.001.


HBox(children=(IntProgress(value=0, description='Epoch 2', max=18793), HTML(value='')))


Epoch 00003: val_loss improved from 0.21667 to 0.21600, saving model to rnn_weights/model_final_fold_4.hdf5

Epoch 00004: LearningRateScheduler reducing learning rate to 0.001.


HBox(children=(IntProgress(value=0, description='Epoch 3', max=18793), HTML(value='')))


Epoch 00004: val_loss improved from 0.21600 to 0.21423, saving model to rnn_weights/model_final_fold_4.hdf5

Epoch 00005: LearningRateScheduler reducing learning rate to 0.001.


HBox(children=(IntProgress(value=0, description='Epoch 4', max=18793), HTML(value='')))


Epoch 00005: val_loss did not improve from 0.21423

Epoch 00006: LearningRateScheduler reducing learning rate to 0.0002.


HBox(children=(IntProgress(value=0, description='Epoch 5', max=18793), HTML(value='')))


Epoch 00006: val_loss improved from 0.21423 to 0.21334, saving model to rnn_weights/model_final_fold_4.hdf5

Epoch 00007: LearningRateScheduler reducing learning rate to 0.0002.


HBox(children=(IntProgress(value=0, description='Epoch 6', max=18793), HTML(value='')))


Epoch 00007: val_loss did not improve from 0.21334

Epoch 00008: LearningRateScheduler reducing learning rate to 0.0002.


HBox(children=(IntProgress(value=0, description='Epoch 7', max=18793), HTML(value='')))


Epoch 00008: val_loss did not improve from 0.21334

file names:
 ['data/competition_files/train_jpg/0b8eed559572527d972b4d959e8f4c107fdd9bc19cca04903854ac315f74615e.jpg', 'data/competition_files/train_jpg/856e74b8c46edcf0c0e23444eab019bfda63687bb70a3481955cc6ab86e39df2.jpg', 'data/competition_files/train_jpg/122d198cf11ab32d2346bff455d6702f1ea519df957cea2625aa50842fe14ad1.jpg', 'data/competition_files/train_jpg/2809fd6afd6d3cae4dd4ad93a7f905a0db32292f4df4b3f19fa5492e08cbfd90.jpg', 'data/competition_files/train_jpg/5ef4a19afe4ad593464931734ff43c1112cf94c6bdb4593f3b754fee46739515.jpg', 'data/competition_files/train_jpg/c37787b5cc6c3052130c6f390aa5b57462b558a204d5c4124bc89447c9e1b4b0.jpg', 'data/competition_files/train_jpg/0f8ae17e177ed82363ed3dba7d277ed6227ac0c935cb52c17f02d638a92aef6e.jpg', 'data/competition_files/train_jpg/ba126be25858022d3cddf07d27288f9d35c495458ec49aa9820708379b7ffc1e.jpg', 'data/competition_files/train_jpg/4cc05cb70bcdde73e34718020f2ef4c69063af4098602bfed8a00e7f53a

__________________________________________________________________________________________________
cat_features_user_id_category_n (None, 1)            0                                            
__________________________________________________________________________________________________
cat_features_user_id_category_n (None, 1)            0                                            
__________________________________________________________________________________________________
cat_features_user_id_category_n (None, 1)            0                                            
__________________________________________________________________________________________________
cat_features_user_id_category_n (None, 1)            0                                            
__________________________________________________________________________________________________
cat_features_user_id_category_n (None, 1)            0                                            
__________

__________________________________________________________________________________________________
user_id_nunique_category_name ( (None, 1)            0                                            
__________________________________________________________________________________________________
user_id_nunique_param_1 (InputL (None, 1)            0                                            
__________________________________________________________________________________________________
user_id_nunique_param_2 (InputL (None, 1)            0                                            
__________________________________________________________________________________________________
user_id_nunique_param_3 (InputL (None, 1)            0                                            
__________________________________________________________________________________________________
user_id_nunique_activation_date (None, 1)            0                                            
__________

__________________________________________________________________________________________________
flatten_13 (Flatten)            (None, 10)           0           embedding_15[0][0]               
__________________________________________________________________________________________________
flatten_14 (Flatten)            (None, 10)           0           embedding_16[0][0]               
__________________________________________________________________________________________________
flatten_15 (Flatten)            (None, 10)           0           embedding_17[0][0]               
__________________________________________________________________________________________________
flatten_16 (Flatten)            (None, 1)            0           embedding_18[0][0]               
__________________________________________________________________________________________________
flatten_17 (Flatten)            (None, 10)           0           embedding_19[0][0]               
__________

                                                                 image_top_1_std_price_pred[0][0] 
                                                                 image_top_1_mean_item_seq_number[
                                                                 user_id_mean_ridge_preds[0][0]   
                                                                 user_id_category_name_mean_ridge_
                                                                 user_id_image_top_1_mean_ridge_pr
                                                                 user_id_category_name_sum_ridge_p
                                                                 region_te[0][0]                  
                                                                 city_te[0][0]                    
                                                                 parent_category_name_te[0][0]    
                                                                 category_name_te[0][0]           
          

HBox(children=(IntProgress(value=0, description='Training', max=8), HTML(value='')))


Epoch 00001: LearningRateScheduler reducing learning rate to 0.001.


HBox(children=(IntProgress(value=0, description='Epoch 0', max=18793), HTML(value='')))


Epoch 00001: val_loss improved from inf to 0.21749, saving model to rnn_weights/model_final_fold_5.hdf5

Epoch 00002: LearningRateScheduler reducing learning rate to 0.001.


HBox(children=(IntProgress(value=0, description='Epoch 1', max=18793), HTML(value='')))


Epoch 00002: val_loss improved from 0.21749 to 0.21631, saving model to rnn_weights/model_final_fold_5.hdf5

Epoch 00003: LearningRateScheduler reducing learning rate to 0.001.


HBox(children=(IntProgress(value=0, description='Epoch 2', max=18793), HTML(value='')))


Epoch 00003: val_loss did not improve from 0.21631

Epoch 00004: LearningRateScheduler reducing learning rate to 0.001.


HBox(children=(IntProgress(value=0, description='Epoch 3', max=18793), HTML(value='')))


Epoch 00004: val_loss improved from 0.21631 to 0.21569, saving model to rnn_weights/model_final_fold_5.hdf5

Epoch 00005: LearningRateScheduler reducing learning rate to 0.001.


HBox(children=(IntProgress(value=0, description='Epoch 4', max=18793), HTML(value='')))


Epoch 00005: val_loss improved from 0.21569 to 0.21527, saving model to rnn_weights/model_final_fold_5.hdf5

Epoch 00006: LearningRateScheduler reducing learning rate to 0.0002.


HBox(children=(IntProgress(value=0, description='Epoch 5', max=18793), HTML(value='')))


Epoch 00006: val_loss improved from 0.21527 to 0.21461, saving model to rnn_weights/model_final_fold_5.hdf5

Epoch 00007: LearningRateScheduler reducing learning rate to 0.0002.


HBox(children=(IntProgress(value=0, description='Epoch 6', max=18793), HTML(value='')))


Epoch 00007: val_loss did not improve from 0.21461

Epoch 00008: LearningRateScheduler reducing learning rate to 0.0002.


HBox(children=(IntProgress(value=0, description='Epoch 7', max=18793), HTML(value='')))


Epoch 00008: val_loss did not improve from 0.21461

file names:
 ['data/competition_files/train_jpg/0b8eed559572527d972b4d959e8f4c107fdd9bc19cca04903854ac315f74615e.jpg', 'data/competition_files/train_jpg/856e74b8c46edcf0c0e23444eab019bfda63687bb70a3481955cc6ab86e39df2.jpg', 'data/competition_files/train_jpg/122d198cf11ab32d2346bff455d6702f1ea519df957cea2625aa50842fe14ad1.jpg', 'data/competition_files/train_jpg/2809fd6afd6d3cae4dd4ad93a7f905a0db32292f4df4b3f19fa5492e08cbfd90.jpg', 'data/competition_files/train_jpg/5ef4a19afe4ad593464931734ff43c1112cf94c6bdb4593f3b754fee46739515.jpg', 'data/competition_files/train_jpg/c37787b5cc6c3052130c6f390aa5b57462b558a204d5c4124bc89447c9e1b4b0.jpg', 'data/competition_files/train_jpg/0f8ae17e177ed82363ed3dba7d277ed6227ac0c935cb52c17f02d638a92aef6e.jpg', 'data/competition_files/train_jpg/ba126be25858022d3cddf07d27288f9d35c495458ec49aa9820708379b7ffc1e.jpg', 'data/competition_files/train_jpg/4cc05cb70bcdde73e34718020f2ef4c69063af4098602bfed8a00e7f53a

In [25]:
# Persist the OOF predictions so the resume branch of the training cell
# (start_fold > 0) can reload them after an interrupted run, e.g. on OOM.
# NOTE: on resume, new fold predictions are added to the loaded `ret_test` and
# the result is then divided by the fold count. A dump taken after a completed
# run already holds the averaged values and must not be used to resume.
import pickle
with open(resume_file_prefix+'_oof_val_pred', 'wb') as f:
    pickle.dump(ret, f)
with open(resume_file_prefix+'_oof_test_pred', 'wb') as f:
    pickle.dump(ret_test, f)

In [26]:
# public kernel:  cv = .2220, lb = .2247 
# bigru-conv1d: cv =.2185 , lb = .2235
# bigru-attention: cv =.2186 , lb = .2235
# 2gru: lb: .2239
# self-trained wordvec: cv .217232, lb: .2229
# +partial new features: cv .216326, lb: 
# +all new features: cv .21403, lb: 

### Generate OOFs and Submissions

In [27]:
# Tag prepended to the OOF / submission CSV names and to the prediction column.
prefix = (
    'selftrained_bigru_conv1d_merged'
    '_more_features_simpler_network'
)

In [28]:
# Write the validation and test predictions as single-column CSVs that share
# the same column name.
for oof_values, csv_suffix in ((ret, '_rnn_oof_val_pred.csv'),
                               (ret_test, '_rnn_oof_test_pred.csv')):
    oof_frame = pd.DataFrame(data=oof_values, columns=[prefix+'_rnn_pred'])
    oof_frame.to_csv(prefix+csv_suffix, index=False)

In [29]:
# Fill the sample submission with the test predictions, limited to [0, 1].
submission_path = prefix+'_rnn_submission.csv'
clipped_test_pred = ret_test.clip(0, 1)
subm = pd.read_csv('sample_submission.csv')
subm['deal_probability'] = clipped_test_pred
subm.to_csv(submission_path, index=False)