In [1]:
import pandas as pd
import tensorflow as tf
from keras.preprocessing import text, sequence
import numpy as np
from keras.layers import Input, SpatialDropout1D,Dropout, GlobalAveragePooling1D, GRU, Bidirectional, LSTM, Dense, Embedding, concatenate, Embedding, Flatten, Activation, BatchNormalization, regularizers
from keras.initializers import Orthogonal
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, LambdaCallback, Callback, LearningRateScheduler
import keras.backend as K
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import os
import pickle
import gc; gc.enable()

Using TensorFlow backend.


In [2]:
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
K.tensorflow_backend._get_available_gpus()

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3905638275003752137
, name: "/gpu:0"
device_type: "GPU"
memory_limit: 104844492
locality {
  bus_id: 1
}
incarnation: 8885604699395045465
physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0"
]


['/gpu:0']

In [3]:
EMBEDDING_FILE = 'cc.ru.300.vec'
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'
DENSE_FEATURE_PATH = 'all_features_dense.pickle'
RAW_TEXT_PATH = 'text_feature_space_split_only.pickle'

In [4]:
def load_pickled_data(path):
    with open(path, 'rb') as handle:
        return pickle.load(handle)
    return None

In [5]:
max_features = 100000
maxlen = 150
embed_size = 300

print('loading data')
train = pd.read_csv(TRAIN_CSV, parse_dates=['activation_date'])
train = train.sort_values('activation_date').reset_index(drop=True)
labels = train['deal_probability'].values
train_len = len(labels)
del train; gc.collect()
dense_features = load_pickled_data(DENSE_FEATURE_PATH); 
texts = list(load_pickled_data(RAW_TEXT_PATH))

print('fitting tokenizer')
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

print('getting embeddings')
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding = 'utf8'))

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

print('setup max info')
max_info = dict((col, dense_features[col].max()) for col in dense_features.columns)

print('train-test-val split')
dense_train = dense_features.loc[:train_len-1,:]; del dense_features; gc.collect()
texts_train = texts[:train_len]; del texts; gc.collect()
dense_val = dense_train.loc[int(train_len*.7):,].reset_index(drop=True)
dense_train = dense_train.loc[:int(train_len*.7)-1,].reset_index(drop=True)
texts_val = texts_train[int(train_len*.7):]
texts_train = texts_train[:int(train_len*.7)]
y_val = labels[int(train_len*.7):]
y_train = labels[:int(train_len*.7)]
print('Sample num sanity test (should be equal)', dense_train.shape[0], len(y_train), len(texts_train))
print('Sample num sanity test (should be equal)', dense_val.shape[0], len(y_val), len(texts_val))
print(type(y_train), type(texts_train))
#X_train, X_valid, y_train, y_valid = train_test_split(train['description'].values, labels['deal_probability'].values, test_size = 0.1, random_state = 23)

print('convert to sequences')
texts_train = tokenizer.texts_to_sequences(texts_train)
texts_val = tokenizer.texts_to_sequences(texts_val)

print('padding')
texts_train = sequence.pad_sequences(texts_train, maxlen=maxlen)
texts_val = sequence.pad_sequences(texts_val, maxlen=maxlen)

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.clip(y_pred, 0., 1.) - y_true), axis=-1))
print(dense_train.columns.tolist())

X_train = dict((col, dense_train[col].values) for col in dense_train.columns)
X_train['text'] = texts_train
X_val = dict((col, dense_val[col].values) for col in dense_val.columns)
X_val['text'] = texts_val

cat_features_to_embed = ['user_id', 'user_type', 'region', 'city', 'parent_category_name', 'category_name', 
                        'param_1', 'param_2', 'param_3', 'item_seq_number',  'image_top_1',  'region_city', 
                        'parent_category_name_category_name', 'parent_category_name_param_1', 'parent_category_name_param_2', 'parent_category_name_param_3', 
                        'category_name_param_1', 'category_name_param_2', 'category_name_param_3', 'parent_category_name_region', 'category_name_region', 
                        #'parent_category_name_city', 'category_name_city', 'parent_category_name_image_top_1', 'category_name_image_top_1', 
                        'Weekday'] #, 'dom']
                        
other_features = ['price', 'region_in_title', 'region_in_title_counts', 'city_in_title', 'city_in_title_counts', 
                'parent_category_name_in_title', 'parent_category_name_in_title_counts', 'category_name_in_title', 'category_name_in_title_counts', 
                'region_in_description', 'region_in_description_counts', 'city_in_description', 'city_in_description_counts', 
                'parent_category_name_in_description', 'parent_category_name_in_description_counts', 'category_name_in_description', 'category_name_in_description_counts', 
                'title_in_description', 'title_in_description_counts', 'desc_char_count', 'space_count', 'surprise_count', 'question_count', 'quote_count', 'quote_count2',]

loading data
fitting tokenizer
getting embeddings
setup max info
train-test-val split
Sample num sanity test (should be equal) 1052396 1052396 1052396
Sample num sanity test (should be equal) 451028 451028 451028
<class 'numpy.ndarray'> <class 'list'>
convert to sequences
padding
['user_id', 'region', 'city', 'parent_category_name', 'category_name', 'param_1', 'param_2', 'param_3', 'price', 'item_seq_number', 'user_type', 'image_top_1', 'same_activate_cnt', 'duration_1_mean', 'duration_1_med', 'duration_1_min', 'duration_1_max', 'duration_1_std', 'duration_2_mean', 'duration_2_med', 'duration_2_min', 'duration_2_max', 'duration_2_std', 'duration_3_mean', 'duration_3_med', 'duration_3_min', 'duration_3_max', 'duration_3_std', 'renewed_count_mean', 'renewed_count_med', 'renewed_count_min', 'renewed_count_max', 'renewed_count_std', 'is_renewed_mean', 'is_renewed_med', 'is_renewed_std', 'param_1_is_na', 'param_2_is_na', 'param_3_is_na', 'description_is_na', 'price_is_na', 'image_top_1_is_n

In [6]:
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [7]:
def build_model(X):
    
    # non-cat features
    non_cat_inputs = []
    for col in other_features:
        f = Input(shape=[1], name=col)
        non_cat_inputs.append(f)
        
    # cat features
    cat_inputs = []
    cat_embeds = []
    for col in cat_features_to_embed:
        f = Input(shape=[1], name=col)
        embed_dim = max_info[col].max()+1
        reduced_dim = max(2, embed_dim//4)
        embed_f = Embedding(embed_dim, reduced_dim)(f)
        flatten_f = Flatten()(embed_f)
        cat_inputs.append(f)
        cat_embeds.append(flatten_f)
      
    # text features
    text_inp = Input(shape = (maxlen, ), name='text')
    '''
    text_emb = Embedding(nb_words, embed_size, weights = [embedding_matrix],
                    input_length = maxlen, trainable = False)(text_inp)
    text_emb = SpatialDropout1D(0.1)(text_emb)
    text_gru = Bidirectional(GRU(32,return_sequences = True))(text_emb)
    text_gru = GlobalAveragePooling1D()(text_gru)
    text_gru = Dropout(0.1)(text_gru)
    '''
    text_emb = Embedding(nb_words, embed_size, weights=[embedding_matrix],
                  trainable=False)(text_inp)
    text_emb = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25,
                           recurrent_dropout=0.25))(text_emb)
    text_gru = Attention(maxlen)(text_emb)
    
    concat_main = non_cat_inputs+cat_embeds+[text_gru]
    main = concatenate(concat_main)
    main = BatchNormalization()(main)
    main = BatchNormalization()(Dense(256, activation='relu')(main))
    main = BatchNormalization()(Dense(128, activation='relu')(main))
    main = BatchNormalization()(Dense(64, activation='relu')(main))
    out = Dense(1, activation = "linear")(main)

    concat_input = non_cat_inputs+cat_inputs+[text_inp]
    model = Model(concat_input, out)
    model.regularizers = [regularizers.l2(0.0001)]
    model.compile(optimizer = Adam(lr=0.0005), loss = 'mean_squared_error',
                  metrics =[root_mean_squared_error])
    #model.summary()
    return model

def clip_rmse(true, prediction):
    return np.sqrt(metrics.mean_squared_error(true, np.clip(prediction, 0., 1.)))
    
class NBatchEvalLogger(Callback):
    def __init__(self, display, val_X, val_y, save_path=None, save_start=1000):
        self.step = 0
        self.display = display
        self.val_X = val_X
        self.val_y = val_y
        self.best_loss = None
        self.save_path = save_path
        self.save_start = save_start
        self.record_count = 0
        
    def on_batch_end(self, batch, logs={}):
        self.step += 1
        if self.step % self.display == 0 and self.step >= self.save_start:
            #loss, metric = self.model.evaluate(self.val_X, self.val_y, batch_size=128, verbose=1)
            prediction = self.model.predict(self.val_X, batch_size=128, verbose=0)
            loss = clip_rmse(self.val_y, prediction)
            
            if self.best_loss is None:
                self.best_loss = loss
            else:
                if loss < self.best_loss:
                    self.best_loss = loss
                    if self.save_path is not None:
                        self.model.save(self.save_path.replace('model','model_'+str(self.record_count)), overwrite=True)
                        self.record_count += 1
                    
            print('\nstep: {} val loss={:.5f}, best loss={:.5f}'.format(self.step, loss, self.best_loss))

In [8]:
EPOCHS = 8
model = build_model(X_train)
file_path = "rnn_weights/model.hdf5"
  
#check_point = ModelCheckpoint(file_path, monitor = "val_loss", mode = "min", save_best_only = True, verbose = 1)
lr_schd = LearningRateScheduler(lambda epoch: 0.001*(0.1**(epoch//4)), verbose=1)
check_point = NBatchEvalLogger(1000, X_val, y_val, save_path=file_path, save_start=2000)
history = model.fit(X_train, y_train, batch_size = 128, epochs = EPOCHS, validation_data = (X_val, y_val),
                verbose = 2, callbacks = [lr_schd, check_point])

Train on 1052396 samples, validate on 451028 samples
Epoch 1/8

Epoch 00001: LearningRateScheduler reducing learning rate to 0.001.

step: 2000 val loss=0.23400, best loss=0.23400

step: 3000 val loss=0.23245, best loss=0.23245

step: 4000 val loss=0.23193, best loss=0.23193

step: 5000 val loss=0.23577, best loss=0.23193

step: 6000 val loss=0.23270, best loss=0.23193

step: 7000 val loss=0.23096, best loss=0.23096

step: 8000 val loss=0.23227, best loss=0.23096
 - 7201s - loss: 0.0577 - root_mean_squared_error: 0.1595 - val_loss: 0.0532 - val_root_mean_squared_error: 0.1494
Epoch 2/8

Epoch 00002: LearningRateScheduler reducing learning rate to 0.001.

step: 9000 val loss=0.23053, best loss=0.23053

step: 10000 val loss=0.23463, best loss=0.23053

step: 11000 val loss=0.23030, best loss=0.23030

step: 12000 val loss=0.22970, best loss=0.22970

step: 13000 val loss=0.22934, best loss=0.22934

step: 14000 val loss=0.22879, best loss=0.22879

step: 15000 val loss=0.23082, best loss=0.22

In [None]:
max_bg_count = 5
bagging_count = 5

In [None]:
val_df = pd.DataFrame()
for i in range(max_bg_count-bagging_count, max_bg_count, 1):
    model.load_weights(file_path.replace('model', 'model_'+str(i)))
    prediction = model.predict(X_val)
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_val, np.clip(prediction, 0., 1.))))
    val_df.loc[:,'rnn_pred_'+str(i)] = prediction
    
val_df.to_csv('rnn_val_pred.csv', index=False)

In [None]:
'''
Generate submissions
'''
del X_train, X_val, dense_train, dense_val; gc.collect()
dense_features = load_pickled_data(DENSE_FEATURE_PATH)
texts = list(load_pickled_data(RAW_TEXT_PATH))

print('train-test-val split')
dense_test = dense_features.loc[train_len:,:].reset_index(drop=True); del dense_features; gc.collect()
texts_test = texts[train_len:]; del texts; gc.collect()

print('convert to sequences')
texts_test = tokenizer.texts_to_sequences(texts_test)

print('padding')
texts_test = sequence.pad_sequences(texts_test, maxlen=maxlen)

X_test = dict((col, dense_test[col].values) for col in dense_train.columns)
X_test['text'] = texts_test

In [None]:
model.load_weights(file_path.replace('model', 'model_'+str(max_bg_count-1)))
prediction = model.predict(X_test, batch_size = 128, verbose = 1)

sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)
submission = sample_submission.copy()
submission['deal_probability'] = np.clip(prediction, 0., 1.)
submission.to_csv('rnn_submission.csv')
pd.DataFrame(data=prediction, columns=['rnn_pred']).to_csv('rnn_test_pred.csv', index=False)

In [None]:
test_df = pd.DataFrame()
for i in range(max_bg_count-bagging_count, max_bg_count, 1):
    model.load_weights(file_path.replace('model', 'model_'+str(i)))
    prediction = model.predict(X_val)
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_val, np.clip(prediction, 0., 1.))))
    test_df.loc[:,'rnn_pred_'+str(i)] = prediction
    
test_df.to_csv('rnn_test_pred.csv', index=False)