In [8]:
import time
start_time = time.time()
from sklearn.model_selection import train_test_split
import sys, os, re, csv, codecs, numpy as np, pandas as pd
np.random.seed(32)
# os.environ["OMP_NUM_THREADS"] = "4"
import keras as ks
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, CuDNNGRU
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten, Concatenate
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D, LSTM
import tensorflow as tf

from importlib import reload
from sklearn.metrics import mean_squared_error
import datetime

import warnings
warnings.filterwarnings('ignore')

import os

import sys
sys.path.append(os.path.dirname(os.getcwd()))

from project_utils import kd_utils

pd.options.display.float_format = '{:,.8f}'.format
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 1000)

# Load fold split

In [9]:
uid=kd_utils.load_data('../input/indices.pkl')
trn_id=uid[0][0]
val_id=uid[0][1]
print(len(trn_id))
print(len(val_id))

682308
170577


# Load & scale numerical features

In [10]:
train_fs1=kd_utils.load_data('../feature_data/train_fs1.pkl')
train_fs2=kd_utils.load_data('../feature_data/train_fs2.pkl')
train_fs3=kd_utils.load_data('../feature_data/train_fs3.pkl')


test_fs1=kd_utils.load_data('../feature_data/test_fs1.pkl')
test_fs2=kd_utils.load_data('../feature_data/test_fs2.pkl')
test_fs3=kd_utils.load_data('../feature_data/test_fs3.pkl')


train_fs = pd.concat([train_fs1, train_fs2, train_fs3], axis=1)
test_fs = pd.concat([test_fs1, test_fs2, test_fs3], axis=1)

In [11]:
train_fs.columns

Index(['q_dow', 'q_hour', 'a_dow', 'a_hour', 'timediff_a-q', 'q_len', 'a_len',
       'q_count_words', 'a_count_words', 'q_count_unq_words',
       'a_count_unq_words', 'a_dow_min', 'a_dow_max', 'a_dow_max-min',
       'a_dow_mean', 'a_dow_std', 'a_dow_nunique', 'a_hour_min', 'a_hour_max',
       'a_hour_max-min', 'a_hour_mean', 'a_hour_std', 'a_hour_nunique',
       'timediff_a-q_min', 'timediff_a-q_max', 'timediff_a-q_max-min',
       'timediff_a-q_mean', 'timediff_a-q_std', 'timediff_a-q_nunique',
       'a_len_min', 'a_len_max', 'a_len_max-min', 'a_len_mean', 'a_len_std',
       'a_len_nunique', 'a_count_words_min', 'a_count_words_max',
       'a_count_words_max-min', 'a_count_words_mean', 'a_count_words_std',
       'a_count_words_nunique', 'a_count_unq_words_min',
       'a_count_unq_words_max', 'a_count_unq_words_max-min',
       'a_count_unq_words_mean', 'a_count_unq_words_std',
       'a_count_unq_words_nunique', 'qlenchar', 'qlenword', 'alenchar',
       'alenword', 'difflenc

In [12]:
train_fs.fillna(value=0, inplace=True)
test_fs.fillna(value=0, inplace=True)

In [13]:
train_fvec=kd_utils.load_data('../feature_data/train_vec.pkl')
test_fvec=kd_utils.load_data('../feature_data/test_vec.pkl')

In [14]:
from sklearn.preprocessing import MinMaxScaler
mms=MinMaxScaler()
num_feats_value=mms.fit_transform(train_fs)

In [15]:
num_feats_value.shape

(852885, 92)

# load and perform text cleaning

In [16]:
train=pd.read_csv('../input/train.csv', sep='\t')
test=pd.read_csv('../input/test.csv', sep='\t')

train['question_dt']=pd.to_datetime(train['question_utc'], unit='s')
test['question_dt']=pd.to_datetime(test['question_utc'], unit='s')
train['answer_dt']=pd.to_datetime(train['answer_utc'], unit='s')
test['answer_dt']=pd.to_datetime(test['answer_utc'], unit='s')

In [17]:
reload(kd_utils)
with kd_utils.timer('clean question_text'):
    train['question_text']=kd_utils.multi_apply_series(train, feature='question_text', 
                                                          func=kd_utils.preproc_pipeline, n_jobs=4)
    test['question_text']=kd_utils.multi_apply_series(test, feature='question_text', 
                                                          func=kd_utils.preproc_pipeline, n_jobs=4)

[clean question_text] done in 27 s


In [18]:
reload(kd_utils)
with kd_utils.timer('clean answer_text'):
    train['answer_text']=kd_utils.multi_apply_series(train, feature='answer_text', 
                                                          func=kd_utils.preproc_pipeline, n_jobs=4)
    test['answer_text']=kd_utils.multi_apply_series(test, feature='answer_text', 
                                                          func=kd_utils.preproc_pipeline, n_jobs=4)

[clean answer_text] done in 22 s


In [19]:
traintest=train.append(test)

# perform text to sequence

In [20]:
embed_size = 300
max_features = 300000
max_len = 250

In [21]:
with kd_utils.timer("Transforming text data to sequences..."):
    raw_text = np.hstack([traintest['question_text'].str.lower(), traintest['answer_text'].str.lower()])
    print("   Fitting tokenizer...")
    tok_raw = Tokenizer()
    tok_raw.fit_on_texts(raw_text)

    print("   Transforming text to sequences...")
    traintest['seq_qt'] = tok_raw.texts_to_sequences(traintest['question_text'].str.lower())
    traintest['seq_at'] = tok_raw.texts_to_sequences(traintest['answer_text'].str.lower())

   Fitting tokenizer...
   Transforming text to sequences...
[Transforming text data to sequences...] done in 130 s


In [22]:
# kd_utils.pickle_data('../feature_data/tok_raw.pkl',tok_raw)

In [23]:
traintest['seq_qt'].head()

0    [104, 25, 4677, 226, 7905, 678, 4, 162, 383, 33, 6, 92, 22, 162, 694, 35, 2, 288, 6, 61, 1, 991, 9, 911, 120, 77, 19, 1, 69, 155, 4, 60, 47, 455, 288, 116, 7, 7748, 2, 35, 57]
1    [104, 25, 4677, 226, 7905, 678, 4, 162, 383, 33, 6, 92, 22, 162, 694, 35, 2, 288, 6, 61, 1, 991, 9, 911, 120, 77, 19, 1, 69, 155, 4, 60, 47, 455, 288, 116, 7, 7748, 2, 35, 57]
2    [104, 25, 4677, 226, 7905, 678, 4, 162, 383, 33, 6, 92, 22, 162, 694, 35, 2, 288, 6, 61, 1, 991, 9, 911, 120, 77, 19, 1, 69, 155, 4, 60, 47, 455, 288, 116, 7, 7748, 2, 35, 57]
3    [104, 25, 4677, 226, 7905, 678, 4, 162, 383, 33, 6, 92, 22, 162, 694, 35, 2, 288, 6, 61, 1, 991, 9, 911, 120, 77, 19, 1, 69, 155, 4, 60, 47, 455, 288, 116, 7, 7748, 2, 35, 57]
4    [104, 25, 4677, 226, 7905, 678, 4, 162, 383, 33, 6, 92, 22, 162, 694, 35, 2, 288, 6, 61, 1, 991, 9, 911, 120, 77, 19, 1, 69, 155, 4, 60, 47, 455, 288, 116, 7, 7748, 2, 35, 57]
Name: seq_qt, dtype: object

In [24]:
train=traintest[0:len(train)]
test=traintest[len(train):]

# Create embeddding matrix

In [25]:
fasttext_model=kd_utils.load_data('../../common_data/fast_text/fasttext_model.pkl')

In [26]:
word_index = tok_raw.word_index
nb_words = min(max_features, len(word_index))+1
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    if word in fasttext_model.wv.vocab:
        embedding_vector = fasttext_model.wv.word_vec(word)
        if embedding_vector is not None: 
            embedding_matrix[i] = embedding_vector
    else:
        try:
            embedding_matrix[i]=fasttext_model.wv.word_vec(word)
#             pass
        except:
            continue

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 21793


# Prepare data for keras model

In [27]:
nb_words

300001

In [28]:
num_feats_value[trn_id].shape

(682308, 92)

In [29]:
def get_keras_data(df, values=None):
    X = {
        'qt': pad_sequences(df.seq_qt, maxlen=max_len),
        'at': pad_sequences(df.seq_at, maxlen=max_len),
#         'brand_name': np.array(df.brand_name),
#         'category_name': pad_sequences(df.seq_category_name, maxlen=MAX_CATEGORY_SEQ),
#         'item_condition': np.array(df.item_condition_id),
        'num_vars': values
#         'num_vars': np.array(df[['shipping', 'sgd_l2_norm','sgd_l1_norm','sgd_l1_l2_norm']]),
    }
    return X

# train = full_df[:n_trains]
# dev = full_df[n_trains:n_trains+n_devs]
# test = full_df[n_trains+n_devs:]

X_train = get_keras_data(train.loc[trn_id], values=num_feats_value[trn_id])
X_val = get_keras_data(train.loc[val_id], values=num_feats_value[val_id])

# num_feats_value

In [30]:
Y_train=np.log1p(train['answer_score'].values[trn_id])
Y_val=np.log1p(train['answer_score'].values[val_id])
# Y_train=train['answer_score'].values[trn_id]
# Y_val=train['answer_score'].values[val_id]

# Build RNN model

In [34]:
def rmsle_K(y, y0):
    return K.sqrt(K.mean(K.square(tf.log1p(y) - tf.log1p(y0))))

def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_true - y_pred), axis=-1)) 

def build_model(lr=0.001, decay=0.0):    
    
    # Inputs
    inp_qt = Input(shape=[X_train["qt"].shape[1]], name="qt")
    inp_at = Input(shape=[X_train["at"].shape[1]], name="at")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    
    # Embeddings layers
    qt = Embedding(nb_words, embed_size, weights = [embedding_matrix], trainable = False)(inp_qt)
    at = Embedding(nb_words, embed_size, weights = [embedding_matrix], trainable = False)(inp_at)

#     qt = SpatialDropout1D(0.2)(qt)
#     at = SpatialDropout1D(0.2)(at)
#     # rnn layers
#     qt = Bidirectional(CuDNNGRU(128, return_sequences = True))(qt)
#     at = Bidirectional(CuDNNGRU(128, return_sequences = True))(at)
    
    qt_avg=GlobalAveragePooling1D()(qt)
    at_avg=GlobalAveragePooling1D()(at)
    
    qt_mean=GlobalMaxPooling1D()(qt)
    at_mean=GlobalMaxPooling1D()(at)


    # main layers
    main_l = concatenate([qt_avg, qt_mean, at_avg, at_mean, num_vars])
    
    main_l = Dense(64)(main_l)
#     main_l = BatchNormalization()(main_l)
#     main_l = Dropout(0.3)(main_l)
    

    main_l = Dense(32)(main_l)
#     main_l = BatchNormalization()(main_l)
#     main_l = Dropout(0.3)(main_l)
    main_l = Activation('elu')(main_l)    

#     main_l = Dense(32)(main_l)
#     main_l = Activation('elu')(main_l)

    # the output layer.
    output = Dense(1, activation="linear") (main_l)

    model = Model([inp_qt, inp_at, num_vars], output)

    optimizer = Adam(lr=lr, decay=decay, clipvalue=0.5)
#     clipnorm=0.1
#     model.compile(loss=root_mean_squared_error, optimizer = SGD(lr=0.001, momentum=0.9))
    model.compile(loss="mse", optimizer = optimizer)

    return model

model = build_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
qt (InputLayer)                 (None, 250)          0                                            
__________________________________________________________________________________________________
at (InputLayer)                 (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 250, 300)     90000300    qt[0][0]                         
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 250, 300)     90000300    at[0][0]                         
__________________________________________________________________________________________________
global_ave

In [35]:
lr_init = 1e-3
cv_score = 100
pred_valid_best = None
best_model = None
batch_size = 128
early_stopping_count = 0
patience = 10

model = build_model()

for i in range(100):
#     if i > 15:
#         batch_size += 64
    with kd_utils.timer(f'epoch {i + 1}'):
        ks.backend.set_value(model.optimizer.lr, lr_init/(i+1))
        model.fit(x=X_train, y=Y_train, batch_size=batch_size, epochs=1, verbose=1)
        pred_valid=model.predict(X_val, batch_size=1024)
    
        cv_i = np.sqrt(mean_squared_error(Y_val, pred_valid))
#         cv_i = np.sqrt(mean_squared_error(np.log1p(Y_val), np.log1p(pred_valid)))
        if cv_i < cv_score: 
            cv_score = cv_i
            print('best_score', cv_score, '@', f'epoch {i + 1}')
            pred_valid_best = pred_valid.copy()
            model.save_weights('../models/best_weight.k')
        else:
            early_stopping_count += 1
            print('early stoping is {}'.format(early_stopping_count))
            if early_stopping_count >= patience:
                print('early stopping...')
                break

Epoch 1/1
best_score 0.7865330741950396 @ epoch 1
[epoch 1] done in 17 s
Epoch 1/1
best_score 0.7593918829948966 @ epoch 2
[epoch 2] done in 18 s
Epoch 1/1
best_score 0.7452596410825851 @ epoch 3
[epoch 3] done in 21 s
Epoch 1/1
early stoping is 1
[epoch 4] done in 18 s
Epoch 1/1
best_score 0.7327381773415228 @ epoch 5
[epoch 5] done in 17 s
Epoch 1/1
early stoping is 2
[epoch 6] done in 15 s
Epoch 1/1
early stoping is 3
[epoch 7] done in 18 s
Epoch 1/1
best_score 0.7268934842077696 @ epoch 8
[epoch 8] done in 18 s
Epoch 1/1
early stoping is 4
[epoch 9] done in 15 s
Epoch 1/1
best_score 0.7262885136787588 @ epoch 10
[epoch 10] done in 17 s
Epoch 1/1
early stoping is 5
[epoch 11] done in 16 s
Epoch 1/1
early stoping is 6
[epoch 12] done in 19 s
Epoch 1/1
early stoping is 7
[epoch 13] done in 17 s
Epoch 1/1
best_score 0.7234027908115284 @ epoch 14
[epoch 14] done in 20 s
Epoch 1/1

KeyboardInterrupt: 

In [36]:
pred_valid=model.predict(X_val, batch_size=1024)

In [37]:
np.sqrt(mean_squared_error(Y_val, pred_valid))

1.1257393772113127

In [34]:


print ("Sklearn calculated RMSLE: {}".format(sklearn_rmsle))

Sklearn calculated RMSLE: 0.35702296238557074


In [31]:
mean_squared_error(Y_val, pred_valid)

1.268232639898723