In [0]:
# Install a Drive FUSE wrapper so Google Drive can be mounted as a local directory.
# https://github.com/astrada/google-drive-ocamlfuse
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# NOTE: with `2>&1 > /dev/null` only stdout is discarded; stderr was duplicated
# onto the original stdout before the redirect, so error messages stay visible.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

# Generate auth tokens for Colab
from google.colab import auth
auth.authenticate_user()

# Generate creds for the Drive FUSE library.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First headless run gets no stdin; only the output line containing "URL" is shown
# (presumably the authorization URL to open in a browser).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it, then pipe it into a second headless run.
# NOTE(review): {vcode} is interpolated into a shell command line as-is; acceptable
# only for an interactive session where the user types their own code.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

# Mount Drive at ./drive; later cells read and write files under drive/tmp/.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [0]:
# List the data directory on the mounted Drive.
ls drive/tmp/data

# Install the NVIDIA driver / OpenCL packages and the build toolchain, then
# pip-install LightGBM with the --gpu install option.
!apt-get -qq install --no-install-recommends nvidia-375
!apt-get -qq install --no-install-recommends nvidia-opencl-icd-375 nvidia-opencl-dev opencl-headers
!apt-get -qq install --no-install-recommends git cmake build-essential libboost-dev libboost-system-dev libboost-filesystem-dev
!pip3 install -qq lightgbm --install-option=--gpu

# Coding Log
---
## Start with Linear Regression 
    - variable - region, parent_category_name, category_name, user_type >> 0.2472 

## Try LightGBM
    - variable - region, parent_category_name, category_name, user_type >> 0.2412 
    
#### The full feature matrix does not fit in memory,
#### so...

## Change Model to NN (feat. Keras)
|  Model        | Variables                            | Val_loss  | LB |
| ------------- |:-------------------------------- | -----:| -----:|
|||||
| Linear Reg | region, pcn, cn, ut  | Unknown | 0.2472 |
| LightGBM | region, pcn, cn, ut  | 0.2351 | 0.2412 |
|||||
| NN      | region | 0.2588 | 0.3032 | 
| NN      | region, pcn      | 0.2475 | 0.2537 |
| NN      | region, pcn, cn    |  0.2430 |  0.2488 |
| NN      | region, pcn, cn, ut    | 0.2407 |  0.2459 | 
| NN      | region, pcn, cn, ut, city   |   0.2402 |  Unknown |
| NN      | region, pcn, cn, ut, city, price  |    0.2370  | Unknown |
| NN      | region, pcn, cn, ut, city, price, p1, p2, p3  |    0.2268  | 0.2320 |
| NN      | region, pcn, cn, ut, city, price, p1, p2, p3, itemseq  |    0.2255  | 0.2304 |

***
***

# Import  basic

In [0]:
import numpy as np
import pandas as pd
import gc
import os
import pickle

# Data Load

In [0]:
# Read the raw train / test tables from the mounted Drive.
df_x_train = pd.read_csv('drive/tmp/data/train.csv')
df_test = pd.read_csv('drive/tmp/data/test.csv')

# Split the target off the training frame: `pop` removes the column from
# df_x_train and returns it as the label Series.
df_y_train = df_x_train.pop('deal_probability')

In [0]:
df_x_train.activation_date.unique()

In [0]:
df_x_train.info()

In [0]:
df_test.info()

In [0]:
# Print, for every training column, how many distinct values it holds.
for col_name, col_values in df_x_train.items():
    print(col_name, len(col_values.unique()))

## Variable Plan

    item_id 1503424              => drop
    user_id 771769               => drop
    region 28                    => (added)categorical embedding
    city 1733                    => (added)categorical embedding
    parent_category_name 9       => (added)categorical embedding
    category_name 47             => (added)categorical embedding
    param_1 372                  => (added)categorical embedding
    param_2 272                  => (added)categorical embedding
    param_3 1220                 => (added)categorical embedding 
    title 788377                 => deal with nlp
    description 1317103          => (added)word embedding, LSTM
    price 17007                  => (added)continuous, log
    item_seq_number 28232        => continuous, log
    activation_date 21           => (added)weekday, categorical
    user_type 3                  => (added)categorical embedding
    image 1390837                => image_feature (VGG, Resnet, inception) ***
    image_top_1 3063             => (added)categorical

# Fill NA

In [0]:
# Missing image_top_1 values get the sentinel id 3067 (presumably chosen to lie
# above every real image_top_1 code -- TODO confirm against the data).
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is a chained in-place operation that no longer updates the
# DataFrame under pandas Copy-on-Write.
df_x_train['image_top_1'] = df_x_train['image_top_1'].fillna(value=3067)
df_test['image_top_1'] = df_test['image_top_1'].fillna(value=3067)

In [0]:
# Replace missing param_1..3 values with the explicit '_NA_' category in both
# frames. The filled column is assigned back instead of using
# fillna(inplace=True) on a column selection, which does not modify the
# DataFrame under pandas Copy-on-Write.
for frame in (df_x_train, df_test):
    for param_col in ('param_1', 'param_2', 'param_3'):
        frame[param_col] = frame[param_col].fillna(value='_NA_')

In [0]:
# Replace missing title / description text with the '_NA_' placeholder in both
# frames. The filled column is assigned back instead of using
# fillna(inplace=True) on a column selection, which does not modify the
# DataFrame under pandas Copy-on-Write.
for frame in (df_x_train, df_test):
    for text_col in ('title', 'description'):
        frame[text_col] = frame[text_col].fillna(value='_NA_')

# Preprocess

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import argparse

#create config init
config = argparse.Namespace()

### Categorical Variables - region, pcn, cn, ut, city, p1, p2, p3, week, imgt1

In [0]:
def tknzr_fit(col, df_trn, df_test):
    """Integer-encode one categorical column with a Keras Tokenizer.

    The tokenizer is fitted on the column of both frames (train first, then
    test) and then used to convert each frame's column to id sequences.

    Args:
        col: name of the column to encode.
        df_trn: training DataFrame.
        df_test: test DataFrame.

    Returns:
        (train_ids, test_ids, tokenizer): the two encoded columns wrapped in
        numpy arrays, and the fitted Tokenizer.
    """
    # No character filtering, no lower-casing, and a split character that is
    # not expected to occur in the data -- intended to keep every cell value
    # as one single token.
    encoder = Tokenizer(filters='', lower=False, split='뷁' )
    for frame in (df_trn, df_test):
        encoder.fit_on_texts(frame[col])

    train_ids = np.array(encoder.texts_to_sequences(df_trn[col]))
    test_ids = np.array(encoder.texts_to_sequences(df_test[col]))
    return train_ids, test_ids, encoder

In [0]:
# Encode every string-valued categorical column. Each call returns the encoded
# train column, the encoded test column and the fitted tokenizer; the
# tokenizers' word_index sizes are used later to size the Embedding layers.
tr_reg, te_reg, tknzr_reg = tknzr_fit('region', df_x_train, df_test)
tr_pcn, te_pcn, tknzr_pcn = tknzr_fit('parent_category_name', df_x_train, df_test)
tr_cn, te_cn, tknzr_cn = tknzr_fit('category_name', df_x_train, df_test)
tr_ut, te_ut, tknzr_ut = tknzr_fit('user_type', df_x_train, df_test)
tr_city, te_city, tknzr_city = tknzr_fit('city', df_x_train, df_test)

# param_1..3 had their missing values replaced by '_NA_' in the Fill NA section.
tr_p1, te_p1, tknzr_p1 = tknzr_fit('param_1', df_x_train, df_test)
tr_p2, te_p2, tknzr_p2 = tknzr_fit('param_2', df_x_train, df_test)
tr_p3, te_p3, tknzr_p3 = tknzr_fit('param_3', df_x_train, df_test)

In [0]:
# Parse activation_date once per frame, then derive two calendar features.
tr_dates = pd.to_datetime(df_x_train['activation_date'])
te_dates = pd.to_datetime(df_test['activation_date'])

# Day of week (0..6) and day of month (1..31), as int32 column vectors (n, 1).
tr_week = tr_dates.dt.weekday.astype(np.int32).values[:, np.newaxis]
te_week = te_dates.dt.weekday.astype(np.int32).values[:, np.newaxis]
tr_day = tr_dates.dt.day.astype(np.int32).values[:, np.newaxis]
te_day = te_dates.dt.day.astype(np.int32).values[:, np.newaxis]

In [0]:
# image_top_1 is used directly as an integer category id (NaNs were filled
# earlier); cast to int32 and reshape to a column vector (n, 1).
tr_imgt1 = df_x_train['image_top_1'].astype(np.int32).values.reshape(-1, 1)
te_imgt1 = df_test['image_top_1'].astype(np.int32).values.reshape(-1, 1)

### Continuous Variables

In [0]:
# Small offset so that a price of 0 does not produce log(0).
eps = 1e-10

# Log-transform the price; rows whose result is NaN (e.g. missing price) get -1.
tr_price = np.log(df_x_train['price'] + eps).fillna(-1.)
te_price = np.log(df_test['price'] + eps).fillna(-1.)

# Column vectors (n, 1) for the model's scalar input.
tr_price = tr_price.values.reshape(-1, 1)
te_price = te_price.values.reshape(-1, 1)

In [0]:
# Log-transform item_seq_number and reshape to column vectors (n, 1).
# No offset or NaN replacement is applied here, unlike for price.
# NOTE(review): assumes item_seq_number is never missing and always >= 1 -- confirm.
tr_itemseq = np.log(df_x_train['item_seq_number']).values.reshape(-1, 1)
te_itemseq = np.log(df_test['item_seq_number']).values.reshape(-1, 1)

## Text Variable

## keras_Tokenizer

In [0]:
config.len_title = 50
config.len_desc = 7500

In [0]:
from keras.preprocessing.sequence import pad_sequences

In [0]:
# Word-level tokenizers for the free-text columns. config.len_title and
# config.len_desc are passed as num_words, i.e. they cap the vocabulary used
# when texts are converted to sequences.
# `lower` is a boolean flag; the string 'True' only worked because any
# non-empty string is truthy, so pass the real boolean.
tknzr_title = Tokenizer(num_words=config.len_title, lower=True)
tknzr_desc = Tokenizer(num_words=config.len_desc, lower=True)

In [0]:
# Fit the text vocabularies on the training texts only (the categorical
# tokenizers in tknzr_fit, by contrast, are fitted on train and test).
tknzr_title.fit_on_texts(df_x_train['title'].values)
tknzr_desc.fit_on_texts(df_x_train['description'].values)

In [0]:
# Convert title / description texts of both frames into lists of word ids.
tr_title_seq, te_title_seq = (
    tknzr_title.texts_to_sequences(frame['title'].values)
    for frame in (df_x_train, df_test)
)

tr_desc_seq, te_desc_seq = (
    tknzr_desc.texts_to_sequences(frame['description'].values)
    for frame in (df_x_train, df_test)
)

In [0]:
config.maxlen= 100

In [0]:
# Pad / truncate every sequence to a fixed length: 15 ids for titles and 75 for
# descriptions. These lengths must match the Input shapes used in get_model().
tr_title_pad, te_title_pad = (
    pad_sequences(seqs, maxlen=15) for seqs in (tr_title_seq, te_title_seq)
)

tr_desc_pad, te_desc_pad = (
    pad_sequences(seqs, maxlen=75) for seqs in (tr_desc_seq, te_desc_seq)
)

In [0]:
gc.collect()

### Configuration

In [0]:
## categorical: number of rows (input_dim) of each Embedding table.
# Tokenizer ids start at 1, so +1 keeps room for index 0.
config.len_reg = len(tknzr_reg.word_index)+1
config.len_pcn = len(tknzr_pcn.word_index)+1
config.len_cn = len(tknzr_cn.word_index)+1
config.len_ut = len(tknzr_ut.word_index)+1
config.len_city = len(tknzr_city.word_index)+1
# dt.weekday yields 0..6
config.len_week = 7
# dt.day yields 1..31 and an Embedding needs input_dim > largest index, so 32
# rows are required; with 31, day 31 was an out-of-range lookup.
# NOTE: this changes the emb_day weight shape, so weights saved with the old
# value of 31 will not load into the new model.
config.len_day = 32
# Largest image_top_1 id in train (after the NaN sentinel fill) + 1.
config.len_imgt1 = int(df_x_train['image_top_1'].max())+1
config.len_p1 = len(tknzr_p1.word_index)+1
config.len_p2 = len(tknzr_p2.word_index)+1
config.len_p3 = len(tknzr_p3.word_index)+1

## continuous: each is a single scalar input
config.len_price = 1
config.len_itemseq = 1

# text: config.len_title / config.len_desc are set earlier and double as the
# text Tokenizers' num_words, so they are not recomputed here.
#config.len_desc = len(tknzr_desc.word_index)

In [0]:
## categorical: output width of each feature's Embedding layer in get_model()
config.emb_reg = 8
config.emb_pcn = 4
config.emb_cn = 8
config.emb_ut = 2
config.emb_city = 16
config.emb_week = 8
config.emb_day = 8
config.emb_imgt1 = 16
config.emb_p1 = 8
config.emb_p2 = 16
config.emb_p3 = 16

# continuous: number of units of the tanh Dense layer applied to each scalar
config.emb_price = 16
config.emb_itemseq = 16

# text: word-embedding width for title and description tokens
config.emb_title = 10
config.emb_desc = 100

In [0]:
# Shuffle the target Series with a fixed seed, draw 10% of the shuffled rows as
# the validation index, and keep every remaining row as the training index.
df_y_train_ = df_y_train.sample(frac=1, random_state=1991)

valid_idx = df_y_train_.sample(frac=0.1, random_state=1991).index
is_valid_row = df_y_train_.index.isin(valid_idx)
train_idx = df_y_train_[~is_valid_row].index

In [0]:
# Load previously saved train / validation row indices from Drive.
# NOTE: this replaces the train_idx / valid_idx computed by the random split in
# the cell above.
train_idx = np.loadtxt('drive/tmp/data/trainIndex.txt', dtype=int)
valid_idx = np.loadtxt('drive/tmp/data/validIndex.txt', dtype=int)

In [0]:
# Load the indices of rows that are in neither the train nor the validation set.
# NOTE(review): restIndex.txt is written by a later cell (np.savetxt of
# ensenble_index), so on a fresh top-to-bottom run this file must already exist.
rest_idx = np.loadtxt('drive/tmp/data/restIndex.txt', dtype=int)

In [0]:
# Rows that belong to neither train_idx nor valid_idx: first drop the training
# rows, then drop the validation rows from what is left.
outside_train = ~df_y_train.index.isin(train_idx)
ensenble = df_y_train[outside_train]
outside_valid = ~ensenble.index.isin(valid_idx)
ensenble_index = ensenble[outside_valid].index

In [0]:
# Sanity check: the three index sets should add up to the full training frame
# ("combine len" is expected to equal "total len").
print("total len = {}".format(len(df_x_train)))

print("valid len = {}".format(len(valid_idx)))
print("train len = {}".format(len(train_idx)))
print("rest len = {}".format(len(ensenble_index)))
print("combine len = {}".format(len(valid_idx) + len(train_idx) + len(ensenble_index)))

# Persist the leftover indices; this is the file an earlier cell reads into rest_idx.
np.savetxt('drive/tmp/data/restIndex.txt', ensenble_index, fmt="%d") 

In [0]:
# Stack the per-feature arrays along a new leading axis (one entry per feature).
# The feature order here must match the order of the first 13 inputs passed to
# Model(...) in get_model(): 11 categorical ids, then price, then item_seq.
# NOTE(review): assumes every array has the same shape so np.array can stack
# them into one regular array -- confirm.
X = np.array([tr_reg, tr_pcn, tr_cn, tr_ut, tr_city, tr_week, tr_day, tr_imgt1, tr_p1, tr_p2, tr_p3,
              tr_price, tr_itemseq])
X_test = np.array([te_reg, te_pcn, te_cn, te_ut, te_city, te_week, te_day, te_imgt1, te_p1, te_p2, te_p3,
                   te_price, te_itemseq])
# Target Series (deal_probability)
Y = df_y_train

In [0]:
# Build the model inputs / targets for the leftover ("rest") rows, then append
# the padded title and description sequences as the last two inputs.
# NOTE(review): X_train and Y_train assigned here are overwritten by the next
# cell, which rebuilds them from train_idx; run only one of the two variants.
X_train = [x[rest_idx] for x in X]
Y_train = Y[rest_idx]

X_train.append(tr_title_pad[rest_idx])
X_train.append(tr_desc_pad[rest_idx])

In [0]:
# Select the train / validation rows from every stacked feature, turning each
# set into a list with one array per model input.
X_train, X_valid = (
    [feature[row_idx] for feature in X] for row_idx in (train_idx, valid_idx)
)
# The test features keep all rows; just convert the stacked array to a list.
X_test = list(X_test)

Y_train, Y_valid = Y[train_idx], Y[valid_idx]

In [0]:
# Add the padded title and description sequences as the last two model inputs
# of each set. Appending mutates the lists, so run this cell only once per
# rebuild of X_train / X_valid / X_test.
X_train.extend([tr_title_pad[train_idx], tr_desc_pad[train_idx]])

X_valid.extend([tr_title_pad[valid_idx], tr_desc_pad[valid_idx]])

X_test.extend([te_title_pad, te_desc_pad])

In [0]:
# Persist the assembled model inputs AND targets so that the "Check Point"
# section can resume from disk without redoing the preprocessing.
checkpoint_files = [
    ("drive/tmp/data/trainTmp.txt", X_train),
    ("drive/tmp/data/testTmp.txt", X_test),
    ("drive/tmp/data/validTmp.txt", X_valid),
    # The Check Point cell also loads these two target files; previously
    # nothing in the notebook wrote them, so a fresh run failed on load.
    ("drive/tmp/data/trainY.txt", Y_train),
    ("drive/tmp/data/validY.txt", Y_valid),
]
for checkpoint_path, payload in checkpoint_files:
    with open(checkpoint_path, "wb") as fp:
        pickle.dump(payload, fp)

# Check Point

In [0]:
import numpy as np
import pandas as pd
import gc
import os
import pickle
import itertools

In [0]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    Only use this on files written by this notebook: unpickling untrusted data
    can execute arbitrary code.
    """
    with open(path, "rb") as fp:
        return pickle.load(fp)


# Restore the preprocessed model inputs and targets from Drive.
X_train = _load_pickle("drive/tmp/data/trainTmp.txt")
X_valid = _load_pickle("drive/tmp/data/validTmp.txt")
X_test = _load_pickle("drive/tmp/data/testTmp.txt")
Y_train = _load_pickle("drive/tmp/data/trainY.txt")
Y_valid = _load_pickle("drive/tmp/data/validY.txt")

In [0]:
gc.collect()

# Keras Model & RMSE Loss

In [0]:
from keras.layers import Input, Embedding, Dense, Dropout, BatchNormalization, Activation ,regularizers
from keras.layers import GlobalMaxPool1D, GlobalAvgPool1D, GlobalMaxPool2D, MaxPool1D, Convolution1D, Flatten
from keras.layers import concatenate
from keras.layers import LSTM, GRU, CuDNNLSTM, CuDNNGRU, Bidirectional
from keras.initializers import Constant
from keras.models import Model

from keras import backend as K

from keras.optimizers import RMSprop, Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

### RMSE loss / metric for Keras
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared difference between predictions and targets.

    Built from Keras backend ops (the mean is taken over all elements), so it
    can be passed to model.compile as a loss or as a metric.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [0]:
config.batch_size = 128

## ANN model

In [0]:
def get_model():
    """Build the (uncompiled) Keras model over categorical, numeric and text inputs.

    Structure:
      * eleven single-id categorical inputs, each with its own Embedding; the
        embeddings are concatenated on the last axis and max-pooled over the
        (length-1) sequence axis;
      * price and item_seq_number scalars, each through a tanh Dense layer;
      * those three parts are concatenated, batch-normalised and passed
        through two 142-unit ELU Dense layers;
      * title (15 ids) and description (75 ids), each through an Embedding and
        a bidirectional CuDNNGRU that returns only its final state;
      * everything is concatenated into one sigmoid output unit.

    Table sizes and layer widths are read from the global `config`. The Keras
    session is cleared first, so each call starts from a fresh graph.

    Returns:
        A Model whose inputs are ordered: region, parent category, category,
        user type, city, week, day, image_top_1, param_1, param_2, param_3,
        price, item_seq, title, description.
    """
    K.clear_session()

    ##### Categorical variables: (name suffix, table size, embedding width) #####
    categorical_specs = [
        ('region', config.len_reg, config.emb_reg),
        ('parent_category_name', config.len_pcn, config.emb_pcn),
        ('category_name', config.len_cn, config.emb_cn),
        ('user_type', config.len_ut, config.emb_ut),
        ('city', config.len_city, config.emb_city),
        ('week', config.len_week, config.emb_week),
        ('day', config.len_day, config.emb_day),
        ('imgt1', config.len_imgt1, config.emb_imgt1),
        ('p1', config.len_p1, config.emb_p1),
        ('p2', config.len_p2, config.emb_p2),
        ('p3', config.len_p3, config.emb_p3),
    ]
    categorical_inputs = []
    categorical_embeddings = []
    for feature, table_size, width in categorical_specs:
        feature_input = Input(shape=(1, ), name='inp_' + feature)
        feature_embedding = Embedding(table_size, width, name='emb_' + feature)(feature_input)
        categorical_inputs.append(feature_input)
        categorical_embeddings.append(feature_embedding)

    categorical = concatenate(categorical_embeddings, axis=-1, name='concat_categorcal_vars')
    # Pooling over the single time step drops the sequence axis.
    categorical = GlobalMaxPool1D()(categorical)

    ##### Continuous variables #####
    price_input = Input(shape=(1, ), name='inp_price')
    price_features = Dense(config.emb_price, activation='tanh', name='emb_price')(price_input)

    itemseq_input = Input(shape=(1, ), name='inp_itemseq')
    itemseq_features = Dense(config.emb_itemseq, activation='tanh', name='emb_itemseq')(itemseq_input)

    tabular = concatenate([categorical, price_features, itemseq_features], axis=-1)
    tabular = BatchNormalization()(tabular)
    for _ in range(2):
        tabular = Dense(142, kernel_initializer='he_normal', activation='elu')(tabular)

    ##### Text #####
    title_input = Input(shape=(15, ), name='inp_title')
    title_embedded = Embedding(config.len_title, config.emb_title, name='emb_title')(title_input)
    title_encoded = Bidirectional(CuDNNGRU(50, return_sequences=False))(title_embedded)

    desc_input = Input(shape=(75, ), name='inp_desc')
    desc_embedded = Embedding(config.len_desc, config.emb_desc, name='emb_desc')(desc_input)
    desc_encoded = Bidirectional(CuDNNGRU(100, return_sequences=False))(desc_embedded)

    ##### Head #####
    merged = concatenate([tabular, title_encoded, desc_encoded], axis=-1)
    prediction = Dense(1, activation='sigmoid', name='output')(merged)

    all_inputs = categorical_inputs + [price_input, itemseq_input, title_input, desc_input]
    return Model(inputs=all_inputs, outputs=prediction)

In [0]:
# Build a fresh model and compile it with Adam, using RMSE as the training loss;
# MSE and RMSE are also reported as metrics.
model = get_model()
model.compile(optimizer=Adam(lr=0.001, decay=0.0001), loss = root_mean_squared_error, metrics=['mse', root_mean_squared_error])
# Alternative optimizer setup, kept for reference:
#model.compile(optimizer=RMSprop(lr=0.0005, decay=0.00001), loss = root_mean_squared_error, metrics=['mse', root_mean_squared_error])
model.summary()

In [0]:
### callbacks
# Save the model to Drive whenever val_loss improves.
# NOTE(review): the "Test & submit" cell loads 'drive/tmp/model/best_bi_nlp.hdf5',
# not the file written here -- confirm which weights are meant to be used.
checkpoint = ModelCheckpoint('drive/tmp/model/best_nn_p1.hdf5', monitor='val_loss', verbose=1, save_best_only=True)
# Stop training after 10 epochs without improvement of the monitored quantity.
early = EarlyStopping(patience=10, mode='min')

In [0]:
model.fit(x=X_train, y=np.array(Y_train), validation_data=(X_valid, Y_valid), batch_size=config.batch_size, epochs=50, callbacks=[checkpoint,early], verbose=1)

## LightGBM

In [0]:
# Train a LightGBM regressor on the same train / validation targets.
# NOTE(review): `lgb`, `train_X` and `valid_X` are not defined anywhere in the
# visible notebook (no `import lightgbm as lgb`; the feature lists are named
# X_train / X_valid). This cell presumably relies on state from an earlier
# version of the notebook -- confirm and define them before running.
d_train = lgb.Dataset(np.array(train_X), label=np.array(Y_train))
d_valid = lgb.Dataset(np.array(valid_X), label=np.array(Y_valid))

# Gradient-boosted trees with an RMSE objective; l1, l2 and rmse are reported
# as evaluation metrics.
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'rmse',
'metric': {'l1', 'l2', 'rmse'},
'is_training_metric': True,
'metric_freq': 5,
'num_leaves': 300,
'max_depth': 10,
'min_data_in_leaf': 20,
'learning_rate': 0.01,
'feature_fraction': 0.5,
'bagging_fraction': 0.75,
'bagging_freq': 1,
'bagging_seed': 3,
'verbose': 1,
}


# Up to 5000 boosting rounds, stopping early after 30 rounds without
# improvement on the validation set.
clf = lgb.train(params, d_train, num_boost_round=5000, valid_sets=d_valid, early_stopping_rounds=30)

In [0]:
def my_custom_loss_func(ground_truth, predictions):
    """Return the root mean squared error between two equally shaped arrays.

    Args:
        ground_truth: array-like of target values supporting element-wise
            arithmetic (e.g. a numpy array).
        predictions: array-like of predicted values, same shape.

    Returns:
        sqrt(mean((ground_truth - predictions) ** 2)) as a scalar.
    """
    residual = ground_truth - predictions
    mean_squared = np.mean(residual * residual)
    return np.sqrt(mean_squared)

# Score the LightGBM model on the validation set with the RMSE helper above.
# NOTE(review): `valid_X` is not defined in the visible notebook -- see the
# LightGBM training cell.
y_pred=clf.predict(valid_X)

# The helper is symmetric in its arguments, so passing (y_pred, truth) is fine.
print("loss: {}".format(my_custom_loss_func(y_pred,np.array(Y_valid))))

# Model Visualization

In [0]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(model).create(prog='dot', format='svg'))

# Test & submit

In [0]:
# Load saved weights into the current model and predict on the test inputs.
# NOTE(review): this file name differs from the one the ModelCheckpoint callback
# writes ('drive/tmp/model/best_nn_p1.hdf5'); the weights must come from a model
# with the same architecture as get_model() -- confirm the intended file.
model.load_weights('drive/tmp/model/best_bi_nlp.hdf5')
pred = model.predict(X_test)

# Fill the sample submission with the predictions and write it to Drive.
# The tag 'nn_p3' and the score 0.2214 in the file name are hard-coded labels,
# not values computed in this notebook.
subm = pd.read_csv("drive/tmp/data/sample_submission.csv")
subm['deal_probability'] = pred
subm.to_csv('drive/tmp/data/submit_{}_{:.4f}.csv'.format('nn_p3', 0.2214), index=False)