In [1]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm

In [2]:
# Load the labelled node-pair table (node1_id, node2_id, is_chat).
train = pd.read_csv('./train/train.csv')
# Report column dtypes and the in-memory footprint of the frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70661802 entries, 0 to 70661801
Data columns (total 3 columns):
node1_id    int64
node2_id    int64
is_chat     int64
dtypes: int64(3)
memory usage: 1.6 GB


In [3]:
# Keep only pairs whose two endpoints differ, i.e. discard rows where a node
# is paired with itself. The original row labels are retained (no reindex).
train = train.loc[train['node1_id'].ne(train['node2_id'])]
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69758314 entries, 0 to 70661801
Data columns (total 3 columns):
node1_id    int64
node2_id    int64
is_chat     int64
dtypes: int64(3)
memory usage: 2.1 GB


In [4]:
# Force a garbage-collection pass before the next large CSV is loaded; the
# displayed value is the number of unreachable objects the collector found.
gc.collect()

14

In [5]:
# Load the per-node feature table (node_id plus feature columns f1..f13).
user_features = pd.read_csv('./train/user_features.csv')
# Report column dtypes and the in-memory footprint of the frame.
user_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8264276 entries, 0 to 8264275
Data columns (total 14 columns):
node_id    int64
f1         int64
f2         int64
f3         int64
f4         int64
f5         int64
f6         int64
f7         int64
f8         int64
f9         int64
f10        int64
f11        int64
f12        int64
f13        int64
dtypes: int64(14)
memory usage: 882.7 MB


In [6]:
# Attach the feature row of each endpoint to every pair: the node1 features
# receive the `_x` suffix and the node2 features the `_y` suffix.
base = (
    train
    .merge(user_features, left_on='node1_id', right_on='node_id', how='left')
    .merge(user_features, left_on='node2_id', right_on='node_id', how='left')
)

# Pull the label out, then strip every identifier column and the label so
# that only the 26 feature columns remain in `base`.
y = base['is_chat']
base = base.drop(columns=['node_id_x', 'node_id_y', 'node1_id', 'node2_id', 'is_chat'])
base.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69758314 entries, 0 to 69758313
Data columns (total 26 columns):
f1_x     int64
f2_x     int64
f3_x     int64
f4_x     int64
f5_x     int64
f6_x     int64
f7_x     int64
f8_x     int64
f9_x     int64
f10_x    int64
f11_x    int64
f12_x    int64
f13_x    int64
f1_y     int64
f2_y     int64
f3_y     int64
f4_y     int64
f5_y     int64
f6_y     int64
f7_y     int64
f8_y     int64
f9_y     int64
f10_y    int64
f11_y    int64
f12_y    int64
f13_y    int64
dtypes: int64(26)
memory usage: 14.0 GB


In [7]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the full feature frame, then replace the frame with the
# scaled values returned by the scaler.
sc = RobustScaler().fit(base)
base = sc.transform(base)
base.shape

(69758314, 26)

In [14]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency) and
# fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted scaler to disk so it can be reloaded later.
joblib.dump(sc, 'scaler.pkl')

['scaler.pkl']

In [8]:
# Force a garbage-collection pass before the model is built; the displayed
# value is the number of unreachable objects the collector found.
gc.collect()

7

In [9]:
import tensorflow as tf
from keras import backend as K

def auc(y_true, y_pred):
    """Keras metric wrapper around the TF1 streaming ``tf.metrics.auc``.

    Args:
        y_true: tensor of ground-truth labels supplied by Keras.
        y_pred: tensor of model predictions supplied by Keras.

    Returns:
        The second element of the pair returned by ``tf.metrics.auc``
        (the update op), which Keras evaluates as the metric value.
    """
    _, update_op = tf.metrics.auc(y_true, y_pred)
    # The metric keeps its accumulators in TF local variables; initialise them
    # in the session Keras is using so the op can be evaluated.
    # NOTE(review): nothing visible here resets those accumulators, so the
    # reported value presumably accumulates across batches/epochs - confirm.
    K.get_session().run(tf.local_variables_initializer())
    return update_op

Using TensorFlow backend.


In [10]:
from keras.layers import Dense, subtract, dot, merge, Input, concatenate, Lambda
from keras.models import Model, Sequential

# One 13-feature input per node of the pair.
input_l, input_r = (Input(shape=[13], name=side) for side in ('input_l', 'input_r'))

# Shared encoder: the same stack of four 13-unit ReLU layers is applied to
# both inputs, so both sides are embedded with identical weights.
nn_siamese = Sequential()
for depth in range(1, 5):
    nn_siamese.add(Dense(13, activation='relu', name='siam_{}'.format(depth)))

encoded_l, encoded_r = nn_siamese(input_l), nn_siamese(input_r)

# Pairwise comparison features built from the two embeddings:
# dot product, signed difference and element-wise absolute difference.
encoded_dot = dot([encoded_l, encoded_r], axes=1)
encoded_minus = subtract([encoded_l, encoded_r])
encoded_func = Lambda(lambda pair: K.abs(pair[0] - pair[1]))
encoded_f1 = encoded_func([encoded_l, encoded_r])

encoded_concat = concatenate([encoded_dot, encoded_minus, encoded_f1])

# Classification head: three 30-unit ReLU layers followed by a single
# sigmoid output unit.
dense = encoded_concat
for _ in range(3):
    dense = Dense(30, activation='relu')(dense)
dense = Dense(1, activation='sigmoid')(dense)

model = Model(inputs=[input_l, input_r], outputs=dense)
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_l (InputLayer)            (None, 13)           0                                            
__________________________________________________________________________________________________
input_r (InputLayer)            (None, 13)           0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 13)           728         input_l[0][0]                    
                                                                 input_r[0][0]                    
__________________________________________________________________________________________________
dot_1 (Dot)                     (None, 1)            0           sequential_1[1][0]               
          

In [11]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger, ReduceLROnPlateau

# Write a checkpoint each time the validation loss improves; the epoch number
# and validation loss are embedded in the file name.
mc = ModelCheckpoint('hike_recommender_{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss', 
                     verbose=0, save_best_only=True, mode='auto', period=1)

# Stop after 6 epochs without validation-loss improvement and roll the model
# back to its best weights.
es = EarlyStopping(monitor='val_loss', patience=6, verbose=0, mode='auto', baseline=None, restore_best_weights=True)

# Append per-epoch metrics to training.csv.
log = CSVLogger('training.csv', separator=',', append=True)

# Multiply the learning rate by 0.2 after 5 stagnant epochs (one epoch before
# early stopping would trigger), never going below 1e-4.
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# `rlr` was previously constructed but left out of this list, so the
# learning-rate schedule never took effect. It is placed after the logger so
# the logger's callback runs before it each epoch.
cbs = [mc, es, log, rlr]

In [12]:
# The first 13 scaled columns (node1 features) feed `input_l`; the remaining
# 13 (node2 features) feed `input_r`.
siamese_inputs = [base[:, :13], base[:, 13:]]

training_history = model.fit(
    siamese_inputs,
    y,
    batch_size=256,
    epochs=50,
    validation_split=0.2,
    callbacks=cbs,
    verbose=1,
)

Train on 55806651 samples, validate on 13951663 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
 7996672/55806651 [===>..........................] - ETA: 50:15:45 - loss: 0.1113 - auc: 0.8375

KeyboardInterrupt: 

In [None]:
# Load the test table from the working directory.
# NOTE(review): the training files are read from ./train/ while this path is
# ./test.csv - confirm the location is intended.
test = pd.read_csv('./test.csv')