In [1]:
import tensorflow as tf

In [2]:
import config
import numpy as np
import pandas as pd
import os
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from tensorflow.contrib.layers import batch_norm

In [3]:
def load_features(data_format='train'):
    """Load every feature CSV of one data split and join them into one frame.

    Walks ``config.FEATURE_DIR/<data_format>`` recursively, reads each file
    whose name ends in ``.csv`` and merges the frames on the split's id
    column (``'id'`` when ``data_format == 'train'``, ``'test_id'`` for any
    other value) using pandas' default merge behaviour.

    Directories and files are visited in sorted order so that the column
    order of the merged frame is deterministic; ``os.walk`` itself gives no
    ordering guarantee.

    Parameters
    ----------
    data_format : str
        Name of the sub-directory of ``config.FEATURE_DIR`` to read.

    Returns
    -------
    pandas.DataFrame or None
        The merged feature frame, or ``None`` when no CSV file was found.
    """
    feature_df = None
    id_field = 'id' if data_format == 'train' else 'test_id'
    feature_root = os.path.join(config.FEATURE_DIR, data_format)
    for root, dirs, filenames in os.walk(feature_root):
        # Sorting `dirs` in place controls the order in which os.walk
        # descends into sub-directories (top-down traversal).
        dirs.sort()
        for filename in sorted(filenames):
            if not filename.endswith('.csv'):
                continue
            df = pd.read_csv(os.path.join(root, filename))
            if feature_df is None:
                feature_df = df
            else:
                feature_df = feature_df.merge(df, on=id_field)
    return feature_df

def load_target():
    """Return the ``id`` and ``is_duplicate`` columns of the training CSV.

    The file read is ``train.csv`` inside ``config.DATA_DIR``.
    """
    train_csv_path = os.path.join(config.DATA_DIR, 'train.csv')
    train_df = pd.read_csv(train_csv_path)
    return train_df[['id', 'is_duplicate']]

def log_loss_dup(y_true, y_pred):
    """Log loss of the second prediction column against the true labels.

    ``y_pred`` must support 2-D indexing; only column 1 is scored, i.e. the
    predicted probability of class 1.
    """
    positive_class_prob = y_pred[:, 1]
    return log_loss(y_true, positive_class_prob)

In [4]:
# Turn the feature frame and the label frame into plain NumPy arrays (the id
# column is dropped from both) and hold out a dev-test split.
# `.values` / `axis=1` replace the deprecated `as_matrix()` / positional axis.
# NOTE(review): features and labels are paired by row position only; this
# assumes both frames list the ids in the same order -- confirm.
train_features = load_features().drop('id', axis=1).values
train_target = load_target().drop('id', axis=1).values[:, 0]
dev_train_features, dev_test_features, dev_train_target, dev_test_target = train_test_split(
    train_features, train_target, test_size=config.TEST_SIZE, random_state=config.RANDOM_SEED
)

In [5]:
# Global TensorFlow session; NeuralNet.fit and NeuralNet.predict run their
# ops through this `sess`, so this cell must be executed before either.
sess = tf.InteractiveSession()

In [6]:
# Smoke test: evaluating a constant through the session should echo 1.
sess.run(tf.constant(1))

1

In [10]:
class NeuralNet:
    """Softmax classifier with two batch-normalised hidden layers.

    The TensorFlow graph is built inside :meth:`fit` and executed with the
    module-level session ``sess``, which must exist before ``fit`` or
    ``predict`` is called.

    Parameters
    ----------
    h : int
        Number of units in each of the two hidden layers.
    epoch : int
        Number of mini-batch gradient steps taken by ``fit`` (one batch per
        step, not one full pass over the data).
    learning_rate : float
        Step size handed to the gradient-descent optimizer.
    batch_size : int
        Number of rows per mini-batch.
    activate_f : str
        ``'relu'`` selects ReLU; any other value selects the sigmoid.
    """

    def __init__(self, h=100, epoch=1000, learning_rate=0.5, batch_size=100, activate_f='relu'):
        self.h = h
        self.epoch = epoch
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.activate_f = activate_f
        # Graph handles; all of them stay None until fit() has run.
        self.tf_X = None
        self.is_training = None
        self.W1 = None
        self.W2 = None
        self.W3 = None
        self.b1 = None
        self.b2 = None
        self.b3 = None
        self.out = None

    def fit(self, X, y):
        """Build the network for ``X``/``y`` and train it with mini-batch SGD.

        Parameters
        ----------
        X : array
            Feature rows; indexed with an integer array to draw batches.
        y : array
            Class id per row; one-hot encoded via ``_binarize_labels``.

        Side effects: reseeds NumPy's global RNG with ``config.RANDOM_SEED``,
        adds ops and variables to the default graph, and prints the loss of
        the current batch roughly ten times during training. Only the
        variables created by this call are initialised, so other models
        that share ``sess`` keep their trained weights.
        """
        y = self._binarize_labels(y)
        size = X.shape[0]
        # Reseeding the global RNG makes the row visiting order reproducible.
        np.random.seed(config.RANDOM_SEED)
        index = np.random.permutation(range(size))
        d = X.shape[1]
        c = y.shape[1]

        # Snapshot of the variables that already exist, so that only the ones
        # created below get initialised.
        existing_var_names = set(v.name for v in tf.global_variables())

        tf_X = tf.placeholder(tf.float32, [None, d])
        tf_y = tf.placeholder(tf.float32, [None, c])
        is_training = tf.placeholder(tf.bool)
        W1 = tf.Variable(tf.random_normal([d, self.h], seed=config.RANDOM_SEED)/d)
        b1 = tf.Variable(tf.zeros(self.h))
        W2 = tf.Variable(tf.random_normal([self.h, self.h], seed=config.RANDOM_SEED)/self.h)
        b2 = tf.Variable(tf.zeros(self.h))
        W3 = tf.Variable(tf.random_normal([self.h, c], seed=config.RANDOM_SEED)/self.h)
        b3 = tf.Variable(tf.zeros(c))

        # hidden layer 1: affine -> batch norm -> activation
        z1 = tf.matmul(tf_X, W1) + b1
        z1 = batch_norm(z1, center=True, scale=True, is_training=is_training, updates_collections=None)
        a1 = self.activation_func(z1)
        # hidden layer 2: affine -> batch norm -> activation
        z2 = tf.matmul(a1, W2) + b2
        z2 = batch_norm(z2, center=True, scale=True, is_training=is_training, updates_collections=None)
        a2 = self.activation_func(z2)
        # output logits
        z3 = tf.matmul(a2, W3) + b3

        cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=z3, labels=tf_y))

        train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(cross_entropy)

        # tf.global_variables_initializer() would also reset every other
        # model living in the same graph; initialise the new variables only.
        new_vars = [v for v in tf.global_variables() if v.name not in existing_var_names]
        sess.run(tf.variables_initializer(new_vars))

        # max(1, ...) keeps the modulo below valid when epoch < 10.
        report_every = max(1, self.epoch // 10)
        start_i = 0
        for i in range(self.epoch):
            start_i, batch_index = self._next_batch(start_i, self.batch_size, index)
            start_time = time.time()
            sess.run(train_step, feed_dict={tf_X: X[batch_index], tf_y: y[batch_index], is_training: True})
            if (i + 1) % report_every == 0:
                # Time covers the training step only, not the loss evaluation.
                elapsed = time.time() - start_time
                batch_loss = sess.run(cross_entropy, feed_dict={
                    tf_X: X[batch_index], tf_y: y[batch_index], is_training: False
                })
                print("Epoch %d: %fs. Loss: %f" % (i + 1, elapsed, batch_loss))

        self.W1 = W1
        self.b1 = b1
        self.W2 = W2
        self.b2 = b2
        self.W3 = W3
        self.b3 = b3
        self.out = tf.nn.softmax(z3)
        self.tf_X = tf_X
        self.is_training = is_training

    def predict(self, X):
        """Return the softmax output for ``X`` (batch norm in inference mode).

        ``fit`` must have been called first; until then ``self.out`` is None.
        """
        return sess.run(self.out, feed_dict={self.tf_X: X, self.is_training: False})

    def activation_func(self, tensor):
        """Apply ReLU when ``activate_f == 'relu'``, otherwise the sigmoid."""
        if self.activate_f == 'relu':
            return tf.nn.relu(tensor)
        # 'sigmoid' and every unrecognised name both map to the sigmoid.
        return tf.sigmoid(tensor)

    def _binarize_labels(self, labels):
        """One-hot encode integer class ids.

        Returns a float array of shape ``(len(labels), max(labels) + 1)``
        with a single 1.0 per row, placed in the column of that row's label.
        """
        n_rows = labels.shape[0]
        binarized_labels = np.zeros([n_rows, int(np.max(labels) + 1)])
        class_ids = np.asarray(labels).astype(int).ravel()
        binarized_labels[np.arange(n_rows), class_ids] = 1.0
        return binarized_labels

    def _next_batch(self, index, batch_size, data):
        """Take ``batch_size`` consecutive items from ``data``, wrapping around.

        Parameters
        ----------
        index : int
            Position of the first item of the batch; must be < ``len(data)``.
        batch_size : int
            Number of items to take; must be <= ``len(data)``.
        data : numpy.ndarray
            Array to draw from (indexed along its first axis).

        Returns
        -------
        (int, numpy.ndarray)
            The start position of the following batch and the batch itself.

        Raises
        ------
        ValueError
            If ``index`` or ``batch_size`` violates the bounds above.
        """
        size = len(data)
        if index >= size:
            raise ValueError('index can not be greater than or equal to data size')
        if batch_size > size:
            raise ValueError('batch size can not be greater than data size')

        # Modular positions give the tail of the data followed by its head
        # whenever the batch runs past the end.
        positions = (index + np.arange(batch_size)) % size
        new_index = (index + batch_size) % size
        return new_index, data[positions]

In [11]:
# Train a ReLU network on the dev-train split; `epoch` counts mini-batch
# steps, so this performs 50000 updates of 100 rows each.
nn = NeuralNet(
    activate_f='relu',
    learning_rate=0.5,
    batch_size=100,
    epoch=50000,
)
nn.fit(X=dev_train_features, y=dev_train_target)

Epoch 5000: 0.003765s. Loss: 0.443941
Epoch 10000: 0.003929s. Loss: 0.564916
Epoch 15000: 0.004132s. Loss: 0.439606
Epoch 20000: 0.004074s. Loss: 0.397635
Epoch 25000: 0.003282s. Loss: 0.484358
Epoch 30000: 0.004387s. Loss: 0.351625
Epoch 35000: 0.003031s. Loss: 0.455024
Epoch 40000: 0.005947s. Loss: 0.463563
Epoch 45000: 0.003098s. Loss: 0.414384
Epoch 50000: 0.004175s. Loss: 0.464590


In [12]:
# Log loss on the held-out dev-test split, scored on column 1 of the
# network's softmax output.
log_loss_dup(dev_test_target, nn.predict(dev_test_features))

0.47780829755934562