In [1]:
import os, io
import numpy as np
import pandas as pd
from PIL import Image
import bson
import time

import tensorflow as tf

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
def make_category_tables():
    """Build forward and reverse lookups between category ids and class indices.

    Reads the module-level ``categories_df``. For every row, the DataFrame
    index value (tuple position 0) is taken as the category id and the value
    at tuple position 4 (the fourth data column) as the class index.

    Returns:
        (cat2idx, idx2cat): two dicts mapping category id -> class index and
        class index -> category id respectively.
    """
    pairs = [(row[0], row[4]) for row in categories_df.itertuples()]
    cat2idx = dict(pairs)
    idx2cat = {category_idx: category_id for category_id, category_idx in pairs}
    return cat2idx, idx2cat

# Category table and the id <-> index lookups derived from it.
categories_df = pd.read_csv("categories.csv", index_col=0)
cat2idx, idx2cat = make_category_tables()

# Per-product byte offsets into the BSON files, and per-image tables for each split.
# All CSVs are read from the current working directory.
train_offsets_df = pd.read_csv("train_offsets.csv", index_col=0)
test_offsets_df = pd.read_csv("test_offsets.csv", index_col=0)

train_images_df = pd.read_csv("train_images.csv", index_col=0)
val_images_df = pd.read_csv("val_images.csv", index_col=0)
test_images_df = pd.read_csv("test_images.csv", index_col=0)

num_train_images, num_val_images, num_test_images = (
    len(train_images_df),
    len(val_images_df),
    len(test_images_df),
)
print('num_train_images = %d, num_val_images = %d, num_test_images = %d' % (num_train_images, num_val_images, num_test_images))

# Location of the raw BSON dumps.
data_dir = "/mnt/data/cdiscount/"
train_bson_path = os.path.join(data_dir, "train.bson")
test_bson_path = os.path.join(data_dir, "test.bson")

# The handles stay open for the rest of the notebook; get_batch() seeks into them.
train_bson_file = open(train_bson_path, "rb")
test_bson_file = open(test_bson_path, "rb")

# Number of images per training step.
batch_size = 512

  mask |= (ar1 == a)


num_train_images = 9900946, num_val_images = 2470347, num_test_images = 3095080


In [80]:
# Spatial size every image is resized to; also fixes the input shape of the network.
image_shape = (128, 128)
# Number of output classes (width of the one-hot labels and of the logits layer).
num_class = 5270

def get_batch(bson_file, images_df, offsets_df, index_array, with_labels):
    """Decode one batch of images (and optionally labels) from a BSON file.

    Uses the module-level ``image_shape`` (rows, cols) and ``num_class``.

    Args:
        bson_file: binary file object opened on the BSON dump; it is seeked
            and read for every requested image.
        images_df: table with one row per image, addressed positionally;
            must provide "product_id" and "img_idx", plus "category_idx"
            when ``with_labels`` is true.
        offsets_df: table indexed by product id providing "offset" and
            "length" of the product record inside ``bson_file``.
        index_array: positional row indices into ``images_df``.
        with_labels: when true, a one-hot label matrix is also returned.

    Returns:
        ``(batch_x, batch_y, batch_id)`` if ``with_labels`` else
        ``(batch_x, batch_id)`` where
        * batch_x is float16 of shape (N, rows, cols, 3) holding the raw,
          unscaled pixel values,
        * batch_y is float16 of shape (N, num_class), one-hot,
        * batch_id is uint32 of shape (N,) holding the product ids.
    """
    num_items = len(index_array)
    rows, cols = image_shape

    batch_x = np.zeros((num_items, rows, cols, 3), dtype=np.float16)
    batch_id = np.zeros(num_items, dtype=np.uint32)
    batch_y = np.zeros((num_items, num_class), dtype=np.float16) if with_labels else None

    for i, j in enumerate(index_array):
        image_row = images_df.iloc[j]
        product_id = image_row["product_id"]
        offset_row = offsets_df.loc[product_id]

        # Read this product's record from the BSON file. Cast to plain int so
        # the file API never sees a pandas/numpy scalar of unexpected dtype.
        bson_file.seek(int(offset_row["offset"]))
        item_data = bson_file.read(int(offset_row["length"]))

        # Grab the requested image from the product.
        item = bson.BSON.decode(item_data)
        bson_img = item["imgs"][int(image_row["img_idx"])]["picture"]

        # Force three channels so grayscale/CMYK pictures still fit batch_x.
        img = Image.open(io.BytesIO(bson_img)).convert("RGB")
        # PIL expects (width, height) whereas the array layout is (rows, cols).
        img = img.resize((cols, rows))

        # No random transform or standardization is applied to the pixels.
        batch_x[i] = np.asarray(img, dtype=np.float16)
        batch_id[i] = product_id
        if with_labels:
            batch_y[i, int(image_row["category_idx"])] = 1

    if with_labels:
        return batch_x, batch_y, batch_id
    return batch_x, batch_id


In [4]:
def model(x, training=True):
    """Build the convolutional classifier and return its logits tensor.

    The network is a stack of 3x3 'same'-padded ReLU convolutions, each
    followed by batch normalization, arranged in four stages of 64, 128, 256
    and 512 filters. The first convolution of every stage uses stride 2.
    The result is flattened and projected to ``num_class`` logits.

    Args:
        x: image batch; reshaped to (-1, image_shape[0], image_shape[1], 3).
        training: forwarded to every ``tf.layers.batch_normalization`` call.
            Defaults to True, which reproduces the previous hard-coded
            behaviour (normalize with the statistics of the current batch).
            Pass False, or a boolean tensor/placeholder, to normalize with the
            moving averages at inference time; in that case the ops in
            ``tf.GraphKeys.UPDATE_OPS`` must be run together with the train op
            so the moving averages are actually maintained.

    Returns:
        Tensor of shape (batch, num_class) with un-normalized class scores.
    """
    # (filters, stride) of each conv + batch-norm pair, in creation order.
    conv_specs = [
        (64, 2), (64, 1),
        (128, 2), (128, 1),
        (256, 2), (256, 1), (256, 1), (256, 1),
        (512, 2), (512, 1), (512, 1), (512, 1),
    ]

    x = tf.reshape(x, [-1, image_shape[0], image_shape[1], 3])
    for filters, stride in conv_specs:
        x = tf.layers.conv2d(x, filters=filters, kernel_size=[3, 3], strides=[stride, stride], padding='same', activation=tf.nn.relu)
        x = tf.layers.batch_normalization(x, training=training)

    x = tf.contrib.layers.flatten(x)
    logits = tf.layers.dense(x, num_class)
    return logits

# Start from an empty default graph so re-running this cell does not add a
# second copy of every layer next to the first one.
tf.reset_default_graph()
# Input images and one-hot labels; the batch dimension is left open.
X = tf.placeholder(tf.float32, [None, image_shape[0], image_shape[1], 3])
y = tf.placeholder(tf.int64, [None, num_class])

logits = model(X)

# NOTE(review): tf.losses.softmax_cross_entropy presumably already returns a
# reduced scalar, which would make the outer reduce_mean a no-op - confirm.
loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(y, logits=logits))

# Despite its name, `predictions` is a per-example boolean "was the arg-max
# class correct"; `acc` is the fraction of correct examples in the batch.
predictions = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
acc = tf.reduce_mean(tf.cast(predictions, tf.float32))

# Predicted class index per example, used when no labels are fed.
test_predictions = tf.argmax(logits, 1)

In [None]:
# Positional row indices into train_images_df, visited in shuffled order.
# No seed is set in the visible cells, so the order differs between runs.
train_index_array_all = np.arange(num_train_images)
np.random.shuffle(train_index_array_all)

# Same for the validation table.
val_index_array_all = np.arange(num_val_images)
np.random.shuffle(val_index_array_all)

# Test images are kept in table order (not shuffled).
test_index_array_all = np.arange(num_test_images)

# Training hyper-parameters read by the training cells below.
learning_rate = 1e-4
num_epochs = 1
val_batch_size = 500

def run_model(sess, train_step, learning_rate, show_every, validate_every,
              max_iters=3600, log_path='20170927log.txt'):
    """Run the training loop, logging progress and periodically validating.

    Relies on module-level state: ``num_epochs``, ``batch_size``,
    ``val_batch_size``, the image counts, the shuffled index arrays, the data
    tables / BSON handle, and the graph nodes ``X``, ``y``, ``loss``, ``acc``.

    Args:
        sess: session in which the graph is executed.
        train_step: op executed once per batch together with ``loss``/``acc``.
        learning_rate: not used inside this function; kept so existing calls
            keep working.
        show_every: print and log loss/accuracy every this many iterations
            (iteration 0 is skipped).
        validate_every: evaluate accuracy on one random validation batch every
            this many iterations (iteration 0 is skipped).
        max_iters: stop an epoch once the iteration index exceeds this value;
            ``None`` disables the cap. Defaults to the previous hard-coded 3600.
        log_path: text file that progress lines are appended to.
    """
    num_train_batches = num_train_images // batch_size + 1
    # Keep at least one slot so a validation set smaller than val_batch_size
    # still yields a (short) batch instead of raising.
    num_val_batches = max(1, num_val_images // val_batch_size)

    for epoch in range(num_epochs):
        print('epoch %d:' % epoch)
        with open(log_path, 'a+') as log_file:
            log_file.write('epoch %d: \n' % epoch)

        # time.clock() no longer exists on Python >= 3.8 and measured CPU time
        # on Unix; perf_counter() gives the wall-clock interval intended here.
        start = time.perf_counter()
        for iter_i in range(num_train_batches):
            if max_iters is not None and iter_i > max_iters:
                break

            train_batch_start_idx = iter_i * batch_size
            train_batch_index_array = train_index_array_all[train_batch_start_idx : train_batch_start_idx + batch_size]
            if len(train_batch_index_array) == 0:
                # Only reachable when num_train_images is an exact multiple of
                # batch_size; an empty batch must not be fed to the graph.
                break

            train_batch_x, train_batch_y, _ = get_batch(train_bson_file, train_images_df, train_offsets_df, train_batch_index_array, with_labels=True)

            _loss, _train_acc, _ = sess.run([loss, acc, train_step], feed_dict={X: train_batch_x, y: train_batch_y})

            if iter_i != 0 and iter_i % show_every == 0:
                print(time.perf_counter() - start)
                message = 'train_iter: %d, loss: %f, acc: %f' % (iter_i, _loss, _train_acc)
                print(message)
                with open(log_path, 'a+') as log_file:
                    log_file.write(message + ' \n')
                start = time.perf_counter()

            if iter_i != 0 and iter_i % validate_every == 0:
                val_start = time.perf_counter()
                # Draw a plain int: a length-1 array is not a valid slice bound.
                val_iter_i = int(np.random.choice(num_val_batches))
                val_batch_start_idx = val_iter_i * val_batch_size
                val_batch_index_array = val_index_array_all[val_batch_start_idx : val_batch_start_idx + val_batch_size]
                # get_batch returns (x, y, ids) when with_labels=True; the ids
                # are not needed for validation.
                val_batch_x, val_batch_y, _ = get_batch(train_bson_file, val_images_df, train_offsets_df, val_batch_index_array, with_labels=True)
                _val_acc = sess.run(acc, feed_dict={X: val_batch_x, y: val_batch_y})
                val_elapsed = time.perf_counter() - val_start
                print('val_elapsed: %f, val acc: %f' % (val_elapsed, _val_acc))

In [None]:
# The session is kept in a module-level name because the test cell reuses it.
sess = tf.Session()

# Build the optimizer op first so its variables exist when they are initialized.
train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
sess.run(tf.global_variables_initializer())

print('training...')
with open('20170927log.txt', 'a+') as log_file:
    log_file.write('training...\n')

# validate_every is far larger than the number of batches in an epoch, so the
# validation branch of run_model is effectively switched off for this run.
run_model(sess, train_step, learning_rate, show_every=10, validate_every=10000000)

training...
epoch 0:


In [86]:
# Number of test images pushed through the network per step.
test_batch_size = 1500
import csv
def test_model(sess, show_every):
    """Predict a category for every test image and write the results to CSV.

    Relies on module-level state: ``test_batch_size``, ``num_test_images``,
    ``test_index_array_all``, the test tables / BSON handle, ``idx2cat`` and
    the graph nodes ``X`` and ``test_predictions``.

    Side effects:
        * appends progress lines to '20170927_test_log.txt',
        * writes 'submission_20170927.csv' with one ``product_id,category_id``
          row per test image.

    NOTE(review): the CSV has no header row and contains one row per image,
    so a product with several images appears several times - confirm this is
    what the downstream consumer expects.

    Args:
        sess: session holding the trained variables.
        show_every: print and log the elapsed time every this many batches.
    """
    submission_list = np.zeros([num_test_images, 2], dtype=np.uint32)

    # time.clock() no longer exists on Python >= 3.8 and measured CPU time on
    # Unix; perf_counter() gives the wall-clock interval intended here.
    test_start = time.perf_counter()
    for test_iter_i in range(num_test_images // test_batch_size + 1):
        test_batch_start_idx = test_iter_i * test_batch_size
        test_batch_end_idx = test_batch_start_idx + test_batch_size
        test_batch_index_array = test_index_array_all[test_batch_start_idx : test_batch_end_idx]
        if len(test_batch_index_array) == 0:
            # Only reachable when num_test_images is an exact multiple of
            # test_batch_size; nothing is left to predict.
            break

        test_batch_x, test_batch_id = get_batch(test_bson_file, test_images_df, test_offsets_df, test_batch_index_array, with_labels=False)
        _test_predictions = sess.run(test_predictions, feed_dict={X: test_batch_x})

        # Slices are clipped to the array length, so a short final batch fits.
        submission_list[test_batch_start_idx : test_batch_end_idx, 0] = test_batch_id
        submission_list[test_batch_start_idx : test_batch_end_idx, 1] = [idx2cat[idx] for idx in _test_predictions]

        if test_iter_i % show_every == 0:
            message = 'test_iter: %d, test elapse: %f' % (test_iter_i, (time.perf_counter() - test_start))
            print(message)
            with open('20170927_test_log.txt', 'a+') as log_file:
                log_file.write(message + ' \n')
            test_start = time.perf_counter()

    # newline='' lets the csv module control line endings itself; without it
    # rows end in '\r\r\n' on platforms that translate '\n'.
    with open("submission_20170927.csv", "w", newline="") as outfile:
        csvwriter = csv.writer(outfile)
        csvwriter.writerows(submission_list.tolist())

In [87]:
# Run inference over the whole test set with the session from the training cell.
print('testing...')
test_model(sess, show_every=1000)
print('done!')

# Mark completion in the test log as well.
with open('20170927_test_log.txt', 'a+') as log_file:
    log_file.write('done!')

testing...
done!
