In [10]:
import numpy as np
import pandas as pd
import io
import bson
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import concurrent.futures
from multiprocessing import cpu_count

In [11]:
# Loader configuration shared by the cells below.
num_cpus = cpu_count()   # size of the thread pool used to decode pictures
im_size = 16             # pictures are resized to im_size x im_size pixels
num_images = 10 ** 6     # maximum number of training pictures to load

In [12]:
def imread(buf):
    """Decode an in-memory encoded image into a 3-channel pixel array.

    Args:
        buf: bytes-like object holding the encoded image.

    Returns:
        The array produced by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.

    Raises:
        ValueError: if OpenCV cannot decode ``buf``.
    """
    # IMREAD_COLOR always produces three channels. IMREAD_ANYCOLOR returns a
    # 2-D array for grayscale pictures, which does not fit the
    # (im_size, im_size, 3) feature slots the loader writes into.
    im = cv2.imdecode(np.frombuffer(buf, np.uint8), cv2.IMREAD_COLOR)
    if im is None:
        # Fail here with a clear message instead of deep inside cv2.resize.
        raise ValueError('could not decode image buffer')
    return im

def img2feat(im):
    """Convert a decoded picture into a small float32 feature array.

    The picture is resized to ``im_size`` x ``im_size`` using area
    interpolation, converted to float32 and divided by 255.
    """
    side = im_size
    resized = cv2.resize(im, (side, side), interpolation=cv2.INTER_AREA)
    as_float = np.float32(resized)
    return as_float / 255

# Storage filled by the loading loop below: `labels` collects one category id
# per picture and `images` holds the matching float32 features
# (one im_size x im_size x 3 slot per picture, left uninitialised until written).
labels = list()
images = np.empty(shape=(num_images, im_size, im_size, 3), dtype=np.float32)

def load_image(pic, target, bar):
    """Decode one encoded picture into features and tick the progress bar.

    Args:
        pic: encoded picture bytes, handed to ``imread``.
        target: value passed through untouched and returned next to the
            features, so results can be matched up after asynchronous loading.
        bar: progress object; ``bar.update()`` is called once per picture.

    Returns:
        Tuple ``(features, target)``.
    """
    features = img2feat(imread(pic))
    bar.update()
    return features, target

def _iter_pictures(bson_file, limit):
    """Yield ``(picture_bytes, category_id)`` pairs from a BSON product stream.

    Every picture of every product is yielded, stopping after ``limit`` pairs.
    """
    if limit <= 0:
        return
    count = 0
    for doc in bson.decode_file_iter(bson_file):
        category = doc['category_id']
        for img in doc['imgs']:
            yield img['picture'], category
            count += 1
            if count >= limit:
                return


bar = tqdm_notebook(total=num_images)
with open('/mnt/data/cdiscount/train.bson', 'rb') as f, \
        concurrent.futures.ThreadPoolExecutor(num_cpus) as executor:

    # Queue one decode job per picture. The generator enforces the
    # num_images cap, so no exception is needed to leave the nested loops
    # (the previous `except IndexError: pass` also hid genuine IndexErrors).
    delayed_load = [
        executor.submit(load_image, picture, target, bar)
        for picture, target in _iter_pictures(f, num_images)
    ]

    # Results arrive in completion order. Features and labels stay aligned
    # because each future returns both.
    for i, future in enumerate(concurrent.futures.as_completed(delayed_load)):
        x, target = future.result()

        images[i] = x
        labels.append(target)

if len(labels) < num_images:
    # The file held fewer pictures than requested: drop the rows of the
    # np.empty buffer that were never written.
    print('warning: loaded only %d of %d requested images' % (len(labels), num_images))
    images = images[:len(labels)]

Widget Javascript not detected.  It may not be installed or enabled properly.





In [13]:
# Sanity check: the feature buffer and the label list should cover the same
# number of pictures.
(images.shape, len(labels))

((1000000, 16, 16, 3), 1000000)

In [14]:
labels = pd.Series(labels)

# Keep the (num_classes - 1) most frequent categories; every other category
# is collapsed into the catch-all label -1, which takes the remaining slot.
num_classes = 1000
valid_targets = set(labels.value_counts().head(num_classes - 1).index.tolist())
valid_labels = labels.isin(valid_targets)

labels.loc[~valid_labels] = -1

# Fraction of pictures whose category was kept, i.e. the accuracy ceiling
# of a classifier restricted to the kept categories.
max_acc = valid_labels.mean()
print(max_acc)

0.885605


In [15]:
# Replace category ids by dense integer codes; `rev_labels[code]` maps a code
# back to the original category id (or -1 for the catch-all class).
labels, rev_labels = pd.factorize(values=labels)

In [16]:
# Quick inspection of the encoded labels. `labels.shape` is not the last
# expression of the cell, so only the print below produces output.
labels.shape
# Spot-check the integer code assigned to one arbitrary picture.
print(labels[600])

7


In [None]:
num_train = 900000
num_val   = 100000

# Fail early with a readable message if the requested split does not fit in
# the loaded data. Slicing would otherwise silently return a shorter split.
assert num_train + num_val <= num_images, \
    'num_train + num_val (%d) exceeds num_images (%d)' % (num_train + num_val, num_images)

# Shuffle all picture indices once, then cut the permutation into two
# disjoint, contiguous pieces: the first num_train for training, the next
# num_val for validation.
indicies = np.arange(num_images)
np.random.shuffle(indicies)
train_mask = indicies[:num_train]
val_mask = indicies[num_train:num_train + num_val]

# Indexing with an index array copies the selected rows.
train_image = images[train_mask]
train_label = labels[train_mask]
val_image = images[val_mask]
val_label = labels[val_mask]

print('train image shape: ', train_image.shape)
print('train label shape: ', train_label.shape)
print('val image shape: ', val_image.shape)
print('val label shape: ', val_label.shape)

train image shape:  (900000, 16, 16, 3)
train label shape:  (900000,)
val image shape:  (100000, 16, 16, 3)
val label shape:  (100000,)


In [None]:
import tensorflow as tf
import numpy as np

# Training hyper-parameters.
# NOTE: `batch_size` and `learning_rate` are rebound by the sweep loops at the
# bottom of this cell before training starts.
num_epochs = 10          # full passes over the training split
val_batch_size = 10000   # pictures per validation batch
batch_size = 128         # pictures per training batch
learning_rate = 0.0001

# Size of the index array `test_indicies` built below.
num_test = 60000  # test_images.shape[0]

def _conv_bn(x, filters, stride, training):
    """Apply a 3x3 'same' ReLU convolution followed by batch normalization."""
    x = tf.layers.conv2d(x, filters=filters, kernel_size=[3, 3], strides=[stride, stride],
                         padding='same', activation=tf.nn.relu)
    return tf.layers.batch_normalization(x, training=training)


def model(x, training=True):
    """Build the convolutional classifier and return its logits.

    The input is reshaped to ``[-1, 16, 16, 3]`` and passed through nine
    conv + batch-norm layers (three with 32 filters, three with 64, three
    with 128; the first layer of the 64 and 128 groups uses stride 2). The
    result is flattened and fed to a dense layer with ``num_classes`` outputs.

    Args:
        x: input tensor; reshaped to ``[-1, 16, 16, 3]``.
        training: forwarded to every ``tf.layers.batch_normalization`` call.
            Defaults to ``True``, which reproduces the previous hard-coded
            behaviour.
            NOTE(review): per the TensorFlow documentation, ``training=True``
            normalizes with the statistics of the current batch, also at
            validation/test time. Using ``training=False`` for inference
            additionally requires running the ``tf.GraphKeys.UPDATE_OPS``
            collection with the train step so the moving averages are updated.

    Returns:
        Logits tensor produced by the final dense layer.
    """
    x = tf.reshape(x, [-1, 16, 16, 3])

    # (filters, stride) for each conv + batch-norm pair, in order.
    layer_specs = [
        (32, 1), (32, 1), (32, 1),
        (64, 2), (64, 1), (64, 1),
        (128, 2), (128, 1), (128, 1),
    ]
    for filters, stride in layer_specs:
        x = _conv_bn(x, filters, stride, training)

    x = tf.contrib.layers.flatten(x)
    logits = tf.layers.dense(x, num_classes)
    return logits

# Start from an empty graph so re-running the cell does not pile up ops.
tf.reset_default_graph()
X = tf.placeholder(dtype=tf.float32, shape=[None, 16, 16, 3])
y = tf.placeholder(dtype=tf.int64, shape=[None])

logits = model(X)

# Softmax cross-entropy between the logits and the one-hot encoded class ids.
loss = tf.reduce_mean(
    tf.losses.softmax_cross_entropy(onehot_labels=tf.one_hot(y, depth=num_classes), logits=logits)
)

# `predictions` is a per-example boolean "arg-max class equals the label";
# its mean is the batch accuracy.
predictions = tf.equal(tf.argmax(logits, axis=1), y)
acc = tf.reduce_mean(tf.cast(predictions, dtype=tf.float32))

# Visiting order of the training examples, shuffled once here.
train_indicies = np.arange(num_train)
np.random.shuffle(train_indicies)

val_indicies = np.arange(num_val)
test_indicies = np.arange(num_test)

# Arg-max class index per example, used when predicting without labels.
test_predictions = tf.argmax(logits, axis=1)

saver = tf.train.Saver()

def run_model(sess, train_step, batch_size, learning_rate, show_every):
    """Train for ``num_epochs`` epochs and validate after each epoch.

    Each epoch walks over ``train_indicies`` in consecutive chunks of
    ``batch_size`` (a trailing partial chunk is skipped), running
    ``train_step`` on every chunk. Afterwards the validation split is
    evaluated in chunks of ``val_batch_size`` and each chunk's accuracy is
    printed.

    Args:
        sess: session used to run the graph.
        train_step: op that performs one optimisation step.
        batch_size: number of training examples per step.
        learning_rate: only used in the log line; the optimiser is expected
            to have been built with it by the caller.
        show_every: print training loss/accuracy every ``show_every`` steps.

    Returns:
        float: the best per-epoch mean validation accuracy, or ``0.0`` when
        no validation batch was evaluated.
    """
    print('training with batch_size: %d, learning_rate: %f'%(batch_size, learning_rate))
    max_val_acc = .0

    for epoch in range(num_epochs):
        print('epoch %d:' % epoch)
        for iter_i in range(num_train // batch_size):
            # iter_i * batch_size is always < num_train here, so no
            # wrap-around is needed.
            start_idx = iter_i * batch_size
            idx = train_indicies[start_idx:start_idx + batch_size]

            _loss, _acc, _ = sess.run([loss, acc, train_step], feed_dict={
                X: train_image[idx, :], y: train_label[idx]
            })
            if iter_i % show_every == 0:
                print('iter: %d, loss: %f, acc: %f' % (iter_i, _loss, _acc))

        epoch_val_accs = []
        for iter_i in range(num_val // val_batch_size):
            start_idx = iter_i * val_batch_size
            idx = val_indicies[start_idx:start_idx + val_batch_size]
            val_acc = sess.run(acc, feed_dict={
                X: val_image[idx, :], y: val_label[idx]
            })
            print('val set acc: %f'% val_acc)
            epoch_val_accs.append(val_acc)

        # All validation batches have the same size, so the mean of the
        # batch accuracies is the accuracy over the evaluated examples.
        # Previously max_val_acc was never updated and 0.0 was returned.
        if epoch_val_accs:
            max_val_acc = max(max_val_acc, float(np.mean(epoch_val_accs)))

    return max_val_acc

def test(sess):
    """Predict a category for each test product and write a submission file.

    Reads the sample submission (indexed by ``_id``), pre-fills every row with
    a fallback category, then decodes the first picture of each product in
    ``test.bson`` on a thread pool. Each decoded picture is classified on its
    own through ``test_predictions``, and the predicted class code is mapped
    back to a category id with ``rev_labels``. A prediction of the catch-all
    class (-1) falls back to the pre-filled category. The result is written
    to ``new_submission.csv.gz``.

    Args:
        sess: session holding the trained model.
    """
    submission = pd.read_csv('/mnt/data/cdiscount/sample_submission.csv', index_col='_id')
    most_frequent_guess = 1000018296
    submission['category_id'] = most_frequent_guess

    # Upper bound on the number of products read from the test file.
    num_images_test = 1768182

    def _decode(picture, product_id):
        """Decode one picture and return ``(features, product_id)``."""
        return img2feat(imread(picture)), product_id

    # Dedicated progress bar, ticked once per finished prediction. The
    # previous code reused the training bar and ticked it twice per product
    # (once in load_image and once after predicting).
    test_bar = tqdm_notebook(total=num_images_test)
    with open('/mnt/data/cdiscount/test.bson', 'rb') as f, \
            concurrent.futures.ThreadPoolExecutor(num_cpus) as executor:

        data = bson.decode_file_iter(f)
        future_load = []

        for i, d in enumerate(data):
            if i >= num_images_test:
                break
            # Only the first picture of each product is used.
            future_load.append(executor.submit(_decode, d['imgs'][0]['picture'], d['_id']))

        for future in concurrent.futures.as_completed(future_load):
            x, _id = future.result()
            x = np.reshape(x, [-1, 16, 16, 3])
            _test_predictions = sess.run(test_predictions, feed_dict={X: x})
            # `test_predictions` already is the arg-max class index per row.
            # With a single picture it has exactly one entry, so take it
            # directly. np.argmax over it would always return 0.
            y_cat = rev_labels[int(_test_predictions[0])]
            if y_cat == -1:
                y_cat = most_frequent_guess

            test_bar.update()
            submission.loc[_id, 'category_id'] = y_cat
    submission.to_csv('new_submission.csv.gz', compression='gzip')

# Hyper-parameter grid: every (batch_size, learning_rate) combination is
# trained in turn. The loop variables rebind the module-level `batch_size`
# and `learning_rate`.
batch_size_arr = [128]
learning_rate_arr = [5e-5]

with tf.Session() as sess:
    for batch_size in batch_size_arr:
        for learning_rate in learning_rate_arr:
            # Build the optimiser op first: it adds its own variables to the
            # graph, which the initializer created below must cover.
            train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
            
            # Re-initialises every variable, so each combination starts from
            # freshly initialised weights.
            sess.run(tf.global_variables_initializer())
            val_acc = run_model(sess, train_step, batch_size, learning_rate, show_every=1000)
            #print('best_val_acc: %f'%val_acc)
    # Drop the reference to the full feature buffer before predicting;
    # train_image / val_image were selected from it with index arrays.
    # NOTE(review): this makes the cell fail on a second run, because
    # `images` no longer exists.
    del images
    # Predict with the weights left by the last trained combination.
    test(sess)
print('done!')

training with batch_size: 128, learning_rate: 0.000050
epoch 0:
iter: 0, loss: 7.553686, acc: 0.000000
1000000/|/100%|| 1000000/1000000 [03:48<00:00, 5356.13it/s]iter: 1000, loss: 4.855631, acc: 0.195312
iter: 2000, loss: 3.744300, acc: 0.351562
iter: 3000, loss: 3.664635, acc: 0.320312
iter: 4000, loss: 3.505937, acc: 0.406250
iter: 5000, loss: 3.295086, acc: 0.421875
iter: 6000, loss: 3.453911, acc: 0.382812
iter: 7000, loss: 3.770164, acc: 0.375000
val set acc: 0.392600
val set acc: 0.387400
val set acc: 0.391000
val set acc: 0.386200
val set acc: 0.390800
val set acc: 0.398000
val set acc: 0.391300
val set acc: 0.389700
val set acc: 0.386000
val set acc: 0.391500
epoch 1:
iter: 0, loss: 2.773903, acc: 0.484375
iter: 1000, loss: 3.831333, acc: 0.328125
iter: 2000, loss: 2.891819, acc: 0.445312
iter: 3000, loss: 3.022703, acc: 0.390625
iter: 4000, loss: 2.954076, acc: 0.453125
iter: 5000, loss: 2.845863, acc: 0.468750
iter: 6000, loss: 3.021114, acc: 0.414062
iter: 7000, loss: 3.3734