In [1]:
import os, io
import numpy as np
import pandas as pd
from PIL import Image
import bson
import time

import tensorflow as tf

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
def make_category_tables():
    cat2idx = {}
    idx2cat = {}
    for ir in categories_df.itertuples():
        category_id = ir[0]
        category_idx = ir[4]
        cat2idx[category_id] = category_idx
        idx2cat[category_idx] = category_id
    return cat2idx, idx2cat

categories_df = pd.read_csv("categories.csv", index_col=0)
cat2idx, idx2cat = make_category_tables()

train_offsets_df = pd.read_csv("train_offsets.csv", index_col=0)
train_images_df = pd.read_csv("train_images.csv", index_col=0)
val_images_df = pd.read_csv("val_images.csv", index_col=0)

test_offsets_df = pd.read_csv("test_offsets.csv", index_col=0)
test_images_df = pd.read_csv("test_images.csv", index_col=0)

num_train_images = len(train_images_df)
num_val_images = len(val_images_df)
num_test_images = len(test_images_df)
print('num_train_images = %d, num_val_images = %d, num_test_images = %d' % (num_train_images, num_val_images, num_test_images))

data_dir = "/mnt/data/cdiscount/"

train_bson_path = os.path.join(data_dir, "train.bson")
test_bson_path = os.path.join(data_dir, "test.bson")

train_bson_file = open(train_bson_path, "rb")
test_bson_file = open(test_bson_path, "rb")

batch_size = 128

  mask |= (ar1 == a)


num_train_images = 9900012, num_val_images = 2471281, num_test_images = 3095080


In [3]:
image_shape = (160, 160)
num_class = 5270

def get_batch(bson_file, images_df, offsets_df, index_array, with_labels):
    batch_x = np.zeros((len(index_array),) + image_shape + (3, ), dtype=np.float16)
    batch_id = np.zeros(len(index_array), dtype=np.uint32)
    if with_labels:
        batch_y = np.zeros((len(batch_x), num_class), dtype=np.float16)

    for i, j in enumerate(index_array):
        image_row = images_df.iloc[j]
        product_id = image_row["product_id"]
        offset_row = offsets_df.loc[product_id]

        # Read this product's data from the BSON file.
        bson_file.seek(offset_row["offset"])
        item_data = bson_file.read(offset_row["length"])

        # Grab the image from the product.
        item = bson.BSON.decode(item_data)
        img_idx = image_row["img_idx"]
        bson_img = item["imgs"][img_idx]["picture"]

        # Preprocess the image.
        img = Image.open(io.BytesIO(bson_img))
        img = img.resize(image_shape)
        x = np.asarray(img, dtype=np.float16)
        
        #x = self.image_data_generator.random_transform(x)
        #x = self.image_data_generator.standardize(x)

        # Add the image and the label to the batch (one-hot encoded).
        batch_x[i] = x
        batch_id[i] = product_id
        if with_labels:
            batch_y[i, image_row["category_idx"]] = 1

    if with_labels:
        return batch_x, batch_y, batch_id
    else:
        return batch_x, batch_id


In [4]:
import tensorflow_vgg.vgg19_trainable as vgg19

tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, image_shape[0], image_shape[1], 3])
y = tf.placeholder(tf.int64, [None, num_class])
train_mode = tf.placeholder(tf.bool)

vgg = vgg19.Vgg19('vgg19.npy')
vgg.build(X, train_mode)

loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(y, logits=vgg.prob))

predictions = tf.equal(tf.argmax(vgg.prob, 1), tf.argmax(y, 1))
acc = tf.reduce_mean(tf.cast(predictions, tf.float32))

test_predictions = tf.argmax(vgg.prob, 1)

In [5]:
train_index_array_all = np.arange(num_train_images)
np.random.shuffle(train_index_array_all)

val_index_array_all = np.arange(num_val_images)
np.random.shuffle(val_index_array_all)

test_index_array_all = np.arange(num_test_images)

learning_rate = 1e-4
num_epochs = 1
val_batch_size = 500

islog = True
log_filename='20171006_Train_TensorFlow_VGG19.txt'
def print_and_log(log_line):
    print(log_line)
    if islog:
        log_file = open(log_filename, 'a+')
        log_file.write(str(log_line) + '\n')
        log_file.close()

def run_model(sess, train_step, learning_rate, show_every, validate_every):
    for epoch in range(num_epochs):
        print_and_log('epoch %d:' % epoch)
        start = time.clock()
        for iter_i in range(num_train_images // batch_size + 1):
            train_batch_start_idx = iter_i * batch_size
            train_batch_index_array = train_index_array_all[train_batch_start_idx : train_batch_start_idx + batch_size]
            train_batch_x, train_batch_y, _ = get_batch(train_bson_file, train_images_df, train_offsets_df, train_batch_index_array, with_labels=True)
            
            _loss, _train_acc, _ = sess.run([loss, acc, train_step], feed_dict={X: train_batch_x, y: train_batch_y, train_mode: True})
            
            if iter_i != 0 and iter_i % show_every == 0:
                print_and_log(time.clock() - start)
                print_and_log('train_iter: %d, loss: %f, acc: %f' % (iter_i, _loss, _train_acc))
                start = time.clock()
                
            if iter_i != 0 and iter_i % validate_every == 0:
                val_start = time.clock()
                val_iter_i = np.random.choice(num_val_images // val_batch_size, 1)
                val_batch_start_idx = (val_iter_i * val_batch_size) % num_val_images
                val_batch_index_array = val_index_array_all[val_batch_start_idx : val_batch_start_idx + val_batch_size]
                val_batch_x, val_batch_y = get_batch(train_bson_file, val_images_df, train_offsets_df, val_batch_index_array, with_labels=True)
                _val_acc = sess.run(acc, feed_dict={X: val_batch_x, y: val_batch_y})
                val_elapsed = time.clock() - val_start
                print_and_log('val_elapsed: %f, val acc: %f' % (val_elapsed, _val_acc))

In [6]:
sess = tf.Session()

train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
sess.run(tf.global_variables_initializer())
print_and_log('training...')
run_model(sess, train_step, learning_rate, show_every=10, validate_every=10000000)

training...
epoch 0:
67.260983
train_iter: 10, loss: 8.562300, acc: 0.007812
46.17024099999999
train_iter: 20, loss: 8.570112, acc: 0.000000
46.24098500000002
train_iter: 30, loss: 8.570112, acc: 0.000000


KeyboardInterrupt: 

In [None]:
print('testing...')
#log_file = open('20170927log.txt', 'a+')
#log_file.write('testing...\n')
#log_file.close()
test_model(sess, show_every=1000)
print_and_log('done!')