In [95]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import os
import boto3
import io
from math import sqrt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from PIL import Image
import dataset_util
from collections import namedtuple, OrderedDict
from skimage.measure import regionprops
%matplotlib inline

In [43]:
# Load the ship segmentation annotations; the first CSV row is the header.
df = pd.read_csv("all/train_ship_segmentations.csv", header=0)
# Keep only the first row for each ImageId so every image appears once.
df_unique = df.drop_duplicates(subset="ImageId")

In [44]:
# Shuffle the unique images and draw a boolean mask that is True for roughly
# 90% of the rows. Both steps use a fixed seed so that "Restart & Run All"
# reproduces the same split instead of a new one on every run.
SPLIT_SEED = 42
df_random = df_unique.sample(frac=1, random_state=SPLIT_SEED)
th = np.random.RandomState(SPLIT_SEED).rand(len(df_random)) < 0.9

In [45]:
# Rows selected by the mask form the pool that is later split into train/test.
train_test = df_random.loc[th]

In [46]:
# Rows not selected by the mask are held out as the validation set.
validation = df_random[~th]
# 80/20 train/test split of the remaining rows; random_state is fixed so the
# split is reproducible across kernel restarts.
train, test = train_test_split(train_test, test_size=0.2, random_state=42)

##### Functions

##### Create tf records 

In [90]:
def rle_decode(mask_rle, shape=(768, 768)):
    '''
    Decode a run-length-encoded mask string into a 2-D binary array.

    mask_rle: whitespace-separated "start length start length ..." string;
        starts are 1-based indices into the flattened mask
    shape: shape the flat mask is reshaped to before being transposed
    Returns a uint8 numpy array, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Even positions are run starts (shifted to 0-based), odd positions lengths.
    begins = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    stops = begins + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(begins, stops):
        flat_mask[begin:stop] = 1

    # Transpose so the 2-D array lines up with the RLE pixel ordering.
    return np.transpose(flat_mask.reshape(shape))


def rle_to_box(mask_rle):
    '''
    Return the bounding box of the first region in a decoded RLE mask.

    mask_rle: run-length string accepted by rle_decode (default shape is used)
    Returns the `bbox` attribute of the first region reported by skimage's
    regionprops, which skimage documents as (min_row, min_col, max_row, max_col).
    '''
    first_region = regionprops(rle_decode(mask_rle))[0]
    return first_region.bbox

def create_tf_example(path, group,
                      image_dir="/Users/yannis/Developpement/kaggle/airbus/all/train/"):
    '''
    Build a tf.train.Example holding one JPEG image and its ship bounding boxes.

    path: image file name, relative to image_dir; also stored as the
        example's filename and source_id
    group: iterable of RLE mask strings for this image; missing values (NaN)
        are skipped, so an image without ships yields an example with no boxes
    image_dir: directory containing the training images (defaults to the
        location originally hard-coded in this notebook)
    Returns the populated tf.train.Example; box coordinates are divided by
    the fixed 768x768 image size.
    '''
    with tf.gfile.GFile(os.path.join(image_dir, path), 'rb') as fid:
        encoded_jpg = fid.read()
    width, height = (768, 768)

    filename = path.encode('utf8')
    image_format = b'jpg'
    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    classes_text = []
    classes = []

    for g in group:
        if pd.isna(g):
            # No mask on this row: the image has no ship annotation here.
            continue
        # regionprops' bbox is (min_row, min_col, max_row, max_col): rows run
        # along the y axis and columns along the x axis, so the column values
        # are the x coordinates and the row values are the y coordinates.
        min_row, min_col, max_row, max_col = rle_to_box(g)
        xmins.append(min_col / width)
        ymins.append(min_row / height)
        xmaxs.append(max_col / width)
        ymaxs.append(max_row / height)
        classes_text.append("ship".encode('utf8'))
        classes.append(1)

    tf_example = tf.train.Example(features=tf.train.Features(feature={
      'image/height': dataset_util.int64_feature(height),
      'image/width': dataset_util.int64_feature(width),
      'image/filename': dataset_util.bytes_feature(filename),
      'image/source_id': dataset_util.bytes_feature(filename),
      'image/encoded': dataset_util.bytes_feature(encoded_jpg),
      'image/format': dataset_util.bytes_feature(image_format),
      'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
      'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
      'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
      'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
      'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
      'image/object/class/label': dataset_util.int64_list_feature(classes),
    }))
    return tf_example


def create_record(dataset, name, df):
    '''
    Write a TFRecord file with one tf.train.Example per row of `dataset`.

    dataset: DataFrame whose first column holds the image id / file name
    name: output path without extension; the file written is name + ".records"
    df: DataFrame with 'ImageId' and 'EncodedPixels' columns, used to collect
        every EncodedPixels entry that belongs to each image
    '''
    # Group the annotations once instead of scanning the whole of `df` with a
    # boolean mask for every image. Row order inside each group is preserved.
    pixels_by_image = {
        image_id: pixels
        for image_id, pixels in df.groupby("ImageId", sort=False)["EncodedPixels"]
    }
    # Empty selection used for ids that do not appear in `df`.
    no_annotations = df["EncodedPixels"].iloc[0:0]

    writer = tf.python_io.TFRecordWriter(name+".records")
    try:
        # The image id is taken positionally from the first column.
        for image_id in dataset.iloc[:, 0]:
            group = pixels_by_image.get(image_id, no_annotations)
            tf_example = create_tf_example(image_id, group)
            writer.write(tf_example.SerializeToString())
    finally:
        # Always close the file, even when building an example fails.
        writer.close()

Create train record

In [93]:
# Writes train.records in the current working directory.
create_record(dataset=train, name="train", df=df)

In [None]:
# Writes test.records in the current working directory.
create_record(dataset=test, name="test", df=df)

### upload data to s3

In [None]:
# bash
# aws s3 cp data/train.records s3://kaggle-yannis/airbus/train.records
# aws s3 cp data/test.records s3://kaggle-yannis/airbus/test.records