In [1]:
from hyperdash import monitor_cell

In [2]:
# Mainly about the preprocessing steps.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import cPickle as pickle
from six.moves import xrange  # pylint: disable=redefined-builtin
import collections
import os
import sys
import cv2
import tarfile
import numpy as np
from tqdm import tqdm
from absl import flags
import absl.logging as _logging  # pylint: disable=unused-import
import scipy.io
import tensorflow as tf
from randaugment import policies as found_policies
from randaugment import augmentation_transforms

In [3]:
# File name of the CIFAR-10 archive and the URL it is fetched from.
CIFAR_TARNAME = "cifar-10-python.tar.gz"
CIFAR_DOWNLOAD_URL = "https://www.cs.toronto.edu/~kriz/" + CIFAR_TARNAME
# URL template for the SVHN .mat files; the "{}" placeholder must be filled in
# by the caller. Not referenced by any of the visible cells.
SVHN_DOWNLOAD_URL = "http://ufldl.stanford.edu/housenumbers/{}_32x32.mat"

# Local folders for the raw download and for the merged .npy arrays.
DOWNLOAD_DATA_FOLDER = "downloaded_data"
MERGE_DATA_FOLDER = "merged_raw_data"

# NOTE(review): this value is drawn at random but is never passed to
# np.random.seed (or logged) in the visible cells, so the shuffling and
# augmentation below are not reproducible -- confirm whether that is intended.
random_seed = np.random.randint(0, 10000)

In [4]:
def format_sup_filename(split, sup_size=-1):
  """Build the TFRecord file name for a supervised split.

  Args:
    split: one of "train", "dev" or "test".
    sup_size: number of labeled examples kept; -1 means the full split.
      Ignored for the "test" split.

  Returns:
    "test.tfrecord" for the test split, "<split>-full.tfrecord" when
    sup_size is -1, otherwise "<split>-size_<sup_size>.tfrecord".

  Raises:
    ValueError: if `split` is not a known split name. Falling through and
      returning None would only fail later, inside os.path.join.
  """
  if split == "test":
    return "test.tfrecord"
  if split in ("train", "dev"):
    if sup_size == -1:
      return "{}-full.tfrecord".format(split)
    return "{}-size_{:d}.tfrecord".format(split, sup_size)
  raise ValueError("unknown split: {!r}".format(split))

def format_unsup_filename(aug_copy_num):
  """Return the TFRecord file name for augmented copy number `aug_copy_num`."""
  filename_template = "unsup-{:d}.tfrecord"
  return filename_template.format(aug_copy_num)


def _int64_feature(value):
  """Wrap an iterable of integers in a `tf.train.Feature` (int64_list)."""
  int64_list = tf.train.Int64List(value=list(value))
  return tf.train.Feature(int64_list=int64_list)


def _float_feature(value):
  """Wrap an iterable of floats in a `tf.train.Feature` (float_list)."""
  float_list = tf.train.FloatList(value=list(value))
  return tf.train.Feature(float_list=float_list)


def get_raw_data_filenames(split):
    """Return the raw batch file names for `split`.

    "train" maps to data_batch_1 .. data_batch_5 and "test" maps to the single
    file test_batch. Any other split trips the assertion at the end.
    """
    if split == "test":
        return ["test_batch"]
    if split == "train":
        batch_ids = xrange(1, 6)
        return ["data_batch_{:d}".format(batch_id) for batch_id in batch_ids]
    assert False


def read_pickle_from_file(filename):
  """Unpickle and return the object stored in `filename`.

  On Python 3 the pickle is loaded with encoding="bytes"; on Python 2 the
  loader is called without an encoding argument.
  """
  # NOTE(review): unpickling can execute arbitrary code; only point this at
  # files from a trusted source.
  load_kwargs = {"encoding": "bytes"} if sys.version_info >= (3, 0) else {}
  with tf.gfile.Open(filename, "rb") as pickle_file:
    return pickle.load(pickle_file, **load_kwargs)


def obtain_tfrecord_writer(out_path, shard_cnt):
  """Open a TFRecord writer for shard `shard_cnt` of `out_path`.

  The shard file is named "<out_path>.<shard_cnt>".
  """
  shard_path = "{}.{:d}".format(out_path, shard_cnt)
  return tf.python_io.TFRecordWriter(shard_path)


def save_tfrecord(example_list, out_path, max_shard_size=4096):
  """Serialize examples into sharded TFRecord files.

  Shards are named "<out_path>.<n>" with n starting at 0; a new shard is
  started once the current one holds `max_shard_size` records. Shard 0 is
  always created, even when `example_list` is empty.

  Args:
    example_list: iterable of objects exposing SerializeToString().
    out_path: path prefix shared by all shard files.
    max_shard_size: maximum number of records per shard.
  """
  shard_cnt = 0
  shard_size = 0
  # Count records while writing so the input only needs to be iterable,
  # not sized.
  num_written = 0
  record_writer = obtain_tfrecord_writer(out_path, shard_cnt)
  try:
    for example in example_list:
      if shard_size >= max_shard_size:
        record_writer.close()
        shard_cnt += 1
        record_writer = obtain_tfrecord_writer(out_path, shard_cnt)
        shard_size = 0
      record_writer.write(example.SerializeToString())
      shard_size += 1
      num_written += 1
  finally:
    # Close the current shard even if serialization or a write fails, so the
    # file handle is not leaked.
    record_writer.close()
  tf.logging.info("saved {} examples to {}".format(num_written, out_path))


def save_merged_data(images, labels, split, merge_folder):
  """Save `images` and `labels` as .npy files inside `merge_folder`.

  The files are named "<split>_images.npy" and "<split>_labels.npy".
  """
  outputs = (("{}_images.npy", images), ("{}_labels.npy", labels))
  for name_template, array in outputs:
    npy_path = os.path.join(merge_folder, name_template.format(split))
    with tf.gfile.Open(npy_path, "wb") as ouf:
      np.save(ouf, array)

In [5]:
# Single-argument os.path.join returns its argument unchanged; it is kept so
# the folder names are derived exactly as before.
download_folder = os.path.join(DOWNLOAD_DATA_FOLDER)
merge_folder = os.path.join(MERGE_DATA_FOLDER)

# The merged dataset is complete only when all four .npy files are present.
all_exist = True
for split in ["train", "test"]:
    for field in ["images", "labels"]:
        merged_path = os.path.join(merge_folder, "{}_{}.npy".format(split, field))
        if not tf.gfile.Exists(merged_path):
            all_exist = False

if all_exist:
    tf.logging.info("found all merged files")
print('all_exist:' + str(all_exist))

# The folders are (re)created and the download message is logged regardless of
# the value of all_exist.
tf.logging.info("downloading dataset")
tf.gfile.MakeDirs(download_folder)
tf.gfile.MakeDirs(merge_folder)

INFO:tensorflow:found all merged files
all_exist:True
INFO:tensorflow:downloading dataset


In [6]:
# This cell covers the `FLAGS.task_name == 'cifar10'` case: fetch the CIFAR-10
# archive into download_folder and unpack it there.
tf.contrib.learn.datasets.base.maybe_download(
    CIFAR_TARNAME, download_folder, CIFAR_DOWNLOAD_URL)
# Open the archive in a `with` block so its file handle is closed even when
# extraction fails (the bare tarfile.open(...) call never closed it).
# NOTE(review): extractall trusts the member paths stored in the archive; only
# use it with a trusted download source.
with tarfile.open(os.path.join(download_folder, CIFAR_TARNAME), "r:gz") as tar:
    tar.extractall(download_folder)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please write your own downloading logic.


In [7]:
# Merge the raw CIFAR-10 batch files of each split into one images array and
# one labels array, then store them under merge_folder.
for split in ["train", "test"]:
    batches = [
        read_pickle_from_file(
            os.path.join(download_folder, "cifar-10-batches-py", filename))
        for filename in get_raw_data_filenames(split)
    ]
    images = np.concatenate([batch[b"data"] for batch in batches], 0)
    labels = np.concatenate([batch[b"labels"] for batch in batches], 0)
    # Reshape the flat rows to (N, 3, 32, 32), then move the size-3 axis last.
    images = images.reshape([-1, 3, 32, 32]).transpose(0, 2, 3, 1)
    save_merged_data(images, labels, split, merge_folder)

In [8]:
def get_data_by_size_lim(images, labels, sup_size, return_rest=False):
    """Split images/labels into the first `sup_size` examples and the rest.

    Args:
        images: sliceable sequence of images.
        labels: sliceable sequence of labels, aligned with `images`.
        sup_size: number of leading examples to keep.
        return_rest: when True, also return everything after the first
            `sup_size` examples.

    Returns:
        (chosen_images, chosen_labels), or
        (chosen_images, chosen_labels, rest_images, rest_labels) when
        `return_rest` is True.
    """
    chosen_images, rest_images = images[:sup_size], images[sup_size:]
    chosen_labels, rest_labels = labels[:sup_size], labels[sup_size:]
    if return_rest:
        return chosen_images, chosen_labels, rest_images, rest_labels
    # Always hand back the chosen subset: falling through here would return
    # None and break callers that unpack two values.
    return chosen_images, chosen_labels

In [9]:
def process_and_save_sup_data(chosen_images, chosen_labels, split, sup_size=-1):
    """Normalize labeled images and write them out as TFRecord examples.

    Pixel values are divided by 255 and then standardized with the hard-coded
    mean/std triples below. Each example stores the flattened image under
    "image" and the flattened label under "label". The output path is the
    module-level `output_base_dir` joined with
    format_sup_filename(split, sup_size).
    """
    norm_mean = [0.49139968, 0.48215841, 0.44653091]
    norm_std = [0.24703223, 0.24348513, 0.26158784]
    normalized_images = (chosen_images / 255.0 - norm_mean) / norm_std

    def _to_example(image, label):
        # One tf.train.Example per (image, label) pair.
        feature = {
            "image": _float_feature(image.reshape(-1)),
            "label": _int64_feature(label.reshape(-1)),
        }
        return tf.train.Example(features=tf.train.Features(feature=feature))

    example_list = [
        _to_example(image, label)
        for image, label in zip(normalized_images, chosen_labels)
    ]
    out_path = os.path.join(output_base_dir, format_sup_filename(split, sup_size))
    tf.logging.info(">>saving {} {} examples to {}".format(len(example_list), split, out_path))
    save_tfrecord(example_list, out_path)

In [10]:
def proc_and_dump_sup_data(sub_set_data, split, sup_size=-1):
    """Select the labeled subset of a split and write it out as TFRecords.

    Args:
        sub_set_data: dict with "images" and "labels" entries.
        split: split name, forwarded to process_and_save_sup_data.
        sup_size: number of leading examples to keep; -1 keeps all of them.
    """
    images = sub_set_data["images"]
    labels = sub_set_data["labels"]
    if sup_size != -1:
        # Ask for the full 4-tuple and keep only the chosen part; the leftover
        # examples are not needed here.
        chosen_images, chosen_labels = get_data_by_size_lim(
            images, labels, sup_size, return_rest=True)[:2]
    else:
        chosen_images, chosen_labels = images, labels
    # Forward sup_size so the output file name reflects the subset size
    # instead of always being "<split>-full.tfrecord".
    process_and_save_sup_data(chosen_images, chosen_labels, split, sup_size)

In [11]:
# Only the unsupervised data goes through RandAugment.
def proc_and_dump_unsup_data(sub_set_data, aug_copy_num):
    """Write shuffled (original, augmented) image pairs as TFRecord examples.

    The images are shuffled, divided by 255 and standardized with the
    hard-coded mean/std triples below. Every image is then passed through a
    randomly chosen policy and through cutout. The output path is the
    module-level `output_base_dir` joined with
    format_unsup_filename(aug_copy_num).
    """
    images = sub_set_data["images"].copy()
    shuffled_order = np.arange(len(images))
    np.random.shuffle(shuffled_order)
    norm_mean = [0.49139968, 0.48215841, 0.44653091]
    norm_std = [0.24703223, 0.24348513, 0.26158784]
    images = (images[shuffled_order] / 255.0 - norm_mean) / norm_std
    policies = found_policies.randaug_policies()

    def _augment(image):
        policy = policies[np.random.choice(len(policies))]
        # Author's note: at this point the chosen policy holds two elements.
        augmented = augmentation_transforms.apply_policy(policy, image)
        return augmentation_transforms.cutout_numpy(augmented)

    example_list = []
    for image in images:
        aug_image = _augment(image)
        # TODO (author's idea): add a condition here so that a third
        # augmentation is applied when aug_image is below some threshold.
        feature = {
            "ori_image": _float_feature(image.reshape(-1)),
            "aug_image": _float_feature(aug_image.reshape(-1)),
        }
        example_list.append(
            tf.train.Example(features=tf.train.Features(feature=feature)))
    out_path = os.path.join(output_base_dir, format_unsup_filename(aug_copy_num))
    save_tfrecord(example_list, out_path)

In [12]:
def load_dataset():
    """Load the merged train/test arrays from MERGE_DATA_FOLDER.

    Returns:
        dict mapping "train" and "test" to {"images": ..., "labels": ...},
        each value loaded from the corresponding .npy file.
    """
    merge_folder = os.path.join(MERGE_DATA_FOLDER)

    def _load_npy(name_template, split):
        npy_path = os.path.join(merge_folder, name_template.format(split))
        with tf.gfile.Open(npy_path, 'rb') as inf:
            return np.load(inf)

    data = {}
    for split in ["train", "test"]:
        images = _load_npy("{}_images.npy", split)
        labels = _load_npy("{}_labels.npy", split)
        data[split] = {"images": images, "labels": labels}
    return data

In [13]:
# Load the merged train/test arrays into a dict keyed by split name.
data = load_dataset()

In [14]:
# Create the output folder from Python rather than shelling out to `mkdir`:
# this does not fail when the folder already exists (e.g. on a re-run) and
# does not depend on the shell.
tf.gfile.MakeDirs("dataset_by_randaugment_v5")

In [15]:
# Folder that the TFRecord-writing functions in this notebook read as a
# module-level global when building their output paths.
output_base_dir = "dataset_by_randaugment_v5"

In [16]:
def randaug(sub_set_data, aug_copy_num):
    """Augment every image once and save (original, augmented) pairs.

    The images are shuffled, divided by 255 and standardized with the
    hard-coded mean/std triples below; each one is then passed through a
    randomly chosen policy followed by cutout. Progress is printed every
    1000 images. The examples are written to the module-level
    `output_base_dir` joined with format_unsup_filename(aug_copy_num).
    """
    ori_images = sub_set_data["images"].copy()
    order = np.arange(len(ori_images))
    np.random.shuffle(order)
    ori_images = ori_images[order] / 255.0
    mean = [0.49139968, 0.48215841, 0.44653091]
    std = [0.24703223, 0.24348513, 0.26158784]
    ori_images = (ori_images - mean) / std
    aug_policies = found_policies.randaug_policies()
    num_policies = len(aug_policies)

    example_list = []
    for done_count, image in enumerate(ori_images, 1):
        policy = aug_policies[np.random.choice(num_policies)]
        aug_image = augmentation_transforms.cutout_numpy(
            augmentation_transforms.apply_policy(policy, image))
        feature = {
            "ori_image": _float_feature(image.reshape(-1)),
            "aug_image": _float_feature(aug_image.reshape(-1)),
        }
        example_list.append(
            tf.train.Example(features=tf.train.Features(feature=feature)))
        if done_count % 1000 == 0:
            print(str(done_count) + "done!")

    out_path = os.path.join(output_base_dir, format_unsup_filename(aug_copy_num))
    save_tfrecord(example_list, out_path)

In [17]:
# Produce ten augmented copies of the training split; the loop index is passed
# as aug_copy_num and therefore ends up in the output file name.
for i in range(10):
    randaug(data["train"], i)

INFO:tensorflow:trans_list: ['Invert', 'Cutout', 'Sharpness', 'AutoContrast', 'Posterize', 'ShearX', 'TranslateX', 'TranslateY', 'ShearY', 'Rotate', 'Equalize', 'Contrast', 'Color', 'Solarize', 'Brightness']
1000done!
2000done!
3000done!
4000done!
5000done!
6000done!
7000done!
8000done!
9000done!
10000done!
11000done!
12000done!
13000done!
14000done!
15000done!
16000done!
17000done!
18000done!
19000done!
20000done!
21000done!
22000done!
23000done!
24000done!
25000done!
26000done!
27000done!
28000done!
29000done!
30000done!
31000done!
32000done!
33000done!
34000done!
35000done!
36000done!
37000done!
38000done!
39000done!
40000done!
41000done!
42000done!
43000done!
44000done!
45000done!
46000done!
47000done!
48000done!
49000done!
50000done!
INFO:tensorflow:saved 50000 examples to dataset_by_randaugment_v5/unsup-0.tfrecord
INFO:tensorflow:trans_list: ['Invert', 'Cutout', 'Sharpness', 'AutoContrast', 'Posterize', 'ShearX', 'TranslateX', 'TranslateY', 'ShearY', 'Rotate', 'Equalize', 'Contra

47000done!
48000done!
49000done!
50000done!
INFO:tensorflow:saved 50000 examples to dataset_by_randaugment_v5/unsup-9.tfrecord


In [1]:
import numpy as np
from scipy import stats

In [4]:
# Two samples of three paired measurements each.
# NOTE(review): presumably scores of two methods over three matched runs --
# confirm what A and B stand for.
A = np.array([0.7951000, 0.7915000, 0.7878999])
B = np.array([0.7678000, 0.7512000, 0.7695000])

In [5]:
# Related-samples (paired) t-test on A and B; with only three pairs the
# result should be read with caution.
stats.ttest_rel(A, B)

Ttest_relResult(statistic=4.508157842784919, pvalue=0.04584683489561074)