In [1]:
# Builds the "Run in Vertex AI Workbench / Colab / GitHub / Download" badge table
# rendered at the top of the notebook.
# `urllib.parse` must be imported explicitly: a bare `import urllib` does not load
# the submodule, so `urllib.parse.urlencode` would only work if some other module
# had already imported it.
import urllib.parse
from IPython.display import Markdown as md

### change to reflect your notebook
_nb_loc = "05_create_dataset/05_split_tfrecord.ipynb"
_nb_title = "Splitting dataset and writing TF Records"

# Badge icons, in the same order as _links below.
_icons = [
    "https://raw.githubusercontent.com/GoogleCloudPlatform/practical-ml-vision-book/master/logo-cloud.png",
    "https://www.tensorflow.org/images/colab_logo_32px.png",
    "https://www.tensorflow.org/images/GitHub-Mark-32px.png",
    "https://www.tensorflow.org/images/download_logo_32px.png",
]
# Targets: Vertex AI Workbench deploy link, Colab, GitHub source view, raw download.
_links = [
    "https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?"
    + urllib.parse.urlencode({
        "name": _nb_title,
        "download_url": "https://github.com/takumiohym/practical-ml-vision-book-ja/raw/master/" + _nb_loc,
    }),
    "https://colab.research.google.com/github/takumiohym/practical-ml-vision-book-ja/blob/master/{0}".format(_nb_loc),
    "https://github.com/takumiohym/practical-ml-vision-book-ja/blob/master/{0}".format(_nb_loc),
    "https://raw.githubusercontent.com/takumiohym/practical-ml-vision-book-ja/master/{0}".format(_nb_loc),
]
# Must stay the last expression of the cell so the Markdown object is displayed.
md("""<table class="tfo-notebook-buttons" align="left"><td><a target="_blank" href="{0}"><img src="{4}"/>Run in Vertex AI Workbench</a></td><td><a target="_blank" href="{1}"><img src="{5}" />Run in Google Colab</a></td><td><a target="_blank" href="{2}"><img src="{6}" />View source on GitHub</a></td><td><a href="{3}"><img src="{7}" />Download notebook</a></td></table><br/><br/>""".format(_links[0], _links[1], _links[2], _links[3], _icons[0], _icons[1], _icons[2], _icons[3]))

<table class="tfo-notebook-buttons" align="left"><td><a target="_blank" href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?name=Splitting+dataset+and+writing+TF+Records&download_url=https%3A%2F%2Fgithub.com%2Ftakumiohym%2Fpractical-ml-vision-book-ja%2Fraw%2Fmaster%2F05_create_dataset%2F05_split_tfrecord.ipynb"><img src="https://raw.githubusercontent.com/GoogleCloudPlatform/practical-ml-vision-book/master/logo-cloud.png"/>Run in Vertex AI Workbench</a></td><td><a target="_blank" href="https://colab.research.google.com/github/takumiohym/practical-ml-vision-book-ja/blob/master/05_create_dataset/05_split_tfrecord.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a></td><td><a target="_blank" href="https://github.com/takumiohym/practical-ml-vision-book-ja/blob/master/05_create_dataset/05_split_tfrecord.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a></td><td><a href="https://raw.githubusercontent.com/takumiohym/practical-ml-vision-book-ja/master/05_create_dataset/05_split_tfrecord.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a></td></table><br/><br/>

# データセットの分割とTFRecordsの書き込み  

このノートブックでは、データセットを学習用、検証用、テスト用に分割し、それらの画像をTensorFlow Recordsファイルに書き込む方法を示しています。  


In [None]:
!pip install -q apache-beam==2.38.0

必要に応じて、以下の`PROJECT`や`BUCKET`を変更してから実行してください。

In [None]:
PROJECT = !gcloud config get-value project
BUCKET=PROJECT[0]

%env PROJECT=$PROJECT
%env BUCKET=$BUCKET

In [1]:
import pandas as pd

# Passing names= (with no header=) makes pandas treat every line of the CSV as data
# and label the two columns explicitly.
ALL_DATA_CSV = 'gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/all_data.csv'
df = pd.read_csv(ALL_DATA_CSV, names=['image', 'label'])
df.head()

Unnamed: 0,image,label
0,gs://practical-ml-vision-book/flowers_5_jpeg/f...,daisy
1,gs://practical-ml-vision-book/flowers_5_jpeg/f...,daisy
2,gs://practical-ml-vision-book/flowers_5_jpeg/f...,daisy
3,gs://practical-ml-vision-book/flowers_5_jpeg/f...,daisy
4,gs://practical-ml-vision-book/flowers_5_jpeg/f...,daisy


In [2]:
import numpy as np

# Draw one uniform random number per row (seeded so the split is repeatable) and
# bucket the rows by threshold: [0, 0.8) train, [0.8, 0.9) valid, [0.9, 1) test.
np.random.seed(10)
rnd = np.random.rand(len(df))
in_train = rnd < 0.8
in_valid = (rnd >= 0.8) & (rnd < 0.9)
in_test = rnd >= 0.9
train = df[in_train]
valid = df[in_valid]
test = df[in_test]
print(len(df), len(train), len(valid), len(test))

3670 2930 359 381


In [3]:
%%bash
# Recreate the local output directory from scratch so reruns do not mix in stale files.
rm -rf output
mkdir output

In [4]:
# Write each split as a CSV with no header row and no index column under output/.
for _split_name, _split_df in (('train', train), ('valid', valid), ('test', test)):
    _split_df.to_csv('output/{}.csv'.format(_split_name), header=False, index=False)

In [5]:
# Peek at the first lines of the test split written above (path,label per line).
!head output/test.csv

gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/10466290366_cc72e33532.jpg,daisy
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/10712722853_5632165b04.jpg,daisy
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/11642632_1e7627a2cc.jpg,daisy
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/13583238844_573df2de8e_m.jpg,daisy
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/1374193928_a52320eafa.jpg,daisy
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/13953307149_f8de6a768c_m.jpg,daisy
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/14471433500_cdaa22e3ea_m.jpg,daisy
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/14523675369_97c31d0b5b.jpg,daisy
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/163978992_8128b49d3e_n.jpg,daisy
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/16401288243_36112bd52f_m.jpg,daisy


## Apache Beamを使用したTFRecordsの書き込み

処理時間を短縮するため、ここでは5件のレコードだけを書き込む例を示します。

In [6]:
# Keep only the first rows of the test split (head() defaults to 5) so the local
# Beam run below stays small.
outdf = test.head()
len(outdf)

5

In [7]:
# The rows as a plain object array of [path, label]; this is what beam.Create receives.
outdf.values

array([['gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/10466290366_cc72e33532.jpg',
        'daisy'],
       ['gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/10712722853_5632165b04.jpg',
        'daisy'],
       ['gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/11642632_1e7627a2cc.jpg',
        'daisy'],
       ['gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/13583238844_573df2de8e_m.jpg',
        'daisy'],
       ['gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/1374193928_a52320eafa.jpg',
        'daisy']], dtype=object)

In [8]:
# Show the label vocabulary file: one class name per line.
!gsutil cat gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/dict.txt

daisy
dandelion
roses
sunflowers
tulips


In [9]:
import tensorflow as tf

# Load the class names (one per line) from GCS, stripping trailing whitespace/newlines.
# The position of a name in LABELS is later used as its integer label.
with tf.io.gfile.GFile(
        'gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/dict.txt', 'r') as f:
    LABELS = []
    for line in f:
        LABELS.append(line.rstrip())
print(f'Read in {len(LABELS)} labels, from {LABELS[0]} to {LABELS[-1]}')

Read in 5 labels, from daisy to tulips


In [10]:
import apache_beam as beam
import tensorflow as tf

def _string_feature(value):
    """Wrap a single str as a bytes_list tf.train.Feature, encoded as UTF-8."""
    encoded = value.encode('utf-8')
    bytes_list = tf.train.BytesList(value=[encoded])
    return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
    """Wrap a sequence of ints as an int64_list tf.train.Feature."""
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)

def _float_feature(value):
    """Wrap a sequence of floats as a float_list tf.train.Feature."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

def read_and_decode(filename):
    """Read a JPEG file and return it as a 3-channel float32 image tensor.

    Args:
        filename: path of the JPEG file, passed to tf.io.read_file.

    Returns:
        The decoded image converted to tf.float32 by tf.image.convert_image_dtype.
    """
    IMG_CHANNELS = 3
    raw_bytes = tf.io.read_file(filename)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=IMG_CHANNELS)
    return tf.image.convert_image_dtype(decoded, tf.float32)

def create_tfrecord(filename, label, label_int):
    """Build a serialized tf.train.Example for one image.

    Args:
        filename: path of the JPEG image; it is printed and then decoded
            with read_and_decode.
        label: class name, stored as the 'label' bytes feature.
        label_int: integer class id, stored as the 'label_int' feature.

    Returns:
        The serialized Example (bytes) holding four features: 'image'
        (the pixel values flattened to 1D), 'shape' (the first three
        dimensions of the decoded image), 'label' and 'label_int'.
    """
    print(filename)
    image = read_and_decode(filename)
    # Capture the shape before flattening so a reader can restore the image.
    image_shape = image.shape
    flat_pixels = tf.reshape(image, [-1])  # flatten to 1D array
    feature = {
        'image': _float_feature(flat_pixels),
        'shape': _int64_feature([image_shape[0], image_shape[1], image_shape[2]]),
        'label': _string_feature(label),
        'label_int': _int64_feature([label_int]),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()

# Local Beam run: turn each [path, label] row of outdf into a serialized Example
# and write the results as TFRecord shards named output/train-*.
with beam.Pipeline() as p:
    rows = p | 'input_df' >> beam.Create(outdf.values)
    examples = rows | 'create_tfrecord' >> beam.Map(
        # row[0] is the image path, row[1] the label name; the integer label
        # is the label's position in LABELS.
        lambda row: create_tfrecord(row[0], row[1], LABELS.index(row[1])))
    written = examples | 'write' >> beam.io.tfrecordio.WriteToTFRecord('output/train')





gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/10466290366_cc72e33532.jpg


2022-07-10 14:14:55.829313: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/10712722853_5632165b04.jpg
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/11642632_1e7627a2cc.jpg
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/13583238844_573df2de8e_m.jpg
gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/daisy/1374193928_a52320eafa.jpg


In [11]:
# List everything matching output/train*: the TFRecord shard(s) written by the
# pipeline as well as the train.csv written earlier.
!ls -l output/train*

-rw-r--r-- 1 jupyter jupyter 8777320 Jul 10 14:24 output/train-00000-of-00001
-rw-r--r-- 1 jupyter jupyter  300932 Jul 10 14:14 output/train.csv


In [12]:
## splitting in Apache Beam
def hardcoded(x, desired_split):
    """Yield the record of a (split, record) pair only if it is in desired_split.

    Written as a generator so it can be used with beam.FlatMap: a matching
    element produces one output, a non-matching element produces none.
    Each call also prints a trace line showing the comparison result.

    Args:
        x: a (split, record) tuple.
        desired_split: the split name to keep.

    Yields:
        The record, when the pair's split equals desired_split.
    """
    record_split, record = x
    is_match = record_split == desired_split
    print('hardcoded: ', record_split, record, desired_split, is_match)
    if not is_match:
        return
    yield record

# Demo: fan one PCollection of (split, record) pairs out into two filtered
# branches. The desired split is written as a literal inside each lambda;
# `split` is only used to give each transform a unique label.
with beam.Pipeline() as p:
    labelled_records = [
        ('train', 'a'),
        ('train', 'b'),
        ('valid', 'c'),
        ('valid', 'd'),
    ]
    splits = p | 'input_df' >> beam.Create(labelled_records)

    split = 'train'
    _ = splits | 'h_only_{}'.format(split) >> beam.FlatMap(
        lambda pair: hardcoded(pair, 'train'))

    split = 'valid'
    _ = splits | 'h_only_{}'.format(split) >> beam.FlatMap(
        lambda pair: hardcoded(pair, 'valid'))



hardcoded:  train a train True
hardcoded:  train a valid False
hardcoded:  train b train True
hardcoded:  train b valid False
hardcoded:  valid c train False
hardcoded:  valid c valid True
hardcoded:  valid d train False
hardcoded:  valid d valid True


## Dataflowで実行  

Apache Beamコードは、Cloud Dataflowを使用してサーバーレスに実行できます。

`./jpeg_to_tfrecord.py` にCloud Dataflow用のコードが用意されています。<br>
beam.Pipeline() がクラウド上での実行用に以下のように書き換えられていることに注目してください。 


```python
options = {
      'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
      'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
      'job_name': JOBNAME,
      'project': PROJECT,
      'teardown_policy': 'TEARDOWN_ALWAYS',
      'save_main_session': True
  }
opts = beam.pipeline.PipelineOptions(flags=[], **options)
with beam.Pipeline(RUNNER, options=opts) as p:
```

In [None]:
%%bash
# Run the jpeg_to_tfrecord module on the full dataset. PROJECT and BUCKET are the
# environment variables exported by the %env lines earlier in this notebook;
# the results are written under gs://${BUCKET}/data/flower_tfrecords.
python -m jpeg_to_tfrecord \
       --all_data gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/all_data.csv \
       --labels_file gs://practical-ml-vision-book/flowers_5_jpeg/flower_photos/dict.txt \
       --project_id $PROJECT \
       --output_dir gs://${BUCKET}/data/flower_tfrecords

<img src="dataflow_pipeline.png" width="75%"/>

In [None]:
%%bash
# List the output objects whose names contain "-00001-" under the flower_tfrecords prefix.
gsutil ls -l gs://${BUCKET}/data/flower_tfrecords/*-00001-*

## License
Copyright 2022 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.