##### Copyright 2021 The TensorFlow Authors.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Apache ORC Reader

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/io/tutorials/orc"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/io/blob/master/docs/tutorials/orc.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/io/blob/master/docs/tutorials/orc.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/io/docs/tutorials/orc.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

## Overview

Apache ORC is a popular columnar storage format. The `tensorflow-io` package provides a default implementation for reading Apache ORC files.

## Setup

Install the required packages, and restart the runtime.


In [1]:
!pip install tensorflow-io

Collecting tensorflow-io
[?25l  Downloading https://files.pythonhosted.org/packages/e6/d2/6fd39a3519e325037462721092248b468ccbeeeb5dc870cea072655ee4b0/tensorflow_io-0.18.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1MB)
[K     |████████████████████████████████| 24.1MB 118kB/s 
[?25hCollecting tensorflow-io-gcs-filesystem==0.18.0
[?25l  Downloading https://files.pythonhosted.org/packages/27/37/6cedfcc52f1d53a79a60204fc89d1f7ca099c5d3a999d4640a2fe407e91b/tensorflow_io_gcs_filesystem-0.18.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 36.8MB/s 
Installing collected packages: tensorflow-io-gcs-filesystem, tensorflow-io
Successfully installed tensorflow-io-0.18.0 tensorflow-io-gcs-filesystem-0.18.0


In [2]:
import tensorflow as tf
import tensorflow_io as tfio

## Download a sample ORC file

In [3]:
!curl -OL https://github.com/tensorflow/io/raw/master/tests/test_orc/iris.orc
!ls -l iris.orc

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   144  100   144    0     0   1655      0 --:--:-- --:--:-- --:--:--  1655
100  3328  100  3328    0     0  18592      0 --:--:-- --:--:-- --:--:-- 18592
-rw-r--r-- 1 root root 3328 Jun 25 06:15 iris.orc


## Create a dataset from the file

In [4]:
# Open iris.orc as a dataset, then group its elements into batches of one.
dataset = tfio.IODataset.from_orc("iris.orc", capacity=15)
dataset = dataset.batch(1)

In [9]:
# Preview only the first batch: take(1) ends the iteration after one element.
for sepal_length, sepal_width, petal_length, petal_width, species in dataset.take(1):
  print(*(t.numpy() for t in (sepal_length, sepal_width, petal_length, petal_width, species)))

[5.1] [3.5] [1.4] [0.2] [b'setosa']


## Build a model reading ORC with Keras

In [10]:
feature_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
label_cols = ["species"]

# Read the feature columns and the label column as two separate datasets;
# they are zipped back together further down.
feature_dataset = tfio.IODataset.from_orc("iris.orc", columns=feature_cols)
label_dataset = tfio.IODataset.from_orc("iris.orc", columns=label_cols)


@tf.function
def species_float_conversion(x):
    """Map a species name to a zero-based class index, returned as a float.

    The classifier below emits three logits, so valid class indices are
    0.0, 1.0 and 2.0.

    Args:
        x: the species name; it is compared against string literals.

    Returns:
        0.0 for "virginica", 1.0 for "versicolor", 2.0 for "setosa", and
        -1.0 as a sentinel for any other value.
    """
    if x == "virginica":
        return 0.0
    if x == "versicolor":
        return 1.0
    if x == "setosa":
        return 2.0
    return -1.0


label_dataset = label_dataset.map(species_float_conversion)
dataset = tf.data.Dataset.zip((feature_dataset, label_dataset))
# Drop rows whose species was not recognised (sentinel -1.0). Filtering is done
# after zipping so that features and labels stay aligned.
dataset = dataset.filter(lambda features, label: label >= 0.0)
dataset = dataset.batch(1)


def pack_features_vector(features, labels):
    """Stack the per-column feature tensors into one tensor along axis 1.

    Args:
        features: iterable of feature tensors, one per selected column.
        labels: label tensor, passed through unchanged.

    Returns:
        A `(features, labels)` pair where `features` is the stacked tensor.
    """
    features = tf.stack(list(features), axis=1)
    return features, labels


dataset = dataset.map(pack_features_vector)

model = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(
            10, activation=tf.nn.relu, input_shape=(4,)
        ),  # input shape required
        tf.keras.layers.Dense(10, activation=tf.nn.relu),
        # No activation: the layer outputs one raw logit per species.
        tf.keras.layers.Dense(3),
    ]
)

# This is a three-class problem with class-index labels and raw logit outputs,
# so use sparse categorical cross-entropy with from_logits=True.
# binary_crossentropy expects targets in [0, 1] and probability outputs.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff39a4e5b50>