##### Copyright 2018 The TensorFlow Authors.



In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Load CSV with tf.data

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/alpha/tutorials/load_data/text"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/load_data/text.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/load_data/text.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

## Setup

In [0]:
# Install the pinned TensorFlow 2.0 alpha release that the cells below import.
!pip install tensorflow==2.0.0-alpha0

In [0]:
from __future__ import absolute_import, division, print_function

import requests

import tensorflow as tf
import tensorflow_datasets as tfds


In [0]:
# List the contents of the current working directory.
!ls

In [0]:
# Download locations of the training and test CSV files; `main` passes them to
# `tf.keras.utils.get_file`.
TRAIN_DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
TEST_DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

In [0]:
# Names given, in file order, to every column of the input CSV.
CSV_COLUMNS = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income_bracket',
]

In [0]:
# Vocabulary of each categorical column. `preprocess` one-hot encodes a value
# by comparing it against its column's list, so the position of a string in
# the list is the position of its indicator in the encoded output.
CATEGORIES = dict(
    education=[
        'Bachelors', 'Some-college', '11th', 'HS-grad',
        'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th',
        '7th-8th', '12th', 'Masters', '1st-4th',
        '10th', 'Doctorate', '5th-6th', 'Preschool',
    ],
    marital_status=[
        'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated',
        'Widowed', 'Married-spouse-absent', 'Married-AF-spouse',
    ],
    relationship=[
        'Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative',
        'Unmarried',
    ],
    workclass=[
        'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov',
        'Local-gov', 'State-gov', 'Without-pay', 'Never-worked',
    ],
    occupation=[
        'Tech-support', 'Craft-repair', 'Other-service', 'Sales',
        'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners',
        'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing',
        'Transport-moving', 'Priv-house-serv', 'Protective-serv',
        'Armed-Forces',
    ],
    gender=['Male', 'Female'],
    race=[
        'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black',
    ],
)

In [0]:

# Per-column constants used by `preprocess` to scale the continuous features
# (each value is multiplied by 1 / (2 * constant)).
# NOTE(review): presumably the column means of the training data - confirm.
MEANS = dict(
    age=38.64358543876172,
    education_num=10.078088530363212,
    capital_gain=1079.0676262233324,
    capital_loss=87.50231358257237,
    hours_per_week=40.422382375824085,
)

In [0]:
# Class names; `preprocess` one-hot encodes the label against this list, so a
# name's index here is its index in the encoded label.
LABELS = ['<=50K', '>50K']
# Name of the CSV column that `get_dataset` splits off as the label.
LABEL_COLUMN = 'income_bracket'

In [0]:
# Subset of CSV_COLUMNS that is actually read ('fnlwgt' and 'native_country'
# are left out).
USED_COLUMNS = [
    'age', 'workclass', 'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race', 'gender', 'capital_gain',
    'capital_loss', 'hours_per_week', 'income_bracket',
]

# One default per entry of USED_COLUMNS, in the same order: [0] for the
# columns read as integers, [''] for the columns read as strings.
USED_COLUMN_DEFAULTS = [
    [0],   # age
    [''],  # workclass
    [''],  # education
    [0],   # education_num
    [''],  # marital_status
    [''],  # occupation
    [''],  # relationship
    [''],  # race
    [''],  # gender
    [0],   # capital_gain
    [0],   # capital_loss
    [0],   # hours_per_week
    [''],  # income_bracket
]

In [0]:
# Number of rows per batch produced by `get_dataset`.
BATCH_SIZE = 64

# Width of the feature vector built by `preprocess`: 58 one-hot columns (the
# summed vocabulary sizes in CATEGORIES) plus 5 scaled continuous columns.
INPUT_SIZE = 63
# Number of output classes; matches len(LABELS).
CLASS_SIZE = 2

# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
MAX_LEN = 100

In [0]:
def get_dataset(path):
  """Build a batched, preprocessed `tf.data` pipeline from a CSV file.

  Args:
    path: Location of the CSV file to read.

  Returns:
    A dataset of `(features, label)` batches to which `preprocess` has been
    applied.
  """
  csv_options = dict(
      batch_size=BATCH_SIZE,
      column_names=CSV_COLUMNS,
      label_name=LABEL_COLUMN,
      select_columns=USED_COLUMNS,
      column_defaults=USED_COLUMN_DEFAULTS,
      # Treat "?" as a missing value.
      na_value="?",
      # Yield the file once instead of repeating it.
      num_epochs=1,
      # Skip rows that cannot be parsed rather than raising.
      ignore_errors=True,
  )
  raw_batches = tf.data.experimental.make_csv_dataset(path, **csv_options)
  return raw_batches.map(preprocess)

In [0]:
def preprocess(features, label):
  """Turn one raw CSV batch into a dense feature matrix and one-hot labels.

  The incoming `features` mapping is only read, never modified, so calling
  this function again on the same batch gives the same result.

  Args:
    features: Mapping from column name to a batch of raw values. It must hold
      every categorical and continuous column listed below.
    label: Batch of raw label strings.

  Returns:
    A `(features, label)` tuple. `features` is a float32 tensor with one row
    per example: the one-hot encoded categorical columns followed by the
    scaled continuous columns. `label` is a float32 one-hot tensor over
    `LABELS`.
  """
  # The order of these two lists fixes the column order of the output.
  categorical_columns = [
      'education', 'marital_status', 'relationship', 'workclass', 'occupation',
      'gender', 'race'
  ]
  continuous_columns = [
      'age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week'
  ]

  def process_categorical_data(data, categories):
    """One-hot encode a batch of strings against `categories`."""
    # Remove leading ' '.
    data = tf.strings.regex_replace(data, '^ ', '')
    # Remove trailing '.'.
    data = tf.strings.regex_replace(data, r'\.$', '')
    # Comparing a [batch, 1] column with the vocabulary broadcasts to
    # [batch, len(categories)]; a value outside the vocabulary yields zeros.
    return tf.cast(tf.equal(categories, tf.reshape(data, [-1, 1])), tf.float32)

  def process_continuous_data(data, mean):
    """Scale a numeric batch by 1 / (2 * mean) and shape it as [batch, 1]."""
    data = tf.cast(data, tf.float32) / (2 * mean)
    return tf.reshape(data, [-1, 1])

  # Collect the processed columns in a new list instead of writing them back
  # into the caller's `features` mapping.
  columns = []

  # Process categorical features.
  for name in categorical_columns:
    columns.append(process_categorical_data(features[name], CATEGORIES[name]))

  # Process continuous features.
  for name in continuous_columns:
    columns.append(process_continuous_data(features[name], MEANS[name]))

  # Process the label.
  label = process_categorical_data(label, LABELS)

  return tf.concat(columns, 1), label

In [0]:
def get_model(input_dim, labels_dim, hidden_units=(100,), learning_rate=0.01):
  """Create and compile a Keras feed-forward classifier.

  Args:
    input_dim: (int) Number of features in each input example.
    labels_dim: (int) Number of classes, i.e. units in the output layer.
    hidden_units: Sequence of ints giving the size of each hidden layer,
      ordered from the input side to the output side.
    learning_rate: (float) The learning rate for the optimizer.

  Returns:
    A compiled Keras model.
  """

  inputs = tf.keras.Input(shape=(input_dim,))
  x = inputs
  for units in hidden_units:
    x = tf.keras.layers.Dense(units, activation='relu')(x)
  # The labels are one-hot over mutually exclusive classes and the loss is
  # categorical cross-entropy, so the output must be a distribution over the
  # classes: use softmax rather than independent per-class sigmoids.
  outputs = tf.keras.layers.Dense(labels_dim, activation='softmax')(x)

  model = tf.keras.Model(inputs, outputs)
  model.compile(
      loss='categorical_crossentropy',
      optimizer=tf.keras.optimizers.RMSprop(learning_rate),
      metrics=['accuracy'])
  return model

In [0]:
def main():
  """Download the data, then train the model and evaluate it on the test set."""
  # Fetch both CSV files and keep the local paths returned by `get_file`.
  training_csv = tf.keras.utils.get_file(fname="adults.data",
                                         origin=TRAIN_DATA_URL)
  evaluation_csv = tf.keras.utils.get_file(fname="adults.test",
                                           origin=TEST_DATA_URL)

  # Build one input pipeline per split.
  training_batches = get_dataset(training_csv)
  evaluation_batches = get_dataset(evaluation_csv)

  classifier = get_model(input_dim=INPUT_SIZE, labels_dim=CLASS_SIZE)

  # Train for 20 epochs, then evaluate on the held-out file.
  classifier.fit(training_batches, epochs=20)
  classifier.evaluate(evaluation_batches)

In [0]:
# Run the whole workflow: download, train, evaluate.
main()