In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys
import tempfile

import pandas as pd
from six.moves import urllib
import tensorflow as tf

In [2]:
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"
]

In [3]:
gender = tf.feature_column.categorical_column_with_vocabulary_list(
    "gender", ["Female", "Male"])
education = tf.feature_column.categorical_column_with_vocabulary_list(
    "education", [
        "Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th"
    ])
marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    "marital_status", [
        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
    ])
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    "relationship", [
        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
        "Other-relative"
    ])
workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    "workclass", [
        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
    ])

In [4]:
# To show an example of hashing:
occupation = tf.feature_column.categorical_column_with_hash_bucket(
    "occupation", hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket(
    "native_country", hash_bucket_size=1000)

In [5]:
# Continuous base columns.
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

In [6]:
# Transformations.
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

In [7]:
# Wide columns and deep columns.
base_columns = [
    gender, education, marital_status, relationship, workclass, occupation,
    native_country, age_buckets,
]

In [8]:
crossed_columns = [
    tf.feature_column.crossed_column(
        ["education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        ["native_country", "occupation"], hash_bucket_size=1000)
]

In [9]:
deep_columns = [
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(education),
    tf.feature_column.indicator_column(gender),
    tf.feature_column.indicator_column(relationship),
    # To show an example of embedding
    tf.feature_column.embedding_column(native_country, dimension=8),
    tf.feature_column.embedding_column(occupation, dimension=8),
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]

In [10]:
def maybe_download(train_data, test_data):
    if train_data:
        df_train = train_data
    else:
        train_file = tempfile.NamedTemporaryFile(delete=False)
        urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",train_file.name)  # pylint: disable=line-too-long
        df_train = train_file.name
        train_file.close()
        print("Training data is downloaded to %s" % df_train)
    
    if test_data:
        df_test = test_data
    else:
        test_file = tempfile.NamedTemporaryFile(delete=False)
        urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",test_file.name)  # pylint: disable=line-too-long
        df_test = test_file.name
        test_file.close()
        print("Test data is downloaded to %s"% df_test)
    
    return df_train, df_test

In [11]:
def build_estimator(model_dir, model_type):
    if model_type =="wide":
        m = tf.estimator.LinearClassifier(model_dir=model_dir, feature_columns=base_columns + crossed_columns)
    elif model_type == "deep":
        m = tf.estimator.DNNClassifier(model_dir=model_dir,feature_columns=deep_columns,hidden_units=[100, 50])
    else:
        m = tf.estimator.DNNLinearCombinedClassifier(model_dir=model_dir,linear_feature_columns=crossed_columns,dnn_feature_columns=deep_columns,dnn_hidden_units=[100, 50])
    return m

In [12]:
def input_fn(data_file, num_epochs, shuffle):
    df_data = pd.read_csv(
        tf.gfile.Open(data_file),
        names=CSV_COLUMNS,
        skipinitialspace=True,
        engine="python",
        skiprows=1)

    df_data = df_data.dropna(how="any", axis=0)
    labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
    return tf.estimator.inputs.pandas_input_fn(
        x=df_data,
        y=labels,
        batch_size=100,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5)

In [13]:
def train_and_eval(model_dir, model_type, train_steps, train_data, test_data):
    df_train, df_test = maybe_download(train_data, test_data)
    model_dir = tempfile.mkdtemp() if not model_dir else model_dir
    
    m = build_estimator(model_dir, model_type)
    # set num_epochs to None to get infinite stream of data.
    m.train(input_fn=input_fn(df_train, num_epochs=200, shuffle=True),steps=20000)
    # set steps to None to run evaluation until all data consumed.
    results = m.evaluate(input_fn=input_fn(df_test, num_epochs=1, shuffle=False),steps=None)
    print("model directory = %s" % model_dir)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))

In [14]:
FLAGS = None

In [15]:
def main(_):
    train_and_eval(FLAGS.model_dir, FLAGS.model_type, FLAGS.train_steps,FLAGS.train_data, FLAGS.test_data)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument("--model_dir",type=str,default="",help="Base directory for output models.")
    parser.add_argument("--model_type",type=str,default="wide_n_deep",help="Valid model types: {'wide', 'deep', 'wide_n_deep'}.")
    parser.add_argument("--train_steps",type=int,default=2000,help="Number of training steps.")
    parser.add_argument("--train_data",type=str,default="",help="Path to the training data.")
    parser.add_argument("--test_data",type=str,default="",help="Path to the test data.")
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

Training data is downloaded to C:\Users\MAYANK\AppData\Local\Temp\tmpmosz6liw
Test data is downloaded to C:\Users\MAYANK\AppData\Local\Temp\tmpdgdp8g29
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\MAYANK\\AppData\\Local\\Temp\\tmpamei1ixf', '_keep_checkpoint_max': 5, '_session_config': None, '_save_checkpoints_secs': 600, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_save_summary_steps': 100, '_tf_random_seed': 1, '_save_checkpoints_steps': None}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\MAYANK\AppData\Local\Temp\tmpamei1ixf\model.ckpt.
INFO:tensorflow:step = 1, loss = 739.915
INFO:tensorflow:global_step/sec: 71.4245
INFO:tensorflow:step = 101, loss = 40.0536 (1.416 sec)
INFO:tensorflow:global_step/sec: 83.2212
INFO:tensorflow:step = 201, loss = 53.9441 (1.202 sec)
INFO:tensorflow:global_step/sec: 69.0634
INFO:tensorflow:step = 301, loss = 61.2738 (1.432

INFO:tensorflow:step = 7301, loss = 40.2734 (1.281 sec)
INFO:tensorflow:global_step/sec: 74.7861
INFO:tensorflow:step = 7401, loss = 36.0897 (1.337 sec)
INFO:tensorflow:global_step/sec: 79.807
INFO:tensorflow:step = 7501, loss = 28.0104 (1.253 sec)
INFO:tensorflow:global_step/sec: 79.015
INFO:tensorflow:step = 7601, loss = 36.9204 (1.270 sec)
INFO:tensorflow:global_step/sec: 75.1863
INFO:tensorflow:step = 7701, loss = 38.7753 (1.326 sec)
INFO:tensorflow:global_step/sec: 79.7015
INFO:tensorflow:step = 7801, loss = 37.7627 (1.270 sec)
INFO:tensorflow:global_step/sec: 72.2494
INFO:tensorflow:step = 7901, loss = 34.5781 (1.384 sec)
INFO:tensorflow:global_step/sec: 74.1358
INFO:tensorflow:step = 8001, loss = 48.3619 (1.333 sec)
INFO:tensorflow:global_step/sec: 76.9375
INFO:tensorflow:step = 8101, loss = 28.1469 (1.315 sec)
INFO:tensorflow:global_step/sec: 77.8095
INFO:tensorflow:step = 8201, loss = 28.5196 (1.285 sec)
INFO:tensorflow:global_step/sec: 77.7825
INFO:tensorflow:step = 8301, los

INFO:tensorflow:global_step/sec: 76.3676
INFO:tensorflow:step = 15801, loss = 32.0513 (1.313 sec)
INFO:tensorflow:global_step/sec: 79.2153
INFO:tensorflow:step = 15901, loss = 33.9352 (1.258 sec)
INFO:tensorflow:global_step/sec: 80.0424
INFO:tensorflow:step = 16001, loss = 36.834 (1.266 sec)
INFO:tensorflow:global_step/sec: 72.9835
INFO:tensorflow:step = 16101, loss = 33.4633 (1.354 sec)
INFO:tensorflow:global_step/sec: 78.7801
INFO:tensorflow:step = 16201, loss = 31.1866 (1.269 sec)
INFO:tensorflow:global_step/sec: 77.8862
INFO:tensorflow:step = 16301, loss = 33.8091 (1.287 sec)
INFO:tensorflow:global_step/sec: 78.1657
INFO:tensorflow:step = 16401, loss = 35.6615 (1.276 sec)
INFO:tensorflow:global_step/sec: 76.878
INFO:tensorflow:step = 16501, loss = 38.3246 (1.319 sec)
INFO:tensorflow:global_step/sec: 77.7856
INFO:tensorflow:step = 16601, loss = 32.8363 (1.267 sec)
INFO:tensorflow:global_step/sec: 71.3985
INFO:tensorflow:step = 16701, loss = 27.3613 (1.401 sec)
INFO:tensorflow:global

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
