This follows the tensorflow Linear Model tutorial
https://www.tensorflow.org/tutorials/wide/

In [6]:
# download census income dataset 
# https://archive.ics.uci.edu/ml/datasets/Census+Income
import tempfile
import urllib.request
train_file = tempfile.NamedTemporaryFile()
test_file = tempfile.NamedTemporaryFile()
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)

('/var/folders/9c/90t50vz162ncl1c4_3pc2w9w0000gn/T/tmpqd2wzfp0',
 <http.client.HTTPMessage at 0x10985ee80>)

In [8]:
# read into pandas dataframes
import pandas as pd
COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
           "marital_status", "occupation", "relationship", "race", "gender",
           "capital_gain", "capital_loss", "hours_per_week", "native_country",
           "income_bracket"]
df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True, skiprows=1)

In [12]:
# binary classification problem
# 'label' column is 1 of income is >$50k and 0 otherwise
LABEL_COLUMN = "label"
df_train[LABEL_COLUMN] = (df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
df_test[LABEL_COLUMN] = (df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)

In [13]:
# categorical = discrete values
# continuous = numerical continuous values
CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation",
                       "relationship", "race", "gender", "native_country"]
CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week"]

In [35]:
# converting the data into tensors
import tensorflow as tf

def input_fn(df):
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
                              indices=[[i, 0] for i in range(df[k].size)],
                              values=df[k].values,
                              shape=[df[k].size, 1])
                          for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    #feature_cols = dict(continuous_cols.items() + categorical_cols.items())

    feature_cols = {**continuous_cols, **categorical_cols}
    
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label

def train_input_fn():
    return input_fn(df_train)

def eval_input_fn():
    return input_fn(df_test)

In [29]:
# selecting and engineering features

# categorical columns
## known values
gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["Female", "Male"])

## unknown values
education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
marital_status = tf.contrib.layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100)
race = tf.contrib.layers.sparse_column_with_hash_bucket(column_name="race", hash_bucket_size=100)
workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
native_country = tf.contrib.layers.sparse_column_with_hash_bucket("native_country", hash_bucket_size=1000)

# continuous columns
age = tf.contrib.layers.real_valued_column("age")
education_num = tf.contrib.layers.real_valued_column("education_num")
capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")



In [30]:
# make continuous feature categorical through bucketization
# without this step, we can only learn [positive, negative, none] correlations between income and age
# making age categorical allows us to cope with things like retirement and salary growth in early career
age_buckets = tf.contrib.layers.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
# 10 boundaries = 11 age group buckets

In [31]:
# crossed columns
# to capture every single education-occupation combination 
# (e.g. distinguishing between education="Bachelors" AND occupation="Exec-managerial" 
#                          and education="Bachelors" AND occupation="Craft-repair")
education_x_occupation = tf.contrib.layers.crossed_column([education, occupation], hash_bucket_size=int(1e4))



In [32]:
# crossed column over more than 2 features
age_buckets_x_education_x_occupation = tf.contrib.layers.crossed_column([age_buckets, education, occupation], hash_bucket_size=int(1e6))



In [33]:
# build logistic regression model
model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.LinearClassifier(feature_columns=[
  gender, native_country, education, occupation, workclass, marital_status, race,
  age_buckets, education_x_occupation, age_buckets_x_education_x_occupation],
  model_dir=model_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_num_ps_replicas': 0, '_evaluation_master': '', '_task_id': 0, 'save_checkpoints_steps': None, 'keep_checkpoint_max': 5, 'save_checkpoints_secs': 600, '_environment': 'local', 'tf_random_seed': None, 'save_summary_steps': 100, '_task_type': None, 'keep_checkpoint_every_n_hours': 10000, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x114731748>, '_master': '', 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
}


In [36]:
# train the model
m.fit(input_fn=train_input_fn, steps=200)

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
The default behavior of sparse_feature_cross is changing, the

<tensorflow.contrib.learn.python.learn.estimators.linear.LinearClassifier at 0x1147315f8>

In [38]:
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print ("%s: %s" % (key, results[key]))

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
The default behavior of sparse_feature_cross is changing, the

INFO:tensorflow:Saving evaluation summary for step 200: accuracy = 0.834654, accuracy/baseline_label_mean = 0.236226, accuracy/threshold_0.500000_mean = 0.834654, auc = 0.879197, labels/actual_label_mean = 0.236226, labels/prediction_mean = 0.241362, loss = 0.358873, precision/positive_threshold_0.500000_mean = 0.710892, recall/positive_threshold_0.500000_mean = 0.50572
accuracy: 0.834654
accuracy/baseline_label_mean: 0.236226
accuracy/threshold_0.500000_mean: 0.834654
auc: 0.879197
global_step: 200
labels/actual_label_mean: 0.236226
labels/prediction_mean: 0.241362
loss: 0.358873
precision/positive_threshold_0.500000_mean: 0.710892
recall/positive_threshold_0.500000_mean: 0.50572

In [41]:
# Adding Regularization to Prevent Overfitting

m2 = tf.contrib.learn.LinearClassifier(feature_columns=[
  gender, native_country, education, occupation, workclass, marital_status, race,
  age_buckets, education_x_occupation, age_buckets_x_education_x_occupation],
  optimizer=tf.train.FtrlOptimizer(
    learning_rate=0.1,
    l1_regularization_strength=2.0,
    l2_regularization_strength=1.0),
  model_dir=model_dir)
m2.fit(input_fn=train_input_fn, steps=200)
results = m2.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print ("%s: %s" % (key, results[key]))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_num_ps_replicas': 0, '_evaluation_master': '', '_task_id': 0, 'save_checkpoints_steps': None, 'keep_checkpoint_max': 5, 'save_checkpoints_secs': 600, '_environment': 'local', 'tf_random_seed': None, 'save_summary_steps': 100, '_task_type': None, 'keep_checkpoint_every_n_hours': 10000, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11b38c978>, '_master': '', 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_si

INFO:tensorflow:Saving evaluation summary for step 600: accuracy = 0.827406, accuracy/baseline_label_mean = 0.236226, accuracy/threshold_0.500000_mean = 0.827406, auc = 0.860721, labels/actual_label_mean = 0.236226, labels/prediction_mean = 0.244677, loss = 0.384267, precision/positive_threshold_0.500000_mean = 0.70904, recall/positive_threshold_0.500000_mean = 0.456838
accuracy: 0.827406
accuracy/baseline_label_mean: 0.236226
accuracy/threshold_0.500000_mean: 0.827406
auc: 0.860721
global_step: 600
labels/actual_label_mean: 0.236226
labels/prediction_mean: 0.244677
loss: 0.384267
precision/positive_threshold_0.500000_mean: 0.70904
recall/positive_threshold_0.500000_mean: 0.456838

In [None]:
# ^ accuracy is worse with the regularization step?