In [2]:
import os
import tensorflow as tf
import numpy as np

# Check that a new-enough TensorFlow is installed.
# Compare (major, minor) as integers: comparing the raw version strings is
# lexicographic, so e.g. "1.10.0" would wrongly sort before "1.5".
# NOTE(review): later cells use tf.logging / tf.decode_csv, which look like
# TF 1.x-only entry points -- consider whether an upper bound is needed too.
tf_version = tf.__version__
print("TensorFlow version: {}".format(tf_version))
tf_major_minor = tuple(int(part) for part in tf_version.split(".")[:2])
assert tf_major_minor >= (1, 5), "TensorFlow r1.5 or later is needed"

TensorFlow version: 1.5.0


In [4]:
# CSV inputs for training and for held-out evaluation.
train_file, test_file = "classify-train.csv", "classify-test.csv"

# Emit INFO-level TensorFlow logs so training/evaluation progress is visible.
tf.logging.set_verbosity(tf.logging.INFO)

In [5]:
# Names of the numeric input features. The order matters: my_input_fn zips
# these names against the parsed CSV columns positionally.
numerical_feature_names = [
    "PctUnder18",
    "PctOver65",
    "PctFemale",
    "PctWhite",
    "PctBachelors",
    "PctDem",
    "PctGop",
]

# One numeric feature column per feature name, in the same order.
feature_columns = list(
    map(tf.feature_column.numeric_column, numerical_feature_names)
)

def my_input_fn(file_path, repeat_count=200, batch_size=8, shuffle_buffer_size=256):
    """Build a tf.data pipeline of (features, label) batches from a CSV file.

    Every line of the file is parsed as float columns: one column per entry of
    ``numerical_feature_names`` (in that order) followed by the label as the
    last column.

    Args:
        file_path: Path of the CSV text file to read.
        repeat_count: Number of times the dataset is repeated.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the buffer used to shuffle examples.

    Returns:
        A dataset whose elements are ``(feature_dict, label)`` batches, where
        ``feature_dict`` maps each feature name to its parsed values.
    """
    # One float default per feature column plus one for the trailing label, so
    # the CSV schema always tracks the feature list instead of a fixed count.
    record_defaults = [[0.] for _ in range(len(numerical_feature_names) + 1)]

    def decode_csv(line):
        """Split one CSV line into a feature dict and its label."""
        parsed_line = tf.decode_csv(line, record_defaults)
        label = parsed_line[-1]  # Last element is the label
        features = parsed_line[:-1]  # Everything but the last element
        return dict(zip(numerical_feature_names, features)), label

    dataset = tf.data.TextLineDataset(file_path).map(decode_csv)
    # Order kept as shuffle -> repeat -> batch to preserve existing behavior.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.repeat(repeat_count)
    dataset = dataset.batch(batch_size)
    return dataset

In [6]:
def train_input_fn():
    """Feed the training CSV for 20 passes over the data."""
    return my_input_fn(train_file, 20)


classifier = tf.estimator.LinearClassifier(feature_columns=feature_columns)

# Train for 20 epochs (20 passes through the entire training file).
# Experiment with the epoch count for your own dataset.
classifier.train(input_fn=train_input_fn)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmpk44nbcf2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x120a0afd0>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmpk44nbcf2/model.ckpt.
INFO:tensorflow:loss = 5.5451775, step = 1
INFO:tensorflow:global_step/sec: 570.865
INFO:tensorflow:loss = 0.29437244, step = 101 (0.176 sec)
INFO:tensorflow:global_step/sec: 844.545
INFO:tensorflow:loss = 0.2726082, 

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x120a15eb8>

In [9]:
# Evaluate on the held-out file with a single pass over the data.
results = classifier.evaluate(
    input_fn=lambda: my_input_fn(test_file, repeat_count=1)
)

# Report every metric, ordered alphabetically by metric name.
for metric_name, metric_value in sorted(results.items()):
    print('%s: %s' % (metric_name, metric_value))

INFO:tensorflow:Starting evaluation at 2018-02-22-18:06:33
INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmpk44nbcf2/model.ckpt-6250
INFO:tensorflow:Finished evaluation at 2018-02-22-18:06:34
INFO:tensorflow:Saving dict for global step 6250: accuracy = 0.9705882, accuracy_baseline = 0.8888889, auc = 0.99298507, auc_precision_recall = 0.9434167, average_loss = 0.1134358, global_step = 6250, label/mean = 0.11111111, loss = 0.90159357, prediction/mean = 0.14078975
accuracy: 0.9705882
accuracy_baseline: 0.8888889
auc: 0.99298507
auc_precision_recall: 0.9434167
average_loss: 0.1134358
global_step: 6250
label/mean: 0.11111111
loss: 0.90159357
prediction/mean: 0.14078975


In [10]:
# Raw feature values for 3 counties to predict on.
# Each list holds one value per county, in the same county order.
prediction_input = dict([
    ('PctUnder18', [23.9, 25.7, 10.6]),
    ('PctOver65', [17.6, 24.7, 15.8]),
    ('PctFemale', [50.0, 48.5, 53.5]),
    ('PctWhite', [0.965, 0.97, 0.75]),
    ('PctBachelors', [12.7, 17.0, 49.8]),
    ('PctDem', [0.3227832512315271, 0.09475032010243278, 0.6346801346801347]),
    ('PctGop', [0.6545566502463054, 0.8911651728553138, 0.3468013468013468]),
])

def test_input_fn():
    """Wrap ``prediction_input`` in a dataset for ``classifier.predict``."""
    return tf.data.Dataset.from_tensors(prediction_input)

# Predict all our prediction_input.
# predict() hands back a one-shot generator; materialize it into a list so
# that cells consuming pred_results can be re-run without it coming up empty.
pred_results = list(classifier.predict(input_fn=test_input_fn))

In [11]:
# Actual values for the raw prediction data:
# 1) 23% Clinton (class of 0 for Trump)
# 2) 5% Clinton (class of 0)
# 3) 69% Clinton (class of 1)

# Print the predicted class probabilities for each county.
# NOTE(review): each array is presumably [P(class 0), P(class 1)] -- confirm.
for pred in pred_results:
    print(pred['probabilities'])

INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmpk44nbcf2/model.ckpt-6250
[0.9905778  0.00942222]
[9.9971288e-01 2.8714165e-04]
[0.04889648 0.95110345]
