<a href="https://colab.research.google.com/github/sxqqslf/TensorflowDaily/blob/master/MLAtProductionScale/BuildALinearModelWithEstimators.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
import tensorflow.feature_column as fc

import os
import sys

import matplotlib.pyplot as plt
from IPython.display import clear_output

# Switch TensorFlow to eager execution. Later cells iterate over datasets
# directly in Python and call `.numpy()` on tensors, which need eager mode.
tf.enable_eager_execution()

In [0]:
# Install `requests` and take a shallow clone of the tensorflow/models repo;
# later cells import `official.wide_deep` from that checkout.
# NOTE(review): no branch or tag is given, so this clones whatever the
# default branch holds at run time — confirm it still ships `official/wide_deep`.
!pip install -q requests
!git clone --depth 1 https://github.com/tensorflow/models

In [0]:
# Absolute path of the `models` checkout inside the current working directory.
models_path = os.path.join(os.getcwd(), 'models')

# Make the packages inside the checkout importable from this kernel.
# Guarded so that re-running the cell does not pile up duplicate entries.
if models_path not in sys.path:
  sys.path.append(models_path)

In [0]:
# Both modules are imported from the `official.wide_deep` package; the
# `models` checkout was appended to sys.path in an earlier cell.
from official.wide_deep import census_dataset
from official.wide_deep import census_main

# Download the census data into /tmp/census_data/ (the files are read back
# from that directory further down).
census_dataset.download('/tmp/census_data/')

In [0]:
# Put the checkout on PYTHONPATH too: extend the variable when it is already
# set, otherwise create it holding just the checkout path.
os.environ['PYTHONPATH'] = (
    os.environ['PYTHONPATH'] + os.pathsep + models_path
    if 'PYTHONPATH' in os.environ
    else models_path
)

In [0]:
# Run the census_main module in a separate Python process and show its
# command-line help.
!python -m official.wide_deep.census_main --help

In [9]:
# List the contents of the download directory.
!ls /tmp/census_data/

adult.data  adult.test


In [0]:
# Paths of the training and test CSV files in the download directory.
train_file = "/tmp/census_data/adult.data"
test_file = "/tmp/census_data/adult.test"

import pandas

# Read both files the same way: no header row is parsed, and the column
# names come from the dataset module.
train_df, test_df = (
    pandas.read_csv(csv_path, header=None, names=census_dataset._CSV_COLUMNS)
    for csv_path in (train_file, test_file)
)

train_df.head()

In [19]:
def easy_input_function(df, label_key, num_epochs, shuffle, batch_size):
  """Build a batched dataset of (features, label) pairs from an in-memory table.

  Args:
    df: Table of examples. Every column, including the label column, becomes
      an entry of the feature dict.
    label_key: Name of the column in `df` that holds the label.
    num_epochs: Forwarded to `Dataset.repeat`.
    shuffle: When truthy, shuffle the examples (buffer size 10000) before
      batching.
    batch_size: Forwarded to `Dataset.batch`.

  Returns:
    The dataset after the optional shuffle, then batch, then repeat.
  """
  labels = df[label_key]
  features = dict(df)
  dataset = tf.data.Dataset.from_tensor_slices((features, labels))

  if shuffle:
    dataset = dataset.shuffle(10000)

  # Batch first, then repeat the batched dataset.
  dataset = dataset.batch(batch_size)
  dataset = dataset.repeat(num_epochs)
  return dataset

# Pull one unshuffled batch of 10 rows out of the DataFrame-backed pipeline
# and show a few of its fields.
ds = easy_input_function(
    train_df,
    label_key='income_bracket',
    num_epochs=5,
    shuffle=False,
    batch_size=10,
)

for feature_batch, label_batch in ds.take(1):
  # `end='\n\n'` reproduces the blank separator line between the fields.
  print('Some feature keys:', list(feature_batch)[:5], end='\n\n')
  print('A batch of Ages  :', feature_batch['age'], end='\n\n')
  print('A batch of Labels:', label_batch)

Some feature keys: ['age', 'workclass', 'fnlwgt', 'education', 'education_num']

A batch of Ages  : tf.Tensor([39 50 38 53 28 37 49 52 31 42], shape=(10,), dtype=int32)

A batch of Labels: tf.Tensor(
[b'<=50K' b'<=50K' b'<=50K' b'<=50K' b'<=50K' b'<=50K' b'<=50K' b'>50K'
 b'>50K' b'>50K'], shape=(10,), dtype=string)


In [20]:
# Same preview, this time using the input function from the dataset module
# (reads the CSV file directly, shuffled).
ds = census_dataset.input_fn(
    train_file,
    num_epochs=5,
    shuffle=True,
    batch_size=10,
)

for feature_batch, label_batch in ds.take(1):
  # `end='\n\n'` reproduces the blank separator line between the fields.
  print('Feature keys:', list(feature_batch)[:5], end='\n\n')
  print('Age batch   :', feature_batch['age'], end='\n\n')
  print('Label batch :', label_batch)

INFO:tensorflow:Parsing /tmp/census_data/adult.data


I1105 07:03:51.648764 139955084638080 tf_logging.py:115] Parsing /tmp/census_data/adult.data


Feature keys: ['age', 'workclass', 'fnlwgt', 'education', 'education_num']

Age batch   : tf.Tensor([26 42 19 37 65 36 37 30 35 24], shape=(10,), dtype=int32)

Label batch : tf.Tensor([False False False False  True  True False False False False], shape=(10,), dtype=bool)


In [0]:
import functools

# Zero-argument input functions for the estimators below:
# training reads the train file for 2 shuffled epochs ...
train_inpf = functools.partial(
    census_dataset.input_fn,
    train_file,
    num_epochs=2,
    shuffle=True,
    batch_size=64,
)
# ... evaluation makes a single ordered pass over the test file.
test_inpf = functools.partial(
    census_dataset.input_fn,
    test_file,
    num_epochs=1,
    shuffle=False,
    batch_size=64,
)

In [0]:
# Numeric feature column keyed on the 'age' feature.
age = fc.numeric_column('age')
# Preview what the column produces for the most recently fetched `feature_batch`.
fc.input_layer(feature_batch, [age]).numpy()

In [25]:
# First model: a linear classifier whose only feature column is `age`.
classifier = tf.estimator.LinearClassifier(feature_columns=[age])
classifier.train(train_inpf)
result = classifier.evaluate(test_inpf)

# Wipe the training/evaluation log output so only the metrics remain visible.
clear_output()
print(result)

{'accuracy': 0.7631595, 'accuracy_baseline': 0.76377374, 'auc': 0.67813593, 'auc_precision_recall': 0.31129786, 'average_loss': 0.5239739, 'label/mean': 0.23622628, 'loss': 33.45419, 'precision': 0.29166666, 'prediction/mean': 0.22685227, 'recall': 0.0018200728, 'global_step': 1018}


In [0]:
# One numeric feature column per remaining numeric feature.
education_num = fc.numeric_column('education_num')
capital_gain = fc.numeric_column('capital_gain')
capital_loss = fc.numeric_column('capital_loss')
hours_per_week = fc.numeric_column('hours_per_week')

# All numeric columns together; `age` was defined in an earlier cell.
my_numeric_columns = [age, education_num, capital_gain, capital_loss, hours_per_week]

# Preview the combined output for the current `feature_batch`.
fc.input_layer(feature_batch, my_numeric_columns).numpy()

In [27]:
# Fit and evaluate a new linear classifier on all the numeric columns.
classifier = tf.estimator.LinearClassifier(
    feature_columns=my_numeric_columns)
classifier.train(train_inpf)

result = classifier.evaluate(test_inpf)

# Drop the log output so only the metric listing is shown.
clear_output()

# One "name: value" line per metric, ordered by metric name.
for metric_name in sorted(result):
  print('%s: %s' % (metric_name, result[metric_name]))

accuracy: 0.7815859
accuracy_baseline: 0.76377374
auc: 0.6894037
auc_precision_recall: 0.48563907
average_loss: 2.034632
global_step: 1018
label/mean: 0.23622628
loss: 129.90527
precision: 0.56971157
prediction/mean: 0.23792283
recall: 0.30811232


In [0]:
# Categorical column for 'relationship' with an explicit list of the
# values it recognises.
relationship = fc.categorical_column_with_vocabulary_list(
  'relationship',
  ['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'])
# Categorical columns are wrapped in an indicator column before being fed
# to `input_layer`; shown here next to the numeric `age` column.
fc.input_layer(feature_batch, [age, fc.indicator_column(relationship)])

In [0]:
# Hash-bucketed categorical column: no vocabulary is listed, the column is
# configured with 1000 hash buckets instead.
occupation = tf.feature_column.categorical_column_with_hash_bucket(
  'occupation', hash_bucket_size=1000)

# Show the raw occupation values of the current batch (decoded from bytes).
for item in feature_batch['occupation'].numpy():
  print(item.decode())

occupation_result = fc.input_layer(feature_batch, [fc.indicator_column(occupation)])

# Only the final expression of a cell is echoed, so a bare mid-cell
# expression would be silently dropped; print the shape explicitly.
print(occupation_result.numpy().shape)

# Index of the largest entry in each example's row of the indicator output.
tf.argmax(occupation_result, axis=1).numpy()

In [0]:
# Vocabulary-list columns for the remaining categorical features. Each list
# spells out the values the column recognises.
education = fc.categorical_column_with_vocabulary_list(
    'education',
    [
        'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
        'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
        '5th-6th', '10th', '1st-4th', 'Preschool', '12th',
    ])

marital_status = fc.categorical_column_with_vocabulary_list(
    'marital_status',
    [
        'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
        'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed',
    ])

workclass = fc.categorical_column_with_vocabulary_list(
    'workclass',
    [
        'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
        'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked',
    ])

# Every categorical column defined so far, including the two from earlier cells.
my_categorical_columns = [
    relationship,
    occupation,
    education,
    marital_status,
    workclass,
]

In [34]:
# Fit and evaluate a new linear classifier on the numeric and categorical
# columns combined.
classifier = tf.estimator.LinearClassifier(
    feature_columns=my_numeric_columns + my_categorical_columns)
classifier.train(train_inpf)
result = classifier.evaluate(test_inpf)

# Drop the log output so only the metric listing is shown.
clear_output()

# One "name: value" line per metric, ordered by metric name.
for metric_name in sorted(result):
  print('%s: %s' % (metric_name, result[metric_name]))

accuracy: 0.8307229
accuracy_baseline: 0.76377374
auc: 0.87697464
auc_precision_recall: 0.65071946
average_loss: 2.0388513
global_step: 1018
label/mean: 0.23622628
loss: 130.17467
precision: 0.6612426
prediction/mean: 0.23680677
recall: 0.58112323


In [0]:
# Bucketize the numeric `age` column at the listed boundaries.
age_buckets = fc.bucketized_column(
  age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

# Show the raw age and its bucketized form side by side for the current batch.
fc.input_layer(feature_batch, [age, age_buckets]).numpy()

In [0]:
# Cross of the raw 'education' and 'occupation' features, with 1000 hash buckets.
education_x_occupation = fc.crossed_column(
  ['education', 'occupation'], hash_bucket_size=1000)

# Three-way cross that also includes the bucketized age column.
age_buckets_x_education_x_occupation = fc.crossed_column(
  [age_buckets, 'education', 'occupation'], hash_bucket_size=1000)

In [0]:
import tempfile

# Single-feature columns for the final model: the categorical columns plus
# bucketized age (the raw numeric columns are not included here).
base_columns = [
    education, marital_status, relationship, workclass, occupation,
    age_buckets,
]

# Feature crosses defined in the previous cell.
crossed_columns = [
    education_x_occupation, age_buckets_x_education_x_occupation,
]

# Linear classifier over base + crossed columns, using an FTRL optimizer and
# a freshly created temporary directory as its model directory.
model = tf.estimator.LinearClassifier(
  model_dir=tempfile.mkdtemp(),
  feature_columns=base_columns + crossed_columns,
  optimizer=tf.train.FtrlOptimizer(learning_rate=0.1))

In [0]:
# Rebind `train_inpf` to a 40-epoch schedule for this model.
# NOTE(review): this reuses the name of the earlier 2-epoch input function,
# so any earlier training cell re-run after this one also gets 40 epochs.
train_inpf = functools.partial(
    census_dataset.input_fn,
    train_file,
    num_epochs=40,
    shuffle=True,
    batch_size=64,
)

model.train(train_inpf)
clear_output()

results = model.evaluate(test_inpf)
clear_output()

# One "name: value" line per metric (two decimals), ordered by metric name.
for metric_name in sorted(results):
  print('%s: %0.2f' % (metric_name, results[metric_name]))

accuracy: 0.83
accuracy_baseline: 0.76
auc: 0.88
auc_precision_recall: 0.69
average_loss: 0.36
global_step: 40702.00
label/mean: 0.24
loss: 22.80
precision: 0.69
prediction/mean: 0.24
recall: 0.55


In [0]:
import numpy as np

# Work on a copy of the first 20 test rows so `test_df` itself is not modified.
predict_df = test_df[:20].copy()

# `predict` takes a zero-argument input function, hence the lambda.
pred_iter = model.predict(
    lambda: easy_input_function(predict_df, label_key='income_bracket',
                                num_epochs=1, shuffle=False, batch_size=10))

# Label strings, positioned so that a predicted class id indexes its label.
classes = np.array(['<=50K', '>50K'])

# Collect the predicted class id of every example.
pred_class_id = [pred_dict['class_ids'] for pred_dict in pred_iter]

# Flatten to one id per row before indexing. Stacking the per-example values
# as-is can yield an (n, 1) array, and the new column should be a plain 1-D
# array rather than rely on pandas accepting a 2-D single-column value.
class_id_per_row = np.array(pred_class_id).reshape(-1)

predict_df['predicted_class'] = classes[class_id_per_row]
predict_df['correct'] = predict_df['predicted_class'] == predict_df['income_bracket']

clear_output()

predict_df[['income_bracket','predicted_class', 'correct']]

In [0]:
# Same columns as the base model, but the FTRL optimizer applies L1
# regularization (strength 10.0) and no L2.
ftrl_l1 = tf.train.FtrlOptimizer(
    learning_rate=0.1,
    l1_regularization_strength=10.0,
    l2_regularization_strength=0.0,
)
model_l1 = tf.estimator.LinearClassifier(
    feature_columns=base_columns + crossed_columns,
    optimizer=ftrl_l1,
)

model_l1.train(train_inpf)

results = model_l1.evaluate(test_inpf)
clear_output()

# One "name: value" line per metric (two decimals), ordered by metric name.
for metric_name in sorted(results):
  print('%s: %0.2f' % (metric_name, results[metric_name]))

accuracy: 0.84
accuracy_baseline: 0.76
auc: 0.88
auc_precision_recall: 0.69
average_loss: 0.35
global_step: 20351.00
label/mean: 0.24
loss: 22.47
precision: 0.69
prediction/mean: 0.24
recall: 0.55


In [0]:
# Same columns as the base model, but the FTRL optimizer applies L2
# regularization (strength 10.0) and no L1.
ftrl_l2 = tf.train.FtrlOptimizer(
    learning_rate=0.1,
    l1_regularization_strength=0.0,
    l2_regularization_strength=10.0,
)
model_l2 = tf.estimator.LinearClassifier(
    feature_columns=base_columns + crossed_columns,
    optimizer=ftrl_l2,
)

model_l2.train(train_inpf)
results = model_l2.evaluate(test_inpf)
clear_output()

# One "name: value" line per metric (two decimals), ordered by metric name.
for metric_name in sorted(results):
  print('%s: %0.2f' % (metric_name, results[metric_name]))

accuracy: 0.84
accuracy_baseline: 0.76
auc: 0.88
auc_precision_recall: 0.69
average_loss: 0.35
global_step: 20351.00
label/mean: 0.24
loss: 22.46
precision: 0.68
prediction/mean: 0.24
recall: 0.57


In [0]:
def get_flat_weights(model):
  """Return a model's linear-model variables as one flat 1-D array.

  Variables are selected by name: the name must contain "linear_model" and
  must not contain "Ftrl". The selected values are flattened and
  concatenated in the order `model.get_variable_names()` reports them.

  Args:
    model: Object exposing `get_variable_names()` and
      `get_variable_value(name)`.

  Returns:
    A 1-D numpy array holding every selected value.
  """
  flattened = []
  for var_name in model.get_variable_names():
    # Skip anything outside the linear model, and anything tagged "Ftrl".
    if "linear_model" not in var_name or "Ftrl" in var_name:
      continue
    flattened.append(model.get_variable_value(var_name).flatten())

  return np.concatenate(flattened, axis=0)

# Flat weight vectors of the base, L1-regularized and L2-regularized models.
weights_flat, weights_flat_l1, weights_flat_l2 = (
    get_flat_weights(estimator)
    for estimator in (model, model_l1, model_l2)
)

In [0]:
# Keep only the positions where the base model's weight is non-zero, and
# apply that same mask to all three vectors so they stay aligned.
weight_mask = weights_flat != 0

weights_base = weights_flat[weight_mask]
weights_l1 = weights_flat_l1[weight_mask]
weights_l2 = weights_flat_l2[weight_mask]

# Shared bin edges and y-range keep the three histograms directly comparable.
hist_bins = np.linspace(-3, 3, 30)

# One figure per model, drawn by a single loop instead of three copies of
# the same plotting code.
for hist_title, hist_weights in [
    ('Base Model', weights_base),
    ('L1 - Regularization', weights_l1),
    ('L2 - Regularization', weights_l2),
]:
  fig, ax = plt.subplots()
  ax.hist(hist_weights, bins=hist_bins)
  ax.set_title(hist_title)
  ax.set_xlabel('Weight value')
  ax.set_ylabel('Count')
  ax.set_ylim([0, 500])
