## Ready For Wide-N-Deep Classification

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time

import numpy as np
import tensorflow as tf

  return f(*args, **kwds)


In [2]:
# Set to INFO for tracking training
tf.logging.set_verbosity(tf.logging.INFO)

print("Using Tensorflow version %s" % (tf.__version__))

Using Tensorflow version 1.10.1


In [3]:
CATEGORICAL_COLUMNS = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country']

# columns of the input csv
COLUMNS = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 
           'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_bracket']

# feature columns for input into the model
FEATURE_COLUMNS = ['age', 'workclass', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race',
                  'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country']

## Data inspection

In [4]:
import pandas as pd

"""
Feature Description

age :나이
workclass : 소속 근무지 고용주의 유형
fnlwgt : final weight, 인구 조사 샘플링 예측 지수. sample 데이터의 weight를 나타냄.
education : 교육 수준
education_num : 교육 수준을 numerical하게 표현
marital_status : 결혼 상태
occupation : 직종
relationship : 거주가족관계
race : 인종
gender : 성별
capital_gain : 수입 기록
capital_loss : 지출 기록
hours_per_week : 주당 근무시간
native_country : 모국
income_bracket : 소득 계층
"""

df = pd.read_csv('adult.data.csv', header=None, names=COLUMNS)

In [5]:
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
df.shape

(32561, 15)

In [8]:
df.corr() # numerical features correlation

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
age,1.0,-0.076646,0.036527,0.077674,0.057775,0.068756
fnlwgt,-0.076646,1.0,-0.043195,0.000432,-0.010252,-0.018768
education_num,0.036527,-0.043195,1.0,0.12263,0.079923,0.148123
capital_gain,0.077674,0.000432,0.12263,1.0,-0.031615,0.078409
capital_loss,0.057775,-0.010252,0.079923,-0.031615,1.0,0.054256
hours_per_week,0.068756,-0.018768,0.148123,0.078409,0.054256,1.0


## Input file parsing

example:

{ 

  'age':            [ 39, 50, 38, 53, 28, … ], 
  
  'marital_status': [ 'Married-civ-spouse', 'Never-married', 'Widowed', 'Widowed' … ],
  
   ...
   
  'gender':           ['Male', 'Female', 'Male', 'Male', 'Female',, … ], 
  
} , 

[ 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1]

In [9]:
# BATCH_SIZE = 40
BATCH_SIZE = 1

def generate_input_fn(filename, batch_size=BATCH_SIZE):
    """
    Function to generate input data by preprocessing
    input is filename, output is data for traing, test
    """
    def _input_fn():
        filename_queue = tf.train.string_input_producer([filename])
        reader = tf.TextLineReader()
        
        # reads out batch size number of lines : file queue로, 데이터를 batch로 read
        key, rows = reader.read_up_to(filename_queue, num_records=batch_size)
        
        # record_defaults should match the datatypes of each respectively : 각 column별 default값 지정
        record_defaults = [[0], [" "], [0], [" "], [0],
                          [" "], [" "], [" "], [" "], [" "],
                          [0], [0], [0], [" "], [" "]]
        
        # add new axis == add new dimension
        # ex) a = np.array([1, 2, 3, 4, 5])
        # a = a[:, np.newaxis]
        # array([[1],
        #       [2],
        #       [3],
        #       [4],
        #       [5]])
        rows = rows[:, np.newaxis]
        
        # Decode csv data that was just read out : file queue, reader로 읽어들이는 데이터는 인코딩된 데이터이므로 디코딩이 필요함.
        columns = tf.decode_csv(rows, record_defaults=record_defaults)
        print(columns)
        
        # feature mapping to dictionary
        all_columns = dict(zip(COLUMNS, columns))
        
        # income_bracket is y_label in our data
        income_bracket = all_columns.pop('income_bracket')
        
        # remove the fnlwgt key, which is not used. --> 의미없는 데이터
        all_columns.pop('fnlwgt', 'fnlwgt key not found')
        
        # the remaining columns are our features
        features = all_columns
        
        # convert y label as binary (>50K is 1, <=50K is 0)
        labels = tf.to_int32(tf.equal(income_bracket, " >50K"))
        
        print(features, labels)
        
        return features, labels
    return _input_fn

print('input function configured')

input function configured


## Basic Feature Setting

- wide n deep 학습법은 일반적으로 sparse한 입력 feature를 가진(1) 데이터의 회귀 및 분류문제에 적합하다.
- 학습에 앞서, sparse feature의 특징을 가진 데이터를 인코딩한다.

(1) : 많은 수의 카테고리적 특징을 가진, 길이가 긴 one-hot 인코딩을 해야하는 카테고리 피처

In [10]:
# The layers module contains many utilities for creating feature columns.
from tensorflow.contrib import layers

# Sparse columns : unique count가 5개 이하인 피처는 keys로 sparse mapping
gender = layers.sparse_column_with_keys(column_name='gender',
                                       keys=['female', 'male'])
race = layers.sparse_column_with_keys(column_name='race',
                                       keys=["Amer-Indian-Eskimo",
                                            "Asian-Pac-Islander",
                                            "Black", "Other",
                                            "White"])

# 5개 초과는 hash bucket으로 매핑
education = layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
marital_status = layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100)
relationship = layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
workclass = layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
occupation = layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
native_country = layers.sparse_column_with_hash_bucket("native_country", hash_bucket_size=1000)

print('sparse columns configured')

sparse columns configured


- numerical columns

In [11]:
# Continuous base columns.
age = layers.real_valued_column("age")
education_num = layers.real_valued_column("education_num")
capital_gain = layers.real_valued_column("capital_gain")
capital_loss = layers.real_valued_column("capital_loss")
hours_per_week = layers.real_valued_column("hours_per_week")

print('continuous columns configured')

continuous columns configured


## Feature engineering

#### transformations
- bucketizing : numerical 데이터를 categorical 데이터로 변환하는것. feature crossing을 위해서 bucketizing 한다.
- feature crossing : 모델 학습 시, 특정 column들을 pairing 하는것을 말한다. 독립적인 피처로 존재하는 것 보다 합쳐진게 더 의미가 있는 경우에 사용한다.
- wide n deep 논문에서의 feature cross는 이것을 의미한다.

For example, crossing education and occupation would enable the model to learn about:

education="Bachelors" AND occupation="Exec-managerial"

or perhaps

education="Bachelors" AND occupation="Craft-repair"

In [12]:
# Transformations

# age feature bucketizing
age_buckets = layers.bucketized_column(
    age, boundaries=[ 18, 25, 30, 35, 40, 45, 50, 55, 60, 65 ])

# education X occupation Crossing : 교육 수준과 직종을 크로스
education_occupation = layers.crossed_column(
    [education, occupation], hash_bucket_size=int(1e4))

# 3개를 크로스
age_race_occupation = layers.crossed_column(
    [age_buckets, race, occupation], hash_bucket_size=int(1e6))

# 2개를 크로스
country_occupation = layers.crossed_column(
    [native_country, occupation], hash_bucket_size=int(1e4))

print('Transformations complete')

Transformations complete


#### Grouping Wide Columns, Deep Columns
- wide columns are very sparse : all hashed columns, crossed columns
- deep columns are automatically crossing or regularization or something like that.
- deep columns 에서 embedding_column 레이어로 sparse한 feature들을 8차원으로 임베딩함(학습의 일부).

In [13]:
# wide columns
wide_columns = [gender, race, native_country,
      education, occupation, workclass,
      marital_status, relationship,
      age_buckets, education_occupation,
      age_race_occupation, country_occupation]

# deep columns
deep_columns = [
  layers.embedding_column(workclass, dimension=8),
  layers.embedding_column(education, dimension=8),
  layers.embedding_column(marital_status, dimension=8),
  layers.embedding_column(gender, dimension=8),
  layers.embedding_column(relationship, dimension=8),
  layers.embedding_column(race, dimension=8),
  layers.embedding_column(native_country, dimension=8),
  layers.embedding_column(occupation, dimension=8),
  age,
  education_num,
  capital_gain,
  capital_loss,
  hours_per_week,
]

print('wide and deep columns configured')

wide and deep columns configured


## Modeling

In [14]:
from tensorflow.contrib import learn

#### 1. Wide Model

In [15]:
def wide_classification_model():
    model_dir = 'models/model_' + 'linear' + str(int(time.time()))
    print("model directory = %s" % model_dir)
    
    # define classifier
    model = learn.LinearClassifier(
#         model_dir=model_dir,
        feature_columns=wide_columns
    )
    
    # set dataset
    train_dataset = str("adult.data.csv")
    
    # fit model
    model.fit(input_fn=generate_input_fn(train_dataset, BATCH_SIZE), steps=1000)
    print("training finish!")
    
    return model

In [16]:
%%time

model1 = wide_classification_model()

model directory = models/model_linear1544058692
Instructions for updating:
Please switch to tf.contrib.estimator.*_head.
Instructions for updating:
Please replace uses of any Estimator from tf.contrib.learn with an Estimator from tf.estimator.*
Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11f62c470>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_

#### 2. Deep Model

In [17]:
def deep_classification_model():
    model_dir = 'models/model_' + 'deep' + str(int(time.time()))
    print("model directory = %s" % model_dir)
    
    # define classifier
    model = learn.DNNClassifier(
        model_dir=model_dir,
        feature_columns=deep_columns,
        hidden_units=[100,70,50,25]
    )
    
    # set dataset
    train_dataset = str("adult.data.csv")
    
    # fit model
    model.fit(input_fn=generate_input_fn(train_dataset, BATCH_SIZE), steps=1000)
    print("training finish!")
    
    return model

In [18]:
%%time

model2 = deep_classification_model()

model directory = models/model_deep1544058705
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1242924e0>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'models/model_deep1544058705'}
[<tf.Tensor 'DecodeCSV:0' shape=(?, 1) dtype=int32>, <tf.Tensor 'DecodeCSV:1' shape=(?, 1) dtype=string>, <tf.Tensor 'DecodeCSV:2' shape=(?, 1) dtype=int32>, <tf.Tensor 'DecodeCSV:3' shape=(?, 1) dtype=string>, <tf.Tenso

#### 3. Wide & Deep Model

In [19]:
def wide_n_deep_classification_model():
    model_dir = 'models/model_' + 'widendeep' + str(int(time.time()))
    print("model directory = %s" % model_dir)
    
    # define classifier
    model = learn.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100,70,50,25]
    )
    
    # set dataset
    train_dataset = str("adult.data.csv")
    
    # fit model
    model.fit(input_fn=generate_input_fn(train_dataset, BATCH_SIZE), steps=1000)
    print("training finish!")
    
    return model

In [20]:
model3 = wide_n_deep_classification_model()

model directory = models/model_widendeep1544058712
Instructions for updating:
Please set fix_global_step_increment_bug=True and update training steps in your pipeline. See pydoc for details.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x127ea3fd0>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'models/model_widendeep1544058712'}
[<tf.Tensor 'DecodeCSV:0' shape=(?, 1) dtype=int32>, <tf.Tensor 'Decod

## Evaluate the model

In [87]:
test_dataset  = str("adult.test.csv") 

results1 = model1.evaluate(input_fn=generate_input_fn(test_dataset), 
                     steps=200)
print(results1)
print('Results1 Accuracy: %s' % results1['accuracy'])

INFO:tensorflow:Starting evaluation at 2018-08-10-08:31:46
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_linear1533889885/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [20/200]
INFO:tensorflow:Evaluation [40/200]
INFO:tensorflow:Evaluation [60/200]
INFO:tensorflow:Evaluation [80/200]
INFO:tensorflow:Evaluation [100/200]
INFO:tensorflow:Evaluation [120/200]
INFO:tensorflow:Evaluation [140/200]
INFO:tensorflow:Evaluation [160/200]
INFO:tensorflow:Evaluation [180/200]
INFO:tensorflow:Evaluation [200/200]
INFO:tensorflow:Finished evaluation at 2018-08-10-08:31:47
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.834125, accuracy/baseline_label_mean = 0.0, accuracy/threshold_0.500000_mean = 0.834125, auc = 1.0, auc_precision_recall = 0.0, global_step = 1000, labels/actual_label_mean = 0.0, labels/prediction_mean = 0.22233997, loss = 0.34135503, precision

In [88]:
test_dataset  = str("adult.test.csv") 

results2 = model2.evaluate(input_fn=generate_input_fn(test_dataset), 
                     steps=200)
print(results2)
print('Results2 Accuracy: %s' % results2['accuracy'])

INFO:tensorflow:Starting evaluation at 2018-08-10-08:31:48
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_deep1533889890/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [20/200]
INFO:tensorflow:Evaluation [40/200]
INFO:tensorflow:Evaluation [60/200]
INFO:tensorflow:Evaluation [80/200]
INFO:tensorflow:Evaluation [100/200]
INFO:tensorflow:Evaluation [120/200]
INFO:tensorflow:Evaluation [140/200]
INFO:tensorflow:Evaluation [160/200]
INFO:tensorflow:Evaluation [180/200]
INFO:tensorflow:Evaluation [200/200]
INFO:tensorflow:Finished evaluation at 2018-08-10-08:31:49
INFO:tensorflow:Saving dict for global step 1000: accuracy = 1.0, accuracy/baseline_label_mean = 0.0, accuracy/threshold_0.500000_mean = 1.0, auc = 1.0, auc_precision_recall = 0.0, global_step = 1000, labels/actual_label_mean = 0.0, labels/prediction_mean = 0.24562778, loss = 0.28841776, precision/positive_th

In [112]:
test_dataset  = str("adult.test.csv") 

results3 = model3.evaluate(input_fn=generate_input_fn(test_dataset), 
                     steps=200)
print(results3)
print('Results3 Accuracy: %s' % results3['accuracy'])

INFO:tensorflow:Starting evaluation at 2018-08-10-09:08:36
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_widendeep1533891699/model.ckpt-1002
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [20/200]
INFO:tensorflow:Evaluation [40/200]
INFO:tensorflow:Evaluation [60/200]
INFO:tensorflow:Evaluation [80/200]
INFO:tensorflow:Evaluation [100/200]
INFO:tensorflow:Evaluation [120/200]
INFO:tensorflow:Evaluation [140/200]
INFO:tensorflow:Evaluation [160/200]
INFO:tensorflow:Evaluation [180/200]
INFO:tensorflow:Evaluation [200/200]
INFO:tensorflow:Finished evaluation at 2018-08-10-09:08:38
INFO:tensorflow:Saving dict for global step 1002: accuracy = 0.847875, accuracy/baseline_label_mean = 0.0, accuracy/threshold_0.500000_mean = 0.847875, auc = 1.0000001, auc_precision_recall = 0.0, global_step = 1002, labels/actual_label_mean = 0.0, labels/prediction_mean = 0.20990957, loss = 0.32295182, 

## Export model

In [108]:
from tensorflow.contrib.learn.python.learn.utils import input_fn_utils

# category type -> string, numerical type -> float32
def column_to_dtype(column):
    if column in CATEGORICAL_COLUMNS:
        return tf.string
    else:
        return tf.float32

def serving_input_fn():
    feature_placeholders = {
        column: tf.placeholder(column_to_dtype(column), [None])
        for column in FEATURE_COLUMNS
    }
    # DNNCombinedLinearClassifier expects rank 2 Tensors, but inputs should be
    # rank 1, so that we can provide scalars to the server
    features = {
        key: tensor[:, np.newaxis] # tf.expand_dims(tensor, axis=-1)
        for key, tensor in feature_placeholders.items()
    }
    
    return input_fn_utils.InputFnOps(
        features, # input into graph
        None,
        feature_placeholders # tensor input converted from request 
    )

In [109]:
export_folder = model3.export_savedmodel(
    export_dir_base = 'model_widendeep1533891699' + '/export',
    serving_input_fn=serving_input_fn
)

print('model exported successfully to {}'.format(export_folder))

Instructions for updating:
Switch to tf.estimator.Exporter and associated utilities.
Instructions for updating:
Switch to tf.estimator.Exporter and associated utilities.
Instructions for updating:
Switch to tf.estimator.Exporter and associated utilities.
Instructions for updating:
Switch to tf.estimator.Exporter and associated utilities.
INFO:tensorflow:Restoring parameters from models/model_widendeep1533891699/model.ckpt-1002
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: model_widendeep1533891699/export/temp-1533891822/saved_model.pb
model exported successfully to b'model_widendeep1533891699/export/1533891822'


## Reference

https://github.com/amygdala/tensorflow-workshop/blob/c62cfa3cd766cf0adf6d8fae7a289ae9e4ab161b/workshop_sections/wide_n_deep/wide_n_deep_flow2.ipynb