# 1. TensorFlow Estimators
## Basic imports 

In [6]:
import numpy as np
import tensorflow as tf
import pandas as pd

# Use one seed value for both TensorFlow's and NumPy's global random generators.
SEED = 1
tf.random.set_seed(SEED)
np.random.seed(SEED)

## 1. Load the data and apply the necessary preprocessing steps
- Preprocessing involves partitioning the dataset into training and testing datasets, as well as standardizing the continuous features
### 1. Download and import the dataset

In [7]:
# Download the raw Auto MPG file and keep the local path returned by get_file.
data_url = ("http://archive.ics.uci.edu/ml/machine-learning-databases"
            "/auto-mpg/auto-mpg.data")
dataset_path = tf.keras.utils.get_file(fname="auto-mpg.data", origin=data_url)

# The file has no header row, so the column labels are supplied explicitly.
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'ModelYear', 'Origin',
]

# Parsing options:
#   - "?" marks a missing value and is read as NaN;
#   - everything after a tab is treated as a comment and ignored
#     (presumably this drops a trailing free-text field -- confirm against the raw file);
#   - fields are space-separated, with runs of spaces after a delimiter skipped.
read_options = dict(
    names=column_names,
    na_values="?",
    comment='\t',
    sep=" ",
    skipinitialspace=True,
)
df = pd.read_csv(dataset_path, **read_options)

df.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,ModelYear,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


### 2. Drop rows that contain empty values

In [8]:
# Report how many missing entries each column has before removing anything.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Discard every row with at least one missing value, then renumber the rows
# from 0 so the index has no gaps.
df = df.dropna().reset_index(drop=True)
df.tail()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
ModelYear       0
Origin          0
dtype: int64


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,ModelYear,Origin
387,27.0,4,140.0,86.0,2790.0,15.6,82,1
388,44.0,4,97.0,52.0,2130.0,24.6,82,2
389,32.0,4,135.0,84.0,2295.0,11.6,82,1
390,28.0,4,120.0,79.0,2625.0,18.6,82,1
391,31.0,4,119.0,82.0,2720.0,19.4,82,1


### 3. Split the dataset

In [9]:
import sklearn
import sklearn.model_selection

# Hold out 20% of the rows for testing.
# The shuffle is pinned with an explicit `random_state` so that re-running this
# cell on its own always yields the same partition. Without it, the split
# depends on how much of NumPy's global random state earlier cells consumed.
df_train, df_test = sklearn.model_selection.train_test_split(
    df, train_size=0.8, random_state=1)

# Summary statistics of the training partition only, one row per column.
# The standardization step reads the 'mean' and 'std' entries from this table.
train_stats = df_train.describe().transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,313.0,23.404153,7.666909,9.0,17.5,23.0,29.0,46.6
Cylinders,313.0,5.402556,1.701506,3.0,4.0,4.0,8.0,8.0
Displacement,313.0,189.51278,102.675646,68.0,104.0,140.0,260.0,455.0
Horsepower,313.0,102.929712,37.919046,46.0,75.0,92.0,120.0,230.0
Weight,313.0,2961.198083,848.602146,1613.0,2219.0,2755.0,3574.0,5140.0
Acceleration,313.0,15.704473,2.725399,8.5,14.0,15.5,17.3,24.8
ModelYear,313.0,75.929712,3.675305,70.0,73.0,76.0,79.0,82.0
Origin,313.0,1.591054,0.807923,1.0,1.0,1.0,2.0,3.0


### 4. Standardize the continuous ("numerical") features

In [10]:
numeric_column_names = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

# Work on copies so the unscaled train/test frames stay available.
df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

# Standardize each numeric column to (value - mean) / std, using the statistics
# of the *training* partition for both frames so no test information leaks in.
for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std  = train_stats.loc[col_name, 'std']
    # Assign the whole column rather than writing through `.loc[:, col_name]`.
    # An in-place `.loc` write of floats into an integer column (Cylinders is
    # displayed as integers above) is deprecated as an incompatible-dtype
    # setitem in newer pandas; whole-column assignment replaces it with floats.
    df_train_norm[col_name] = (df_train[col_name] - mean) / std
    df_test_norm[col_name] = (df_test[col_name] - mean) / std

df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,ModelYear,Origin
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1


### 5. Transform the continuous features into the feature column data structure that TensorFlow Estimators can work with
 * See definition: https://developers.google.com/machine-learning/glossary/#feature_columns
 * Documentation: https://www.tensorflow.org/api_docs/python/tf/feature_column

In [11]:
# One numeric feature column per standardized column, keyed by the column name.
numeric_features = [
    tf.feature_column.numeric_column(key=col_name)
    for col_name in numeric_column_names
]

numeric_features

[NumericColumn(key='Cylinders', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Displacement', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Horsepower', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Weight', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Acceleration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

### 6. Group the model year information into buckets
- The chosen intervals for "bucketing" are arbitrary

In [12]:
# ModelYear is first declared as a numeric column, then cut into buckets
# at the three (arbitrarily chosen) boundaries below.
feature_year = tf.feature_column.numeric_column(key="ModelYear")

year_buckets = tf.feature_column.bucketized_column(
    source_column=feature_year,
    boundaries=[73, 76, 79],
)

bucketized_features = [year_buckets]

print(bucketized_features)

[BucketizedColumn(source_column=NumericColumn(key='ModelYear', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(73, 76, 79))]


### 7. Define a list for the unordered categorical feature, Origin
- We can use tf.feature_column.categorical_column_with_identity() function if the features are already associated with an index of categories in the range [0, num_categories)
- However, in this case we cannot do that since the indices of Origin start from 1 and not from 0 as required, so we proceed with the vocabulary list

In [14]:
# The Origin codes are listed explicitly as a vocabulary because they start
# at 1 rather than 0.
origin_vocabulary = [1, 2, 3]
feature_origin = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Origin', vocabulary_list=origin_vocabulary)

### 8. Convert the existing categorical feature column to a dense column
- Certain Estimators only accept so-called "dense columns"
- We can convert an existing categorical feature column to a dense column using an embedding column or an indicator column

In [15]:
# Wrap the categorical Origin column in an indicator column to obtain a
# dense representation.
origin_indicator = tf.feature_column.indicator_column(feature_origin)
categorical_indicator_features = [origin_indicator]

print(categorical_indicator_features)

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Origin', vocabulary_list=(1, 2, 3), dtype=tf.int32, default_value=-1, num_oov_buckets=0))]


## 2. Machine learning with pre-made Estimators
### Steps for using pre-made estimators

 * **Step 1:** Define the input function for importing the data   
 * **Step 2:**  Define the feature columns to bridge between the estimator and the data   
 * **Step 3:** Instantiate an estimator or convert a Keras model to an estimator   
 * **Step 4:** Use the estimator: train() evaluate() predict()

### 1. Define the input function for importing the data
- For the first step, we need to define a function that processes the data and returns a TensorFlow dataset consisting of a tuple that contains the input features and the labels (ground truth MPG values)
- The features must be in a dictionary format, and the keys of the dictionary must match the feature columns' names

In [16]:
def train_input_fn(df_train, batch_size=8):
    """Build the training input pipeline from a DataFrame.

    The 'MPG' column is used as the label; all remaining columns become a
    dictionary of features keyed by column name. The caller's DataFrame is
    not modified.

    Args:
        df_train: DataFrame containing the feature columns and an 'MPG' column.
        batch_size: Number of examples per emitted batch.

    Returns:
        A `tf.data.Dataset` of `(features_dict, labels)` batches that is
        shuffled (buffer of 1000), repeated without a count, and batched.
    """
    labels = df_train['MPG']
    features = df_train.drop(columns=['MPG'])
    examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle first, then repeat, then batch -- the order matters.
    examples = examples.shuffle(1000)
    examples = examples.repeat()
    return examples.batch(batch_size)

# Sanity check: pull a single batch and look at its feature keys and at the
# ModelYear values it contains.
ds = train_input_fn(df_train_norm)
batch = next(iter(ds))
batch_features = batch[0]  # element 0 is the feature dict, element 1 the labels
print('Keys:', batch_features.keys())
print('Batch Model Years:', batch_features['ModelYear'])

Keys: dict_keys(['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'ModelYear', 'Origin'])
Batch Model Years: tf.Tensor([82 78 76 72 78 73 70 78], shape=(8,), dtype=int32)


### 2. Define the feature columns
- We have already defined 3 lists for the continuous features, the bucketized feature column, and categorical feature column

In [17]:
# Concatenate the three groups of feature columns, in this order:
# numeric, bucketized, categorical indicator.
all_feature_columns = [
    *numeric_features,
    *bucketized_features,
    *categorical_indicator_features,
]

print(all_feature_columns)

[NumericColumn(key='Cylinders', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Displacement', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Horsepower', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Weight', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Acceleration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), BucketizedColumn(source_column=NumericColumn(key='ModelYear', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(73, 76, 79)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Origin', vocabulary_list=(1, 2, 3), dtype=tf.int32, default_value=-1, num_oov_buckets=0))]


### 3. Instantiate an estimator

In [18]:
# Two hidden layers (32 and 10 units). Checkpoints and summaries are written
# under `dnn_model_dir`.
dnn_hidden_units = [32, 10]
dnn_model_dir = 'models/autompg-dnnregressor/'

regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns,
    hidden_units=dnn_hidden_units,
    model_dir=dnn_model_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'models/autompg-dnnregressor/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001CEB40147C8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### 4.1. Finally, train the regressor
- The regressor can be trained by calling the train() method, for which we require the previously defined input function

In [19]:
EPOCHS = 1000
BATCH_SIZE = 8

# Batches needed to cover the training set once (integer ceiling division),
# multiplied by the number of epochs.
steps_per_epoch = -(-len(df_train) // BATCH_SIZE)
total_steps = EPOCHS * steps_per_epoch
print('Training Steps:', total_steps)

# The input dataset repeats without a count, so `steps` is what bounds training.
regressor.train(
    input_fn=lambda: train_input_fn(df_train_norm, batch_size=BATCH_SIZE),
    steps=total_steps,
)

 = 16.67799, step = 20300 (0.111 sec)
INFO:tensorflow:global_step/sec: 836.577
INFO:tensorflow:loss = 14.104445, step = 20400 (0.120 sec)
INFO:tensorflow:global_step/sec: 907.438
INFO:tensorflow:loss = 28.777363, step = 20500 (0.110 sec)
INFO:tensorflow:global_step/sec: 933.412
INFO:tensorflow:loss = 29.093853, step = 20600 (0.107 sec)
INFO:tensorflow:global_step/sec: 860.592
INFO:tensorflow:loss = 15.665294, step = 20700 (0.117 sec)
INFO:tensorflow:global_step/sec: 697.006
INFO:tensorflow:loss = 62.035095, step = 20800 (0.142 sec)
INFO:tensorflow:global_step/sec: 774.869
INFO:tensorflow:loss = 11.360391, step = 20900 (0.130 sec)
INFO:tensorflow:global_step/sec: 819.339
INFO:tensorflow:loss = 49.79663, step = 21000 (0.122 sec)
INFO:tensorflow:global_step/sec: 800.556
INFO:tensorflow:loss = 23.985207, step = 21100 (0.125 sec)
INFO:tensorflow:global_step/sec: 786.984
INFO:tensorflow:loss = 28.185383, step = 21200 (0.128 sec)
INFO:tensorflow:global_step/sec: 657.048
INFO:tensorflow:loss =

<tensorflow_estimator.python.estimator.canned.dnn.DNNRegressorV2 at 0x1ceb4020c48>

### 4.2. Reloading the last checkpoint
- Calling .train() will automatically save the checkpoints during the training of the model

In [20]:
# Build a second estimator with the same architecture, pointing both
# `warm_start_from` and `model_dir` at the directory used by the first one.
checkpoint_dir = 'models/autompg-dnnregressor/'

reloaded_regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns,
    hidden_units=[32, 10],
    warm_start_from=checkpoint_dir,
    model_dir=checkpoint_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'models/autompg-dnnregressor/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001CEB4CD9988>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### 4.3. Evaluate the predictive performance of the trained model

In [21]:
def eval_input_fn(df_test, batch_size=8):
    """Build the evaluation input pipeline from a DataFrame.

    The 'MPG' column is used as the label; all remaining columns become a
    dictionary of features keyed by column name. Unlike the training
    pipeline, the data is neither shuffled nor repeated. The caller's
    DataFrame is not modified.

    Args:
        df_test: DataFrame containing the feature columns and an 'MPG' column.
        batch_size: Number of examples per emitted batch.

    Returns:
        A `tf.data.Dataset` of `(features_dict, labels)` batches.
    """
    labels = df_test['MPG']
    features = df_test.drop(columns=['MPG'])
    examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    return examples.batch(batch_size)

# Evaluate the reloaded estimator on the standardized test partition.
eval_results = reloaded_regressor.evaluate(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

# Print every reported metric, then repeat the average loss with 4 decimals.
for key, value in eval_results.items():
    print('{:15s} {}'.format(key, value))

average_loss = eval_results['average_loss']
print('Average-Loss {:.4f}'.format(average_loss))

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-07-26T23:10:08Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/autompg-dnnregressor/model.ckpt-40000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-07-26-23:10:08
INFO:tensorflow:Saving dict for global step 40000: average_loss = 18.712437, global_step = 40000, label/mean = 23.611393, loss = 18.588703, prediction/mean = 21.755104
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 40000: models/autompg-dnnregressor/model.ckpt-40000
average_loss   

### 4.4. Predict the target values on new data points

In [22]:
# Note: this uses `regressor` (the estimator trained above), not
# `reloaded_regressor`.
pred_res = regressor.predict(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

# `predict` is consumed through an iterator here; show only the first result.
first_prediction = next(iter(pred_res))
print(first_prediction)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/autompg-dnnregressor/model.ckpt-40000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
{'predictions': array([22.291758], dtype=float32)}


### 4.5. Train a different pre-made Estimator

In [23]:
# Same feature columns as the DNN regressor, this time with a boosted-trees
# estimator.
boosted_tree = tf.estimator.BoostedTreesRegressor(
    feature_columns=all_feature_columns,
    n_batches_per_layer=20,
    n_trees=200,
)

# NOTE(review): no `steps` is passed and train_input_fn repeats without a count;
# presumably training ends once the requested trees are built -- confirm.
boosted_tree.train(
    input_fn=lambda: train_input_fn(df_train_norm, batch_size=BATCH_SIZE),
)

eval_results = boosted_tree.evaluate(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8),
)

print(eval_results)

average_loss = eval_results['average_loss']
print('Average-Loss {:.4f}'.format(average_loss))

ensorflow:loss = 0.22530511, step = 5980 (0.218 sec)
INFO:tensorflow:global_step/sec: 433.85
INFO:tensorflow:loss = 0.09022692, step = 6080 (0.237 sec)
INFO:tensorflow:global_step/sec: 421.444
INFO:tensorflow:loss = 0.14605376, step = 6180 (0.238 sec)
INFO:tensorflow:global_step/sec: 414.011
INFO:tensorflow:loss = 0.08655515, step = 6280 (0.280 sec)
INFO:tensorflow:global_step/sec: 362.307
INFO:tensorflow:loss = 0.025377247, step = 6380 (0.278 sec)
INFO:tensorflow:global_step/sec: 361.249
INFO:tensorflow:loss = 0.05110468, step = 6480 (0.297 sec)
INFO:tensorflow:global_step/sec: 342.469
INFO:tensorflow:loss = 0.044005483, step = 6580 (0.218 sec)
INFO:tensorflow:global_step/sec: 448.969
INFO:tensorflow:loss = 0.15501904, step = 6680 (0.225 sec)
INFO:tensorflow:global_step/sec: 451.42
INFO:tensorflow:loss = 0.14872582, step = 6780 (0.228 sec)
INFO:tensorflow:global_step/sec: 440.52
INFO:tensorflow:loss = 0.05451867, step = 6880 (0.235 sec)
INFO:tensorflow:global_step/sec: 417.3
INFO:tens