# Tensorflow Estimator

In [2]:
import tensorflow as tf

import pandas as pd

import numpy as np 

from sklearn.model_selection import train_test_split

wines_df = pd.read_csv("../data/winequality.csv")



# tf doesn't like spaces in col names so I replace them with _ 
new_col_list = []
for col_name in wines_df.columns:
    new_col_names = col_name.replace(" ", "_")
    new_col_list.append(new_col_names)
wines_df.columns = new_col_list


# Change the type of the index values from [0....1598] to [wine_1....wine_1599]

wines_df.index += 1 # add 1 to index values to start wine specification from wine_1 rather than wine_0

index_as_string = wines_df.index.astype('str')

wines_df.index = 'wine_' + index_as_string


In [None]:
display(wines_df)

## Splitting the dataset to training, validation and test sets

In [3]:
# shuffle the data of the wines_df

wines_df = wines_df.sample(frac=1) 

# almost 70% training , 15% validation, 15% test set

intermediate_set, valid_set = train_test_split(wines_df, test_size=0.15) 
train_set, test_set = train_test_split(intermediate_set, test_size=0.15)

## Input functions

In [4]:
def train_input_func(dataframe=train_set, batch_size=33):
    
    
    dataframe = dataframe.copy() 
    
    label_values = dataframe.pop('quality')
    
    
    # normalization
    for name in dataframe.columns:
        min_value = dataframe[name].min()
        max_value = dataframe[name].max()
        dataframe[name] = (dataframe[name] - min_value)/(max_value-min_value) 
    #

    dataset = tf.data.Dataset.from_tensor_slices((dict(dataframe), label_values))
    
    dataset = dataset.batch(batch_size)
    
    return dataset



In [5]:
def eval_input_func(dataframe=test_set, batch_size=33):
    
    
    dataframe = dataframe.copy() 
    
    label_values = dataframe.pop('quality')
    
    
    # normalization
    for name in dataframe.columns:
        min_value = dataframe[name].min()
        max_value = dataframe[name].max()
        dataframe[name] = (dataframe[name] - min_value)/(max_value-min_value) 
    #

    dataset = tf.data.Dataset.from_tensor_slices((dict(dataframe), label_values))
    
    dataset = dataset.batch(batch_size)
    
    return dataset

In [None]:
def serving_input_func():
    pass

## Feature Columns

In [6]:
# feature columns


feature_columns = []
feature_layer_inputs = {}

for column in train_set.columns:
    if column != 'quality':
        feature_columns.append(tf.feature_column.numeric_column(column))
        feature_layer_inputs[column] = tf.keras.Input(shape=(1,), name = column)


## Model Function

### Declaration and modes

In [None]:
def model_fn(features, labels, mode):
    
    
    
    
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
feature_layer_outputs = feature_layer(feature_layer_inputs)

h1 = tf.keras.layers.Dense(128, activation='relu')(feature_layer_outputs)

h2 = tf.keras.layers.Dense(64, activation = 'relu')(h1)

out = tf.keras.layers.Dense(9, activation = 'softmax')(h2)


model = tf.keras.Model(inputs=[v for v in feature_layer_inputs.values()], outputs=out)
    
if mode == tf.estimator.ModeKeys.TRAIN:
        
        
    loss = tf.keras.loss
    train_op = 
    predictions = None
        
        
elif mode == tf.estimator.ModeKeys.EVAL:
        
    loss = 
    train_op = None
    predictions = None
        
elif mode == tf.estimator.ModeKeys.PREDICT:
        
    loss = None
    train_op = None
    predictions = 
        
    

### Network Architecture

In [17]:
# used functional api

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
feature_layer_outputs = feature_layer(feature_layer_inputs)

h1 = tf.keras.layers.Dense(128, activation='relu')(feature_layer_outputs)

h2 = tf.keras.layers.Dense(64, activation = 'relu')(h1)

out = tf.keras.layers.Dense(9, activation = 'softmax')(h2)


model = tf.keras.Model(inputs=[v for v in feature_layer_inputs.values()], outputs=out)

#model = tf.keras.models.Sequential()
#model.add(tf.keras.layers.DenseFeatures(feature_columns))
#model.add(tf.keras.layers.Dense(200, activation='relu'))
#model.add(tf.keras.layers.Dense(8,activation='softmax'))




In [46]:
tf.print(feature_layer_outputs.graph)

<tensorflow.python.framework.func_graph.FuncGraph object at 0x7f9c9c9cbfd0>


float32


In [18]:
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=["accuracy"])

In [19]:
train_spec = tf.estimator.TrainSpec(input_fn=train_input_func, max_steps=1000)
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_func)

In [20]:
estimator = tf.keras.estimator.model_to_estimator(
  keras_model = model
)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using the Keras model provided.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpjy80ogg3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9c37198a90>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [21]:
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Warm-starting with WarmStartSettings: WarmStartSettings(ckpt_to_initialize_from='/tmp/tmpjy80ogg3/keras/keras_model.ckpt', vars_to_warm_start='.*', var_name_to_vocab_info={}, var_name_to_prev_var_name={})
INFO:tensorflow:Warm-starting from: /tmp/tmpjy80ogg3/keras/keras_model.ckpt
INFO:tensorflow:Warm-starting variables only in TRAINABLE_VARIABLES.
INFO:tensorflow:Warm-started 6 variables.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensor

({'accuracy': 0.37745097, 'loss': 1.8767463, 'global_step': 35}, [])

## Premade Estimator

### Instantiate an estimator

In [None]:
wine_classifier = tf.estimator.Estimator(
    model_fn, model_dir=None, config=None, params=None, warm_start_from=None
)

In [None]:
# Build a DNN with 2 hidden layers with 30 and 10 hidden nodes each.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 30 and 10 nodes respectively.
    hidden_units=[30,10],
    # The model must choose between 6 classes. [3-8]
    n_classes=6,
    label_vocabulary=STR_QUALITIES)

#### Problems with estimator compiling



Possible solution for qualities:


https://stackoverflow.com/questions/45813746/tensorflow-invalid-argument-assertation-failed-label-ids-must-n-classes

1. observed_qualities = [3,4,5,6,7,8] 

2. amount of qualities = 6 = n_classes


doulepse otan ebala gia count classes = 9 dhladh timh megaluterh tou max observed quality 8?

h logikh m leei oti tha eprepe na douleuei gia count classes = 6


tf docs


* n_classes: Number of label classes. Defaults to 2, namely binary classification. Must be > 1.

    I have multiclass classification with 6 classes so num_classes should be 6 am I right?
    
* label_vocabulary: A list of strings represents possible label values. If given, labels **===label_values=== train_y_string** must be string type and have any value in label_vocabulary. If it is not given, that means labels are already encoded as integer or float within [0, 1] for n_classes=2 and encoded as integer values in {0, 1,..., n_classes-1} for n_classes>2 . Also there will be errors if vocabulary is not provided and labels are string.



### Solutions


*Solution 1*


* Use STR_QUALITIES = ['3', '4', '5', '6', '7', '8'] as a list of strings.

* Use num_classes = 6 

* In tf.estimator.DNNClassifier use the local_vocabulary attribute and place it equal to STR_QUALITIES

* Convert the elements of train_y series namely the label values (TF calls the label values Labels) to string type via train_y_string_labels = train_y.apply(str)


*Solution 2*

* Use QUALITIES = [3,4,5,6,7,8] as a list of integers.

* use n_classes = 9 which which is the first higher value than the maximum label value (8) of train_y. 

* no need for local_vocabulary attribute here.



### Train, Evaluate and Predict

In [None]:
# Train the Model.

# steps = number_of_batches per epoch = 231 batches per epoch 

# batch_size = 55 

# number_of_training_samples = 1155 * 11 = 12705

# number_of_batches = 12705/55 = 231 in the training set

# 1 epoch equals to a parsing of the whole train_set

wine_classifier.train(
    input_fn=lambda: train_input_func(train_set, , training=True),
    steps=231)