# Premade Estimator

## Train, Valid, Test Split

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import pandas as pd

from sklearn.model_selection import train_test_split

import json

In [None]:
# Report the Python interpreter version this notebook is running on.
from platform import python_version
print(python_version())

In [2]:
# Load the wine-quality data from a CSV file (path relative to the notebook).
wines_df = pd.read_csv("../data/winequality.csv")

In [None]:
# Show the loaded DataFrame in the notebook output.
display(wines_df)


In [3]:
# TensorFlow does not accept spaces in column names, so every space is
# swapped for an underscore before the frame is used any further.
new_col_list = [col_name.replace(" ", "_") for col_name in wines_df.columns]
wines_df.columns = new_col_list
print(new_col_list)


['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']


In [24]:
# Constants used when parsing the dataset.

# Column names after the space-to-underscore renaming; the last entry,
# 'quality', is the target label and the first eleven are the features.
CSV_COLUMN_NAMES = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']



# Quality scores from 1 to 10.
QUALITIES = [1,2,3,4,5,6,7,8,9,10]

# The subset of quality scores noted as observed in this dataset (3 to 8).
OBSERVED_QUALITIES = [3,4,5,6,7,8]





Possible solution for qualities:

1. set observed_qualities = [3,4,5,6,7,8]

2. set count_classes to len(observed_qualities) + 1  = 6 + 1 = 7



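Question: why did it work when I set count_classes = 9, i.e. a value greater than the maximum observed quality (8)?

My reasoning says it should work with count_classes = 6.

Likely explanation: the classifier treats every label as a class index in the range [0, n_classes), and the raw quality values are used as labels without re-mapping. n_classes must therefore be at least max(label) + 1 = 9. Using 6 classes would first require mapping the qualities 3–8 onto 0–5.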

In [20]:
# Number of classes handed to the DNNClassifier below through n_classes.
count_classes = 9

We need to supply a training, a validation and a test set to TF, so we have to split the dataset into three separate datasets.

In [6]:
from sklearn.model_selection import train_test_split

# Shuffle the rows: sample(frac=1) returns every row in random order.
wines_df = wines_df.sample(frac=1)

# Two successive 85/15 splits: 15% of all rows become the test set, then 15%
# of the remaining 85% become the validation set. That gives roughly
# 72% training, 13% validation and 15% test.
train_valid_df, test_set = train_test_split(wines_df, test_size=0.15)
train_set, valid_set = train_test_split(train_valid_df, test_size=0.15)

# Separate the target label from the features in every split, so that the
# validation frame has the same feature-only layout as the other two.
train_y = train_set.pop('quality')
valid_y = valid_set.pop('quality')
test_y = test_set.pop('quality')

# The target label column has now been removed from the features.
train_set.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
1560,7.8,0.6,0.26,2.0,0.08,31.0,131.0,0.99622,3.21,0.52,9.9
307,10.3,0.41,0.42,2.4,0.213,6.0,14.0,0.9994,3.19,0.62,9.5
1070,9.3,0.33,0.45,1.5,0.057,19.0,37.0,0.99498,3.18,0.89,11.1
59,7.3,0.39,0.31,2.4,0.074,9.0,46.0,0.9962,3.41,0.54,9.4
491,9.2,0.41,0.5,2.5,0.055,12.0,25.0,0.9952,3.34,0.79,13.3


In [None]:
# Serialise the first row of the frame, every column except the last one,
# to a JSON string.
data_srs = wines_df.iloc[0, :-1]
my_json = data_srs.to_json()

# Bare expression so the notebook echoes the resulting string.
my_json

## Create a dataset input function

You must create input functions to supply data for training, evaluating, and prediction.

An input function is a function that returns a tf.data.Dataset object which outputs the following two-element tuple:

* features - A Python dictionary in which:
    * Each key is the name of a feature.
    * Each value is an array containing all of the feature's values.
* label - A tensor containing the values of the target label for every example.

Just to demonstrate the format of the input function, here's a simple implementation:

In [7]:
def a_basic_input_function():
    """Return a tiny hard-coded (features, labels) pair.

    Demonstrates the shape of data an input function provides: a dict that
    maps each feature name to an array of values, and an array of labels
    with one entry per example.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` maps the eleven feature
        names to arrays of three values each and ``labels`` is an array of
        three quality scores.
    """
    # numpy is not imported at the top of the notebook, so bring it into
    # scope here; without this the function fails with NameError on `np`.
    import numpy as np

    features = {'fixed_acidity': np.array([6.9, 6.2, 7.1]),
                'volatile_acidity': np.array([0.685, 0.58, 0.43]),
                'citric_acid': np.array([0., 0., 0.42]),
                'residual_sugar': np.array([2.5, 1.6, 5.5]),
                'chlorides': np.array([0.105, 0.065, 0.07]),
                'free_sulfur_dioxide': np.array([22., 8., 29.]),
                'total_sulfur_dioxide': np.array([37., 18., 129.]),
                'density': np.array([0.9966, 0.9966, 0.9973]),
                'pH': np.array([3.46, 3.56, 3.42]),
                'sulphates': np.array([0.57, 0.84, 0.72]),
                'alcohol': np.array([10.6, 9.4, 10.5])}
    labels = np.array([6, 5, 6])
    return features, labels

Your input function may generate the features dictionary and label list any way you like. However, we recommend using TensorFlow's Dataset API, which can parse all sorts of data.

The Dataset API can handle a lot of common cases for you. For example, using the Dataset API, you can easily read in records from a large collection of files in parallel and join them into a single stream.

To keep things simple in this example you are going to load the data with pandas, and build an input pipeline from this in-memory data:

**features (dict)** keys: CSV_COLUMN_NAMES [0:-1] (namely excluding the label of the target variable) , values: np.arrays of the features values 

**labels (np.array)** the values of the target variable

In [8]:
# Every column name except the last one ('quality'), i.e. the feature names.
CSV_COLUMN_NAMES[0:-1]

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [21]:


def input_fn(features = CSV_COLUMN_NAMES[0:-1] , labels = OBSERVED_QUALITIES, training=True, batch_size=55, shuffle_buffer_size=1000):
    """Build the batched dataset used for training and evaluation.

    Parameters
    ----------
    features
        Anything ``dict()`` can turn into a name -> values mapping; the
        notebook passes a pandas DataFrame of feature columns.
    labels
        The target values, one per example.
    training : bool
        When true the dataset is shuffled and repeated indefinitely, so the
        caller has to bound training with ``steps``. When false the data is
        emitted once, in order.
    batch_size : int
        Number of examples per batch.
    shuffle_buffer_size : int
        Size of the shuffle buffer used in training mode. Defaults to the
        previously hard-coded 1000.
        NOTE(review): a buffer smaller than the dataset presumably gives
        only an approximate shuffle; confirm against the tf.data docs.

    Returns
    -------
    tf.data.Dataset
        Batches of ``(features_dict, labels)``.

    NOTE(review): the default ``features`` is a plain list of column names,
    which ``dict()`` cannot convert, so the defaults look like placeholders;
    the calls in this notebook pass both arguments explicitly.
    """
    # Convert the inputs to a Dataset of (feature dict, label) pairs.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle and repeat only in training mode.
    if training:
        dataset = dataset.shuffle(shuffle_buffer_size).repeat()

    return dataset.batch(batch_size)
        
                                                 
                                                 

## Define the feature columns

In [10]:
# Feature columns describe how to use the input.
my_feature_columns = []
for key in train_set.keys():
    my_feature_columns.append(tf.feature_column.numeric_column(key=key))

## Instantiate an estimator

In [22]:
# Build a DNN with 2 hidden layers with 30 and 10 hidden nodes each.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 30 and 10 nodes respectively.
    hidden_units=[30,10],
    # Number of classes, taken from count_classes (9 in this notebook) rather
    # than the 6 observed quality values [3-8].
    n_classes=count_classes)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp69sagwdz', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f48808b7d10>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


## Train, Evaluate and Predict

In [23]:
# Train the Model.
#
# batch_size = 55 (the default of input_fn).
#
# NOTE(review): the earlier note here computed 1155 * 11 = 12705 training
# samples, i.e. rows times feature columns. Batches are built from rows, so
# assuming 1155 is the number of training rows, one epoch is
# 1155 / 55 = 21 batches and steps=231 is about 11 passes over train_set,
# not one.
#
# 1 epoch equals one full pass over train_set.

classifier.train(
    input_fn=lambda: input_fn(train_set, train_y, training=True),
    steps=231)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp69sagwdz/model.ckpt.
INFO:tensorflow:loss = 2.3465486, step = 0
INFO:tensorflow:global_step/sec: 259.932
INFO:tensorflow:loss = 1.9587638, step = 100 (0.386 sec)
INFO:tensorflow:global_step/sec: 403.867
INFO:tensorflow:loss = 1.9142903, step = 200 (0.248 sec)
INFO:tensorflow:Saving checkpoints for 231 into /tmp/tmp69sagwdz/model.ckpt.
INFO:tensorflow:Loss for final step: 1.8755589.


<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7f4877da3c10>

In [None]:
# Evaluate on the held-out test split. The split is named `test_set`; the
# previous `test` was never defined. training=False turns off shuffling and
# repeating in input_fn.
eval_result = classifier.evaluate(
    input_fn=lambda: input_fn(test_set, test_y, training=False))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

Unlike the call to the train method, *you did not pass the steps argument to evaluate*. The input_fn for eval only yields a **single epoch** of data.

The eval_result dictionary also contains *the average_loss* (mean loss per sample), *the loss* (mean loss per mini-batch) and the value of the *estimator's global_step* (the number of training iterations it underwent).

In [None]:
# Print the whole evaluation result dictionary, not only the accuracy.
print(eval_result)

## Making predictions (inferring) from the trained model

In [None]:
# Generate predictions from the model
expected = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
predict_x = {
    'fixed_acidity': [7.1, 5.6, 0.7],
    'volatile_acidity': [0.150, 0.760, 0,352],
    'citric_acid': [0.0, 0.25, 0.13],
    'residual_sugar': [0.3, 1.5, 2.4],
    'chlorides': [0.034, 0.012, 0.056],
    'free_sulfur_dioxide': [14.0, 12.0, 15.0],
    'total_sulfur_dioxide':[45.0, 12.0, 56.0],
    'density':[0.98334, 0.96423, 0.9731],
    'pH':[3.12, 3.56, 3.78],
    'sulphates':[0.56, 0.75, 0.67],
    'alcohol':[12.5, 11.2, 10.3]


}

def input_fn(features, batch_size=256):
    """An input function for prediction."""
    # Convert the inputs to a Dataset without labels.
    return tf.data.Dataset.from_tensor_slices(dict(features)).batch(batch_size)

predictions = classifier.predict(
    input_fn=lambda: input_fn(predict_x))

In [None]:
# The trailing dot made this a syntax error. Printing the object returned by
# predict() does not iterate it, so the loop that follows still sees every
# prediction.
print(predictions)

The predict method returns a Python iterable, yielding a dictionary of prediction results for each example. The following code prints a few predictions and their probabilities:

In [None]:
# Report the predicted class and its probability for each example.
for pred_dict, expec in zip(predictions, expected):
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]

    # There is no SPECIES lookup table in this notebook: the labels are the
    # raw quality scores, so the class id is printed directly.
    print('Prediction is "{}" ({:.1f}%), expected "{}"'.format(
        class_id, 100 * probability, expec))

# Custom Estimator 

1. input_func : transforms raw data to Dataset objects.

2. feature_func : function that defines the feature cols of the datasets

3. model_func : the heart of the estimator. This function specifies the type of model used to make predictions and its characteristics, e.g. a DNN with k layers, and so on.

4. train_func, eval_func, test_func : functions relevant to implement the training, evaluation and testing procedures.


## Input_func

Importance (why we need an input func in our workflow?)


Functionality (what does an input func do?)


Implementation (how does the input func do what it is supposed to do?)

In [None]:
# input_func(csv) -> [train_set, valid_set, test_set]

In [None]:
def input_func():
    """Skeleton of the custom estimator's input function.

    Intended to return a ``(feature_dict, label)`` pair.

    NOTE(review): `feature_dict` and `label` are never assigned in the body,
    so unless they exist at module level, calling this raises NameError.
    """
    ...  # manipulate dataset, extracting the feature dict and the label
    return feature_dict, label


## Feature_func

feature_func(csv_header) ------> (features, target)



* We need to define the data type for every attribute column.

* We need to normalize each attribute according to its type and value.

In [None]:
# Define the feature columns including their names and type of data they contain.

def feature_func(csv_header):
    """Sketch of the feature-column definitions for the custom estimator.

    Creates three numeric feature columns; `median_education` gets a
    normalizer that subtracts `global_education_mean` from its input.

    NOTE(review): `csv_header` is not used, the columns are not returned, and
    `global_education_mean` is not defined in the visible part of the file.
    The column names do not match the wine dataset columns, so this is
    presumably copied from an example; confirm before relying on it.
    """

    population = tf.feature_column.numeric_column('population')
    crime_rate = tf.feature_column.numeric_column('crime_rate')
    median_education = tf.feature_column.numeric_column(
        'median_education', normalizer_fn=lambda x: x - global_education_mean)

## Model_func or Model_class? probably the 

In [None]:
# model_func(feature_columns, hidden_units=[some_layer_1_nodes, ..., some_layer_n_nodes], n_classes=8) -> wine.classifier

In [None]:
# Instantiate an estimator, by passing in the feature columns.


def model_func(feature_columns, hidden_units=None, n_classes=8):
    """Build the wine classifier from a premade TensorFlow estimator.

    Parameters
    ----------
    feature_columns
        Feature columns describing the model inputs.
    hidden_units : list of int, optional
        Number of nodes in each hidden layer. Defaults to ``[30, 10]``, the
        layout used by the premade classifier earlier in the notebook. The
        previous default was a list of undefined placeholder names.
    n_classes : int
        Number of output classes.
        NOTE(review): the premade section passes 9 because the raw quality
        labels go up to 8; confirm whether 8 is the intended default here.

    Returns
    -------
    tf.estimator.DNNClassifier
        The configured, untrained classifier.
    """
    # using premade at first then extend it to custom
    if hidden_units is None:
        hidden_units = [30, 10]
    wine_classifier = tf.estimator.DNNClassifier(
        feature_columns=feature_columns,
        hidden_units=hidden_units,
        n_classes=n_classes)
    return wine_classifier

    
    

class BPSomeClass(object):
    """Brief class description

    Some more extensive description

    Attributes
    ----------
    attr1 : string
        Purpose of attr1.
    attr2 : float
        Purpose of attr2.

    """

    def __init__(self, param1, param2, param3=0):
        """Example of docstring on the __init__ method.

        Stores `param1` and `param2` on the instance and prints the integer
        quotient of `param3` divided by 4.

        Parameters
        ----------
        param1 : str
            Description of `param1`.
        param2 : float
            Description of `param2`.
        param3 : int, optional
            Description of `param3`, defaults to 0.

        """
        self.attr1 = param1
        self.attr2 = param2
        print(param3 // 4)

    @property
    def attribute2(self):
        """float: the current value of `attr2`."""
        return self.attr2

    @attribute2.setter
    def attribute2(self, new_attr2):
        """Replace `attr2` after checking that the new value is a float.

        Raises
        ------
        ValueError
            If `new_attr2` is not a float.
        """
        # isinstance takes (object, type). The arguments used to be swapped,
        # which made every assignment fail with TypeError.
        if not isinstance(new_attr2, float):
            raise ValueError("attribute2 must be a float, not {0}".format(new_attr2))
        self.attr2 = new_attr2


# Exercise the class: build an instance, read attribute2 through the property
# getter, then assign a new float through the property setter.
bp_obj = BPSomeClass("a", 1.618)
print(bp_obj.attribute2)
bp_obj.attribute2 = 3.236

















# Wine.Classifier Methods

## train_method

In [None]:
# `input_fn` is the function created in Step 1

def train_func(arg):
    # NOTE(review): sketch only. `estimator` is not defined in this function
    # and `arg` is unused. The premade section calls
    # classifier.train(input_fn=<callable>, steps=...); here the keyword is
    # spelled `input_func` and a dataset is passed instead of a callable.
    # Confirm against the Estimator API before use.
    estimator.train(input_func=train_set, steps=2000)
    pass


## val_method

In [None]:
def eval_func(arg):
    # NOTE(review): pseudo-code. The `.....` placeholder is not valid Python,
    # `estimator` and `eval_set` are not defined in view, and `arg` is unused.
    # The premade section evaluates with
    # classifier.evaluate(input_fn=<callable>); confirm the intended call.
    estimator.eval(input_func=eval_set, .....)
    pass


## test_method


In [None]:
def test_func(arg):
    # NOTE(review): pseudo-code. The `.....` placeholder is not valid Python,
    # `estimator` is not defined in view, and `arg` is unused. The premade
    # section uses classifier.predict(input_fn=<callable>) for inference and
    # classifier.evaluate(...) for test metrics; confirm which is meant here.
    estimator.test(input_func=test_set, .....)
    pass