# Estimators in TensorFlow 2.0
Inspired by https://www.tensorflow.org/alpha/tutorials/estimators/linear

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
# Read the Titanic training set and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/titanic-train.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [18]:
# Clean the frame without mutating it in place, so this cell can be re-run safely:
#   * drop 'Cabin' (only 204 of 891 values are present); errors='ignore' turns the
#     drop into a no-op when the column is already gone, whereas df.pop('Cabin')
#     raises KeyError on a second run,
#   * fill missing ages with the mean age,
#   * drop the rows that still contain a missing value (the 2 rows without 'Embarked').
df = df.drop(columns=['Cabin'], errors='ignore')
df = df.assign(Age=df['Age'].fillna(df['Age'].mean()))
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Name           889 non-null object
Sex            889 non-null object
Age            889 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Ticket         889 non-null object
Fare           889 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


In [19]:
# Separate the label from the features. pop() removes 'Survived' from df in place,
# so an unguarded second run of this cell would raise KeyError; with the guard a
# re-run is a no-op that keeps the previously extracted y.
if 'Survived' in df.columns:
    y = df.pop('Survived')

In [20]:
from sklearn.model_selection import train_test_split

In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=0,
)

In [38]:
# Columns treated as categories (vocabulary looked up from the training split)
# and columns fed to the model as raw float32 numbers.
cat_cols = ['Sex', 'SibSp', 'Parch', 'Pclass', 'Embarked']
num_cols = ['Age', 'Fare']

# Categorical columns first, numeric columns last -- same order as the column lists.
feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(name, X_train[name].unique())
    for name in cat_cols
] + [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in num_cols
]

In [39]:
# Display the assembled feature columns to check their keys, vocabularies and dtypes.
feature_columns

[VocabularyListCategoricalColumn(key='Sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='SibSp', vocabulary_list=(1, 0, 2, 4, 3, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='Parch', vocabulary_list=(0, 1, 2, 3, 5, 4, 6), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='Pclass', vocabulary_list=(3, 1, 2), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='Embarked', vocabulary_list=('Q', 'C', 'S'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [40]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32,
                  shuffle_buffer_size=1000):
    """Build an Estimator ``input_fn`` over in-memory features and labels.

    Args:
        data_df: Feature table; converted with ``dict(data_df)`` so that each
            column becomes one named feature.
        label_df: Labels, aligned row-for-row with ``data_df``.
        num_epochs: Number of passes over the data.
        shuffle: Whether to shuffle the examples before batching.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the shuffle buffer (default 1000, the value
            that used to be hard-coded). Use at least the number of rows to get
            a uniform shuffle.

    Returns:
        A zero-argument callable that builds and returns the ``tf.data.Dataset``
        of ``(features, labels)`` batches.
    """
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(shuffle_buffer_size)
        # Batch before repeating, so a batch never straddles an epoch boundary
        # (the last batch of each epoch may be smaller).
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds
    return input_function

# Training uses the defaults (shuffled, 10 epochs); evaluation is a single ordered pass.
train_input_fn = make_input_fn(data_df=X_train, label_df=y_train)
eval_input_fn = make_input_fn(data_df=X_test, label_df=y_test, shuffle=False, num_epochs=1)

In [41]:
# Sanity check: pull one small batch from the input pipeline and inspect it.
ds = make_input_fn(X_train, y_train, batch_size=10)()
for feature_batch, label_batch in ds.take(1):
    print('Some feature keys:', list(feature_batch), end='\n\n')
    print('A batch of Pclass:', feature_batch['Pclass'].numpy(), end='\n\n')
    print('A batch of Labels:', label_batch.numpy())

Some feature keys: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked']

A batch of Pclass: [3 3 1 3 1 2 3 3 3 3]

A batch of Labels: [0 0 1 0 1 0 0 0 1 0]


In [53]:
# Canned linear classifier over the feature columns; the estimator saves its
# checkpoints and TensorBoard event files under model_dir.
model = tf.estimator.LinearClassifier(
    model_dir='/tmp/tensorboard/linear_estimator/',
    feature_columns=feature_columns,
)

In [50]:
# Fit the estimator on the training input pipeline.
model.train(input_fn=train_input_fn)

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7f05f7612ac8>

In [51]:
# Score the trained model on the held-out split.
result = model.evaluate(input_fn=eval_input_fn)

In [52]:
# Display the evaluation metrics stored in `result`.
result

{'accuracy': 0.6573034,
 'accuracy_baseline': 0.5898876,
 'auc': 0.77677757,
 'auc_precision_recall': 0.71410656,
 'average_loss': 0.834942,
 'label/mean': 0.41011235,
 'loss': 0.8057956,
 'precision': 0.5508475,
 'prediction/mean': 0.6711619,
 'recall': 0.89041096,
 'global_step': 230}

## Exercise

TensorFlow 2.0 provides the following canned estimators in `tf.estimator`, including the `LinearClassifier` used above:

- BaselineClassifier
- BaselineEstimator
- BaselineRegressor
- BoostedTreesClassifier
- BoostedTreesRegressor
- DNNClassifier
- DNNEstimator
- DNNRegressor
- DNNLinearCombinedClassifier
- DNNLinearCombinedEstimator
- DNNLinearCombinedRegressor
- LinearClassifier
- LinearEstimator
- LinearRegressor

Pick one or more of the estimators above, train them on the same Titanic data, and compare their evaluation metrics with the `LinearClassifier` results.