# Feature Columns in TensorFlow 2.0

Inspired by https://www.tensorflow.org/alpha/tutorials/keras/feature_columns

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the credit dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/australian_credit.csv')

- Data adapted from [here](https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data)
- Attributes from [here](https://www.researchgate.net/publication/3297254_A_Compact_and_Accurate_Model_for_Classification)


|Column| Values| Type|
| :--- | :---  | :--- |
|A1 (Sex) |  0, 1 |Nominal|
|A2 (Age) |  13.75 - 80.25 |Continuous|
|A3 (Mean time at addresses) |  0 - 28 |Continuous|
|A4 (Home status) |  1, 2, 3 |Nominal|
|A5 (Current occupation) |  1 - 14 |Nominal|
|A6 (Current job status) |  1 - 9 |Nominal|
|A7 (Mean time with employers) |  0 - 28.5 |Continuous|
|A8 (Other investments) |  0, 1 |Nominal|
|A9 (Bank account) |  0, 1 |Nominal|
|A10 (Time with bank) |  0 - 67 |Continuous|
|A11 (Liability reference) |  0, 1 |Nominal|
|A12 (Account reference) |  1, 2, 3 |Nominal|
|A13 (Monthly housing expense) |  0 - 2000 |Continuous|
|A14 (Savings account balance) |  1 - 100001 |Continuous| 

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Print a summary of the DataFrame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Show descriptive statistics for the DataFrame's columns.
df.describe()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing, then split 20% of the remainder off as
# a validation set (roughly 64% train / 16% validation / 20% test overall).
# random_state=0 fixes the shuffling so the split is reproducible.
train_val, test = train_test_split(df, test_size=0.2, random_state=0)
train, val = train_test_split(train_val, test_size=0.2, random_state=0)

In [None]:
# (rows, columns) of the training split.
train.shape

In [None]:
# (rows, columns) of the validation split.
val.shape

In [None]:
# (rows, columns) of the test split.
test.shape

## Batch generation with tf.data.Dataset

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Convert a DataFrame into a batched ``tf.data.Dataset``.

    The ``'class'`` column is split off as the label; the remaining columns
    are passed as a dict keyed by column name, so every dataset element is a
    ``(features_dict, label)`` pair. The input DataFrame is not modified.

    Args:
        dataframe: DataFrame containing the feature columns and a
            ``'class'`` column.
        shuffle: When true, shuffle the rows using a buffer as large as the
            DataFrame before batching.
        batch_size: Number of rows per batch.

    Returns:
        The batched dataset.
    """
    # Work on a copy: pop() below removes the label column in place.
    features = dataframe.copy()
    target = features.pop('class')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [None]:
# Batch size for the example datasets used in the feature-column demos;
# it is reassigned to 32 before training further down.
batch_size = 5

In [None]:
# Build one dataset per split; only the training set is shuffled.
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# Inspect a single training batch: the features arrive as a dict keyed by
# column name, the labels as a separate tensor.
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature:', list(feature_batch.keys()))
    print('A batch of ages:', feature_batch['age'])
    print('A batch of targets:', label_batch )

In [None]:
# Keep the feature dict (element [0] of the (features, labels) pair) of one
# training batch; demo() below applies feature columns to it.
example_batch = next(iter(train_ds))[0]

## Feature Columns

In [None]:
def demo(feature_column):
    """Print the dense output of ``feature_column`` applied to ``example_batch``.

    A ``DenseFeatures`` layer is built from the given column(s), called on the
    notebook-level ``example_batch`` and the result is printed as a NumPy array.
    """
    dense_output = tf.keras.layers.DenseFeatures(feature_column)(example_batch)
    print(dense_output.numpy())

In [None]:
# Numeric column: the 'age' feature is used as a real-valued input.
age = tf.feature_column.numeric_column("age")
demo(age)

In [None]:
# Bucketized column: age is discretized by the listed boundaries and encoded
# as a one-hot vector (10 boundaries give 11 buckets).
age_buckets = tf.feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
demo(age_buckets)

In [None]:
# Distinct occupation values in the full dataset, used as the vocabulary below.
occupation_vocab = df['occupation'].unique()
occupation_vocab

In [None]:
# Categorical column over the known vocabulary; the indicator column wraps it
# so DenseFeatures can emit it as a dense one-hot vector.
occupation = tf.feature_column.categorical_column_with_vocabulary_list(
      'occupation', occupation_vocab)
occupation_one_hot = tf.feature_column.indicator_column(occupation)
demo(occupation_one_hot)

In [None]:
# Embedding column: represents each occupation as an 8-dimensional dense
# vector instead of a one-hot vector.
occupation_embedding = tf.feature_column.embedding_column(
    occupation, dimension=8)
demo(occupation_embedding)

In [None]:
# Hash-bucket categorical column: values are hashed into 1000 buckets, so no
# explicit vocabulary is needed. The name is then rebound to the indicator
# (dense) wrapper of that column.
# NOTE(review): categorical_column_with_hash_bucket defaults to dtype=tf.string;
# if 'occupation' is stored as integers in the CSV, dtype=tf.int64 may be
# required here -- confirm against the data.
occupation_hashed = tf.feature_column.categorical_column_with_hash_bucket(
      'occupation', hash_bucket_size=1000)
occupation_hashed = tf.feature_column.indicator_column(occupation_hashed)
demo(occupation_hashed)

In [None]:
# Crossed column: combines the age bucket and the occupation into a single
# feature hashed into 1000 buckets. The name is then rebound to the indicator
# (dense) wrapper of that column.
crossed_feature = tf.feature_column.crossed_column(
    [age_buckets, occupation], hash_bucket_size=1000)
crossed_feature = tf.feature_column.indicator_column(crossed_feature)
demo(crossed_feature)

In [None]:
# Names of the features that are fed to the model as plain numeric columns.
numeric_cols = ['age', 'time_at_addr', 'time_w_empl',
                'time_w_bank', 'monthly_housing', 'savings_balance']

In [None]:
# One numeric column per continuous feature ...
feature_columns = [
    tf.feature_column.numeric_column(col_name) for col_name in numeric_cols
]

# ... followed by the engineered columns built in the cells above.
feature_columns += [
    age_buckets,
    occupation_one_hot,
    occupation_embedding,
    crossed_feature,
]

## Model Train and Evaluate

### Baseline

In [None]:
# Fraction of rows belonging to each class, indexed by class label.
classes_ratio = df['class'].value_counts() / len(df)
classes_ratio

In [None]:
# Baseline accuracy: the score obtained by always predicting the most frequent
# class. Taking .max() rather than indexing with [0] avoids depending on how
# the class labels are encoded or which label happens to be the majority.
baseline = classes_ratio.max()
baseline

### Model

In [None]:
from tensorflow.keras.layers import Dense, DenseFeatures
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam, RMSprop

In [None]:
# Feed-forward binary classifier: DenseFeatures turns the feature dict into a
# single dense tensor, followed by two ReLU hidden layers and a one-unit
# sigmoid output.
model = Sequential([
  DenseFeatures(feature_columns),
  Dense(128, activation='relu'),
  Dense(64, activation='relu'),
  Dense(1, activation='sigmoid')
])

# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Rebuild the datasets with a batch size of 32 for training and evaluation;
# only the training set is shuffled.
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# Train for 15 epochs, evaluating on the validation set after each epoch.
# The returned object's .history is plotted in the next cell.
h = model.fit(train_ds, 
              validation_data=val_ds, 
              epochs=15)

In [None]:
# Plot the recorded training/validation metrics per epoch, with the baseline
# drawn as a horizontal black line. The y-axis is fixed to [0, 1], so values
# above 1 are not visible. The trailing semicolon suppresses the notebook's
# text output for the last expression.
pd.DataFrame(h.history).plot()
plt.ylim(0, 1)
plt.axhline(baseline, c='black');

In [None]:
# Evaluate on the held-out test set; evaluate() returns the loss followed by
# the metrics passed to compile() (here: accuracy).
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

## Exercise 

Add other feature columns and iterate to improve the model.