# STEP 3 - Feature Selection

A central model for tff has been found.
The model with all features proved to be less accurate.
The next step is to evaluate which features should be selected for the best prediction quality.
This is done by training models on all possible feature subsets and comparing the results.
The features expected to be most important are:

* the temporal features (all time components including is_weekend)
* user id

## Imports

In [17]:
import tensorflow as tf
from tensorflow import feature_column
import itertools
import pandas as pd
import numpy as np
import import_ipynb

In [18]:
from model_helper import ModelHelper

## Central Model

This model is roughly the same as the second model used for the categories; in the following, however, the prediction is analyzed manually in order to figure out which patterns are learnt.

In [19]:
# Load the preprocessed data set and preview its first 100 rows
df = pd.read_csv(filepath_or_buffer="./4square/processed_transformed_big.csv")
df.head(n=100)

Unnamed: 0,cat_id,user_id,latitude,longitude,is_weekend,clock_sin,clock_cos,day_sin,day_cos,month_sin,month_cos,week_day_sin,week_day_cos,venue_id,orig_cat_id
0,0,470,40.719810,-74.002581,False,-1.000000,0.000654,0.587785,0.809017,0.866025,-0.5,0.781831,0.62349,0,0
1,1,979,40.606800,-74.044170,False,-0.999998,0.001818,0.587785,0.809017,0.866025,-0.5,0.781831,0.62349,1,1
2,2,69,40.716162,-73.883070,False,-0.999945,0.010472,0.587785,0.809017,0.866025,-0.5,0.781831,0.62349,2,2
3,3,395,40.745164,-73.982519,False,-0.999931,0.011708,0.587785,0.809017,0.866025,-0.5,0.781831,0.62349,3,3
4,4,87,40.740104,-73.989658,False,-0.999914,0.013090,0.587785,0.809017,0.866025,-0.5,0.781831,0.62349,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7,445,40.828602,-73.879259,False,-0.959601,0.281365,0.587785,0.809017,0.866025,-0.5,0.781831,0.62349,93,24
96,6,235,40.745463,-73.990983,False,-0.956326,0.292302,0.587785,0.809017,0.866025,-0.5,0.781831,0.62349,94,6
97,8,118,40.600144,-73.946593,False,-0.955729,0.294249,0.587785,0.809017,0.866025,-0.5,0.781831,0.62349,95,57
98,2,1054,40.870630,-74.097926,False,-0.955407,0.295291,0.587785,0.809017,0.866025,-0.5,0.781831,0.62349,96,58


In [20]:
# Vocabulary size = number of distinct values in the cat_id column
categories = df["cat_id"]
vocab_size = categories.nunique()

print('vocabulary size:', vocab_size)

vocabulary size: 27


In [21]:
EMBEDDING_DIM = 64  # output dimension of every categorical embedding column

rnn_units1 = 128  # units of the GRU layer that feeds the output layer
rnn_units2 = 64  # kept for other cells; not used by create_keras_model (see NOTE below)


def create_keras_model():
  """Build the stateful GRU sequence model for the currently selected features.

  The function takes no arguments; it reads these module-level names, which
  must be set before it is called:

  * ``mh`` -- provides ``vocab_size``, ``total_window_length`` and
    ``batch_size``.
  * ``numerical_column_names`` -- names of the numerical input columns.
  * ``categorical_columns`` -- objects exposing ``feature_name``,
    ``vocab_size`` and ``use_embedding``.

  Every input is a sequence of length ``total_window_length - 1``. Categorical
  inputs are embedded (or one-hot encoded), numerical inputs get a trailing
  feature axis, and everything is concatenated along the feature axis before
  it is fed to a GRU that returns the full sequence. A Dense layer with
  ``vocab_size`` units followed by a softmax is applied to that sequence.

  Returns:
    An uncompiled ``tf.keras.Model``.
  """
  vocab_size = mh.vocab_size
  N = mh.total_window_length
  batch_size = mh.batch_size

  # Shortcut to the layers package
  l = tf.keras.layers

  # One Input per numerical column, keyed by column name.
  # batch_size is fixed because the GRU below is stateful.
  feature_inputs = {
      c_name: tf.keras.Input((N-1,), batch_size=batch_size, name=c_name)
      for c_name in numerical_column_names
  }

  # We cannot use an array of features as always because we have sequences
  # We have to do one by one in order to match the shape
  num_features = []
  for c_name in numerical_column_names:
    f = feature_column.numeric_column(c_name, shape=(N-1))
    feature = l.DenseFeatures(f)(feature_inputs)
    # Add a trailing feature axis so the tensor can be concatenated on axis 2
    feature = tf.expand_dims(feature, -1)
    num_features.append(feature)

  categorical_feature_inputs = []
  categorical_features = []
  for categorical_feature in categorical_columns:
    name = categorical_feature.feature_name
    # Each categorical input is passed as its own single-entry dict
    categorical_feature_inputs.append(
        {name: tf.keras.Input((N-1,), batch_size=batch_size, dtype=tf.dtypes.int32, name=name)})

    # The vocabulary is the id range 0 .. vocab_size - 1
    one_hot = feature_column.sequence_categorical_column_with_vocabulary_list(
        name, list(range(categorical_feature.vocab_size)))

    if categorical_feature.use_embedding:
      # Embed the one-hot encoding
      categorical_features.append(feature_column.embedding_column(one_hot, EMBEDDING_DIM))
    else:
      categorical_features.append(feature_column.indicator_column(one_hot))

  seq_features = []
  for column, column_input in zip(categorical_features, categorical_feature_inputs):
    # SequenceFeatures also returns the sequence lengths, which are not needed
    sequence_features, _ = tf.keras.experimental.SequenceFeatures(column)(column_input)
    seq_features.append(sequence_features)

  input_sequence = l.Concatenate(axis=2)(seq_features + num_features)

  # Rnn
  recurrent = l.GRU(rnn_units1,
                    batch_size=batch_size,  # in case of stateful
                    return_sequences=True,
                    stateful=True,
                    recurrent_initializer='glorot_uniform')(input_sequence)

  # NOTE(review): the original code also built a second GRU (rnn_units2) on
  # top of `recurrent`, but its output was never connected to the Dense layer,
  # so it was not part of the returned model. It has been removed as dead
  # code; confirm whether a two-layer stack was actually intended.

  # Last layer with an output for each place
  dense_1 = l.Dense(vocab_size)(recurrent)

  # Softmax output layer
  output = l.Softmax()(dense_1)

  # To return the Model, we need to define its inputs and outputs
  # In our case, we need to list all the input layers we have defined
  inputs = list(feature_inputs.values()) + categorical_feature_inputs

  # Return the Model
  return tf.keras.Model(inputs=inputs, outputs=output)

In [22]:
class CategoricalFeature:
  """Plain record describing one categorical input column.

  Attributes:
    feature_name: name of the column.
    vocab_size: size of the column's vocabulary.
    use_embedding: flag selecting an embedding instead of a one-hot encoding.
  """

  def __init__(self, feature_name, vocab_size, use_embedding):
    # Store the three constructor arguments unchanged
    (self.feature_name,
     self.vocab_size,
     self.use_embedding) = (feature_name, vocab_size, use_embedding)

In [23]:
# Number of distinct values in each id column
vocab_size, users_size, venues_size, orig_cats_size = (
    df[column].unique().size
    for column in ('cat_id', 'user_id', 'venue_id', 'orig_cat_id')
)

In [24]:
# Numerical columns that can be fed to the model
all_num_column_names = ['latitude', 'longitude', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin',
                          'month_cos', 'week_day_sin', 'week_day_cos']

# Groups of columns that are dropped together; every non-empty combination
# of these groups is evaluated below.
# NOTE(review): 'is_weekend' is in neither all_num_column_names nor
# all_cat_columns, so dropping it does not change the inputs built by
# create_keras_model -- confirm whether it should be a model feature.
drop_columns = [['user_id'], ['latitude', 'longitude'], ['is_weekend'], ['venue_id'], ['orig_cat_id']]

# Categorical columns that can be fed to the model (all embedded)
all_cat_columns = [
      CategoricalFeature('user_id', users_size, True),
      CategoricalFeature('cat_id', vocab_size, True),
      CategoricalFeature('venue_id', venues_size, True),
      CategoricalFeature('orig_cat_id', orig_cats_size, True)]

# (excluded columns, evaluation result) per trained model, kept so the runs
# can be compared afterwards. What evaluate_model() returns is not visible
# here -- TODO confirm it returns the metrics.
evaluation_results = []

for L in range(1, len(drop_columns) + 1):
    for subset in itertools.combinations(drop_columns, L):

        # Many models are built in this loop; reset Keras' global state so
        # earlier models do not keep accumulating.
        tf.keras.backend.clear_session()

        mh = ModelHelper(df.copy(), 17)
        mh.set_vocab_size(vocab_size)

        # Flatten the selected groups into one list of excluded columns
        cols = list(itertools.chain.from_iterable(subset))
        column_names = [i for i in df.columns.values if i not in cols]
        # These two names are read as globals by create_keras_model()
        numerical_column_names = [i for i in all_num_column_names if i not in cols]
        categorical_columns = [i for i in all_cat_columns if i.feature_name not in cols]

        print('Excluded columns: {c}'.format(c=cols))
        print('Included columns: {c}'.format(c=column_names))

        mh.set_target_column_name('cat_id')

        mh.set_numerical_column_names(numerical_column_names)

        mh.set_column_names(column_names)
        mh.set_client_column_name('user_id')
        mh.set_client_column_ids()

        mh.create_users_locations_from_df()
        mh.concat_split_users_locations(drop_client_column=False)

        mh.split_data()

        # Must be set before create_keras_model(), which reads mh.batch_size
        mh.set_batch_size(16)

        mh.create_and_batch_datasets(multi_target=True)

        mh.assign_model(create_keras_model())
        mh.set_num_epochs(10)
        mh.compile_model()
        mh.fit_model(verbose=0)

        evaluation_results.append((cols, mh.evaluate_model()))

Excluded columns: ['user_id']
Included columns: ['cat_id', 'latitude', 'longitude', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2031.89it/s]


Restoring model weights from the end of the best epoch.
Epoch 00008: early stopping
Excluded columns: ['latitude', 'longitude']
Included columns: ['cat_id', 'user_id', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 1990.81it/s]


Restoring model weights from the end of the best epoch.
Epoch 00006: early stopping
Excluded columns: ['is_weekend']
Included columns: ['cat_id', 'user_id', 'latitude', 'longitude', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2039.42it/s]


Excluded columns: ['venue_id']
Included columns: ['cat_id', 'user_id', 'latitude', 'longitude', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2024.30it/s]


Excluded columns: ['orig_cat_id']
Included columns: ['cat_id', 'user_id', 'latitude', 'longitude', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id']


100%|██████████| 1083/1083 [00:00<00:00, 1994.46it/s]


Excluded columns: ['user_id', 'latitude', 'longitude']
Included columns: ['cat_id', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2005.49it/s]


Restoring model weights from the end of the best epoch.
Epoch 00006: early stopping
Excluded columns: ['user_id', 'is_weekend']
Included columns: ['cat_id', 'latitude', 'longitude', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2026.11it/s]


Excluded columns: ['user_id', 'venue_id']
Included columns: ['cat_id', 'latitude', 'longitude', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2024.31it/s]


Excluded columns: ['user_id', 'orig_cat_id']
Included columns: ['cat_id', 'latitude', 'longitude', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id']


100%|██████████| 1083/1083 [00:00<00:00, 1987.16it/s]


Excluded columns: ['latitude', 'longitude', 'is_weekend']
Included columns: ['cat_id', 'user_id', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 1987.88it/s]


Excluded columns: ['latitude', 'longitude', 'venue_id']
Included columns: ['cat_id', 'user_id', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 1979.89it/s]


Restoring model weights from the end of the best epoch.
Epoch 00009: early stopping
Excluded columns: ['latitude', 'longitude', 'orig_cat_id']
Included columns: ['cat_id', 'user_id', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id']


100%|██████████| 1083/1083 [00:00<00:00, 2024.31it/s]


Restoring model weights from the end of the best epoch.
Epoch 00008: early stopping
Excluded columns: ['is_weekend', 'venue_id']
Included columns: ['cat_id', 'user_id', 'latitude', 'longitude', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2051.61it/s]


Excluded columns: ['is_weekend', 'orig_cat_id']
Included columns: ['cat_id', 'user_id', 'latitude', 'longitude', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id']


100%|██████████| 1083/1083 [00:00<00:00, 2009.68it/s]


Excluded columns: ['venue_id', 'orig_cat_id']
Included columns: ['cat_id', 'user_id', 'latitude', 'longitude', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos']


100%|██████████| 1083/1083 [00:00<00:00, 2058.93it/s]


Restoring model weights from the end of the best epoch.
Epoch 00008: early stopping
Excluded columns: ['user_id', 'latitude', 'longitude', 'is_weekend']
Included columns: ['cat_id', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2031.89it/s]


Excluded columns: ['user_id', 'latitude', 'longitude', 'venue_id']
Included columns: ['cat_id', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2028.09it/s]


Excluded columns: ['user_id', 'latitude', 'longitude', 'orig_cat_id']
Included columns: ['cat_id', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id']


100%|██████████| 1083/1083 [00:00<00:00, 2024.30it/s]


Excluded columns: ['user_id', 'is_weekend', 'venue_id']
Included columns: ['cat_id', 'latitude', 'longitude', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2024.30it/s]


Excluded columns: ['user_id', 'is_weekend', 'orig_cat_id']
Included columns: ['cat_id', 'latitude', 'longitude', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id']


100%|██████████| 1083/1083 [00:00<00:00, 2051.14it/s]


Restoring model weights from the end of the best epoch.
Epoch 00004: early stopping
Excluded columns: ['user_id', 'venue_id', 'orig_cat_id']
Included columns: ['cat_id', 'latitude', 'longitude', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos']


100%|██████████| 1083/1083 [00:00<00:00, 2055.02it/s]


Excluded columns: ['latitude', 'longitude', 'is_weekend', 'venue_id']
Included columns: ['cat_id', 'user_id', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2009.28it/s]


Excluded columns: ['latitude', 'longitude', 'is_weekend', 'orig_cat_id']
Included columns: ['cat_id', 'user_id', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id']


100%|██████████| 1083/1083 [00:00<00:00, 2035.72it/s]


Restoring model weights from the end of the best epoch.
Epoch 00009: early stopping
Excluded columns: ['latitude', 'longitude', 'venue_id', 'orig_cat_id']
Included columns: ['cat_id', 'user_id', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos']


100%|██████████| 1083/1083 [00:00<00:00, 2043.82it/s]


Restoring model weights from the end of the best epoch.
Epoch 00009: early stopping
Excluded columns: ['is_weekend', 'venue_id', 'orig_cat_id']
Included columns: ['cat_id', 'user_id', 'latitude', 'longitude', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos']


100%|██████████| 1083/1083 [00:00<00:00, 2024.30it/s]


Excluded columns: ['user_id', 'latitude', 'longitude', 'is_weekend', 'venue_id']
Included columns: ['cat_id', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'orig_cat_id']


100%|██████████| 1083/1083 [00:00<00:00, 2024.30it/s]


Excluded columns: ['user_id', 'latitude', 'longitude', 'is_weekend', 'orig_cat_id']
Included columns: ['cat_id', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos', 'venue_id']


100%|██████████| 1083/1083 [00:00<00:00, 2005.57it/s]


Restoring model weights from the end of the best epoch.
Epoch 00007: early stopping
Excluded columns: ['user_id', 'latitude', 'longitude', 'venue_id', 'orig_cat_id']
Included columns: ['cat_id', 'is_weekend', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos']


100%|██████████| 1083/1083 [00:00<00:00, 2043.38it/s]


Excluded columns: ['user_id', 'is_weekend', 'venue_id', 'orig_cat_id']
Included columns: ['cat_id', 'latitude', 'longitude', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos']


100%|██████████| 1083/1083 [00:00<00:00, 2024.28it/s]


Excluded columns: ['latitude', 'longitude', 'is_weekend', 'venue_id', 'orig_cat_id']
Included columns: ['cat_id', 'user_id', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos']


100%|██████████| 1083/1083 [00:00<00:00, 2001.86it/s]


Restoring model weights from the end of the best epoch.
Epoch 00010: early stopping
Excluded columns: ['user_id', 'latitude', 'longitude', 'is_weekend', 'venue_id', 'orig_cat_id']
Included columns: ['cat_id', 'clock_sin', 'clock_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'week_day_sin', 'week_day_cos']


100%|██████████| 1083/1083 [00:00<00:00, 1994.47it/s]


