In [34]:
import datetime
import logging
import os

In [35]:
import numpy as np 
import tensorflow as tf

In [36]:
from tensorflow.keras import activations
from tensorflow.keras import callbacks
from tensorflow.keras import layers
from tensorflow.keras import models

from tensorflow import feature_column as fc

In [37]:
logging.info(tf.version.VERSION)

In [None]:
import pandas as pd 

In [5]:
CSV_COLUMNS = [
        'fare_amount',
        'pickup_datetime',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'passenger_count',
        'key',
]
LABEL_COLUMN = 'fare_amount'
DEFAULTS = [[0.0], ['na'], [0.0], [0.0], [0.0], [0.0], [0.0], ['na']]
DAYS = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']


In [6]:
def create_train_dataset(pattern, batch_size):
    dataset = load_dataset(pattern, batch_size, num_repeat=None)
    return dataset.prefetch(1)

In [7]:
# return tf.data.Dataset
def load_dataset(pattern, batch_size, num_repeat):
    dataset = tf.data.experimental.make_csv_dataset(
        file_pattern=pattern,
        batch_size=batch_size,
        column_names=CSV_COLUMNS,
        column_defaults=DEFAULTS,
        num_epochs=num_repeat,
    )
    return dataset.map(features_and_labels)

In [8]:
def features_and_labels(row_data):
    for unwanted_col in ['key']:
        row_data.pop(unwanted_col)
    label = row_data.pop(LABEL_COLUMN)
    return row_data, label

In [9]:
import argparse

In [10]:
parser = argparse.ArgumentParser()
parser.add_argument(
    "--batch_size",
    help="Batch size for training steps",
    type=int,
    default=32
)
parser.add_argument(
    "--eval_data_path",
    help="GCS location pattern of eval files",
    required=True
)
parser.add_argument(
    "--nnsize",
    help="Hidden layer sizes (provide space-separated sizes)",
    nargs="+",
    type=int,
    default=[32, 8]
)
parser.add_argument(
    "--nbuckets",
    help="Number of buckets to divide lat and lon with",
    type=int,
    default=10
)
parser.add_argument(
    "--lr",
    help = "learning rate for optimizer",
    type = float,
    default = 0.001
)
parser.add_argument(
    "--num_evals",
    help="Number of times to evaluate model on eval data training.",
    type=int,
    default=5
)
parser.add_argument(
    "--num_examples_to_train_on",
    help="Number of examples to train on.",
    type=int,
    default=100
)
parser.add_argument(
    "--output_dir",
    help="GCS location to write checkpoints and export models",
    required=True
)
parser.add_argument(
    "--train_data_path",
    help="GCS location pattern of train files containing eval URLs",
    required=True
)
parser.add_argument(
    "--job-dir",
    help="this model ignores this field, but it is required by gcloud",
    default="junk"
)
args = parser.parse_args()
hparams = args.__dict__
hparams.pop("job-dir", None)

usage: ipykernel_launcher.py [-h] [--batch_size BATCH_SIZE] --eval_data_path
                             EVAL_DATA_PATH [--nnsize NNSIZE [NNSIZE ...]]
                             [--nbuckets NBUCKETS] [--lr LR]
                             [--num_evals NUM_EVALS]
                             [--num_examples_to_train_on NUM_EXAMPLES_TO_TRAIN_ON]
                             --output_dir OUTPUT_DIR --train_data_path
                             TRAIN_DATA_PATH [--job-dir JOB_DIR]
ipykernel_launcher.py: error: the following arguments are required: --eval_data_path, --output_dir, --train_data_path


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [11]:
trainds = create_train_dataset("./taxi-train.csv", 32)

In [12]:
traintest = tf.data.experimental.make_csv_dataset(
        file_pattern="./taxi-train.csv",
        batch_size=32,
        column_names=CSV_COLUMNS,
        column_defaults=DEFAULTS,
        num_epochs=None,
    )

In [13]:
example = traintest.take(1)

In [14]:
print(example)

<_TakeDataset element_spec=OrderedDict([('fare_amount', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('pickup_datetime', TensorSpec(shape=(32,), dtype=tf.string, name=None)), ('pickup_longitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('pickup_latitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('dropoff_longitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('dropoff_latitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('passenger_count', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('key', TensorSpec(shape=(32,), dtype=tf.string, name=None))])>


In [18]:
e1 = example.map(features_and_labels)

In [19]:
print(e1)

<_MapDataset element_spec=(OrderedDict([('pickup_datetime', TensorSpec(shape=(32,), dtype=tf.string, name=None)), ('pickup_longitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('pickup_latitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('dropoff_longitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('dropoff_latitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('passenger_count', TensorSpec(shape=(32,), dtype=tf.float32, name=None))]), TensorSpec(shape=(32,), dtype=tf.float32, name=None))>


In [20]:
!ls

taxi-train.csv taxi-valid.csv taxifare.ipynb


In [21]:
!wc -l taxi-train.csv

    1000 taxi-train.csv


In [25]:
import pandas as pd

In [26]:
df = pd.read_csv("taxi-train.csv")

In [27]:
df.head(10)

Unnamed: 0,11.3,2011-01-28 20:42:59 UTC,-73.999022,40.739146,-73.990369,40.717866,1,0
0,7.7,2011-06-27 04:28:06 UTC,-73.987443,40.729221,-73.979013,40.758641,1,1
1,10.5,2011-04-03 00:54:53 UTC,-73.982539,40.735725,-73.954797,40.778388,1,2
2,16.2,2009-04-10 04:11:56 UTC,-74.001945,40.740505,-73.91385,40.758559,1,3
3,33.5,2014-02-24 18:22:00 UTC,-73.993372,40.753382,-73.8609,40.732897,2,4
4,6.9,2011-12-10 00:25:23 UTC,-73.996237,40.721848,-73.989416,40.718052,1,5
5,6.1,2012-09-01 14:30:19 UTC,-73.977048,40.758461,-73.984899,40.744693,2,6
6,9.5,2012-11-08 13:28:07 UTC,-73.969402,40.757545,-73.950049,40.776079,1,7
7,9.0,2014-07-15 11:37:25 UTC,-73.979318,40.760949,-73.95767,40.773724,1,8
8,3.3,2009-11-09 18:06:58 UTC,-73.955675,40.779154,-73.961172,40.772368,1,9
9,17.0,2014-09-14 21:52:28 UTC,-73.993789,40.749181,-73.951233,40.770045,2,10


In [28]:
def transform(inputs, NUMERIC_COLS, STRING_COLS, nbuckets):
    # Pass-through columns
    transformed = inputs.copy()
    del transformed['pickup_datetime']

    feature_columns = {
        colname: fc.numeric_column(colname)
        for colname in NUMERIC_COLS
    }

    # Scaling longitude from range [-70, -78] to [0, 1]
    for lon_col in ['pickup_longitude', 'dropoff_longitude']:
        transformed[lon_col] = layers.Lambda(
            lambda x: (x + 78)/8.0,
            name='scale_{}'.format(lon_col)
        )(inputs[lon_col])

    # Scaling latitude from range [37, 45] to [0, 1]
    for lat_col in ['pickup_latitude', 'dropoff_latitude']:
        transformed[lat_col] = layers.Lambda(
            lambda x: (x - 37)/8.0,
            name='scale_{}'.format(lat_col)
        )(inputs[lat_col])

    # Adding Euclidean dist (no need to be accurate: NN will calibrate it)
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')([
        inputs['pickup_longitude'],
        inputs['pickup_latitude'],
        inputs['dropoff_longitude'],
        inputs['dropoff_latitude']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # hour of day from timestamp of form '2010-02-08 09:17:00+00:00'
    transformed['hourofday'] = layers.Lambda(
        lambda x: tf.strings.to_number(
            tf.strings.substr(x, 11, 2), out_type=tf.dtypes.int32),
        name='hourofday'
    )(inputs['pickup_datetime'])
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity(
            'hourofday', num_buckets=24))

    latbuckets = np.linspace(0, 1, nbuckets).tolist()
    lonbuckets = np.linspace(0, 1, nbuckets).tolist()
    b_plat = fc.bucketized_column(
        feature_columns['pickup_latitude'], latbuckets)
    b_dlat = fc.bucketized_column(
            feature_columns['dropoff_latitude'], latbuckets)
    b_plon = fc.bucketized_column(
            feature_columns['pickup_longitude'], lonbuckets)
    b_dlon = fc.bucketized_column(
            feature_columns['dropoff_longitude'], lonbuckets)
    ploc = fc.crossed_column(
            [b_plat, b_plon], nbuckets * nbuckets)
    dloc = fc.crossed_column(
            [b_dlat, b_dlon], nbuckets * nbuckets)
    pd_pair = fc.crossed_column([ploc, dloc], nbuckets ** 4)
    feature_columns['pickup_and_dropoff'] = fc.embedding_column(
            pd_pair, 100)

    return transformed, feature_columns

In [29]:
print(example)

<_TakeDataset element_spec=OrderedDict([('fare_amount', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('pickup_datetime', TensorSpec(shape=(32,), dtype=tf.string, name=None)), ('pickup_longitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('pickup_latitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('dropoff_longitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('dropoff_latitude', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('passenger_count', TensorSpec(shape=(32,), dtype=tf.float32, name=None)), ('key', TensorSpec(shape=(32,), dtype=tf.string, name=None))])>


In [31]:
STRING_COLS = ['pickup_datetime']
NUMERIC_COLS = (
            set(CSV_COLUMNS) - set([LABEL_COLUMN, 'key']) - set(STRING_COLS)
    )

In [33]:
import tensorflow.compact.v1 as tf1

ModuleNotFoundError: No module named 'tensorflow.compact'

In [32]:
def call_feature_columns(feature_columns, inputs):
  # This is a convenient way to call a `feature_column` outside of an estimator
  # to display its output.
  feature_layer = tf1.keras.layers.DenseFeatures(feature_columns)
  return feature_layer(inputs)

In [None]:
import tensorflow as tf

# Assume 'feature1' and 'feature2' are original features
feature1 = tf.feature_column.numeric_column("feature1")
feature2 = tf.feature_column.numeric_column("feature2")

# Custom lambda function to combine features
combine_features = tf.keras.layers.Lambda(lambda x: tf.math.multiply(x[:, 0], x[:, 1]))

# Use Lambda layer to create a new feature
combined_feature = combine_features([feature1, feature2])

# Apply further transformations using tf.feature_column
# For example, create a bucketized column
combined_feature_bucketized = tf.feature_column.bucketized_column(combined_feature, boundaries=[0, 10, 20, 30])

# Continue with the rest of the feature columns
