# STEP 5 - Dynamic Windowing

Using a simple model with dynamic windowing resulted in a non-existent prediction accuracy.
One possible explanation for that is the simplicity of the model itself.
The original work uses a rather complex model with SequenceFeatures and Embedding for the One-Hot-Encoded target features.
Next, it is to be verified whether using a more complex model leads to the expected prediction quality.
If that is not the case, the problem is probably with the windowing itself.
It could be that the functions used shuffle the sequences or otherwise change the data, so that the deep learning network cannot detect patterns anymore.

In [1]:
import tensorflow as tf
from tensorflow import feature_column
import pandas as pd
import numpy as np
import import_ipynb

## Testing sequence_stride

One assumption why the performance is lacking is the data itself.
Maybe using only the data of one taxi to build the sequences is not sufficient.
The sliding window function creates many sequences but the data itself is not heterogeneous enough.
The test involves using a larger part of the original data and creating fewer window sequences by using the parameter "sequence_stride".
This parameter makes it possible to skip sequences by moving the starting index multiple positions ahead.
An example follows.

In [2]:
def timeseries_dataset_from_array(
        data,
        targets,
        sequence_length,
        sequence_stride=1,
        sampling_rate=1,
        batch_size=128,
        shuffle=False,
        seed=None,
        start_index=None,
        end_index=None,
):
    """Build a `tf.data.Dataset` of sliding windows over a time series.

    Every window holds `sequence_length` elements of
    `data[start_index:end_index]`, taken every `sampling_rate` steps.
    Consecutive windows start `sequence_stride` positions apart.

    Args:
        data: Indexable array whose first axis is time.
        targets: Array indexed like `data`, or `None`. When given, each
            window is paired with the element of
            `targets[start_index:end_index]` located at the window's start
            position.
        sequence_length: Number of elements per window.
        sequence_stride: Distance between the start positions of two
            consecutive windows.
        sampling_rate: Distance between two consecutive elements inside a
            window.
        batch_size: Number of windows per batch, or `None` to yield single
            windows.
        shuffle: Whether to shuffle the window start positions (and, in
            addition, the resulting dataset through a shuffle buffer).
        seed: Seed for shuffling; a random one is drawn when `None`.
        start_index: First position of `data` to use (defaults to 0).
        end_index: Exclusive upper bound of the positions to use (defaults
            to `len(data)`).

    Returns:
        A dataset yielding windows, or `(windows, targets)` tuples when
        `targets` is not `None`; batched unless `batch_size` is `None`.

    Raises:
        ValueError: If an index, `sampling_rate` or `sequence_stride` is
            outside its valid range.
    """
    # Validate the optional bounds. Compare against None explicitly so that
    # an explicit 0 is validated too instead of being skipped as falsy
    # (previously `end_index=0` silently produced an empty dataset).
    if start_index is not None:
        if start_index < 0:
            raise ValueError(
                "`start_index` must be 0 or greater. Received: "
                f"start_index={start_index}"
            )
        if start_index >= len(data):
            raise ValueError(
                "`start_index` must be lower than the length of the "
                f"data. Received: start_index={start_index}, for data "
                f"of length {len(data)}"
            )
    if end_index is not None:
        if start_index is not None and end_index <= start_index:
            raise ValueError(
                "`end_index` must be higher than `start_index`. "
                f"Received: start_index={start_index}, and "
                f"end_index={end_index} "
            )
        if end_index >= len(data):
            raise ValueError(
                "`end_index` must be lower than the length of the "
                f"data. Received: end_index={end_index}, for data of "
                f"length {len(data)}"
            )
        if end_index <= 0:
            raise ValueError(
                "`end_index` must be higher than 0. "
                f"Received: end_index={end_index}"
            )

    # Validate strides
    if sampling_rate <= 0:
        raise ValueError(
            "`sampling_rate` must be higher than 0. Received: "
            f"sampling_rate={sampling_rate}"
        )
    if sampling_rate >= len(data):
        raise ValueError(
            "`sampling_rate` must be lower than the length of the "
            f"data. Received: sampling_rate={sampling_rate}, for data "
            f"of length {len(data)}"
        )
    if sequence_stride <= 0:
        raise ValueError(
            "`sequence_stride` must be higher than 0. Received: "
            f"sequence_stride={sequence_stride}"
        )
    if sequence_stride >= len(data):
        raise ValueError(
            "`sequence_stride` must be lower than the length of the "
            f"data. Received: sequence_stride={sequence_stride}, for "
            f"data of length {len(data)}"
        )

    if start_index is None:
        start_index = 0
    if end_index is None:
        end_index = len(data)

    # Number of valid window start positions. The last element of a window
    # starting at p sits at p + (sequence_length - 1) * sampling_rate, which
    # must stay below (end_index - start_index). The earlier formula
    # (... - sequence_length * sampling_rate + 1) gives the same result for
    # sampling_rate == 1 but dropped valid windows for larger rates.
    num_seqs = end_index - start_index - (sequence_length - 1) * sampling_rate
    if targets is not None:
        num_seqs = min(num_seqs, len(targets))

    # Determine the lowest dtype to store start positions (to lower memory
    # usage).
    if num_seqs < 2147483647:
        index_dtype = "int32"
    else:
        index_dtype = "int64"

    # Generate start positions
    start_positions = np.arange(0, num_seqs, sequence_stride, dtype=index_dtype)
    if shuffle:
        if seed is None:
            seed = np.random.randint(int(1e6))
        rng = np.random.RandomState(seed)
        rng.shuffle(start_positions)

    sequence_length = tf.cast(sequence_length, dtype=index_dtype)
    sampling_rate = tf.cast(sampling_rate, dtype=index_dtype)

    # A single-element dataset holding all start positions, repeated so it can
    # be zipped with the running window counter below.
    positions_ds = tf.data.Dataset.from_tensors(start_positions).repeat()

    # For each initial window position, generates indices of the window elements
    indices = tf.data.Dataset.zip(
        (tf.data.Dataset.range(len(start_positions)), positions_ds)
    ).map(
        lambda i, positions: tf.range(
            positions[i],
            positions[i] + sequence_length * sampling_rate,
            sampling_rate,
        ),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )

    dataset = sequences_from_indices(data, indices, start_index, end_index)
    if targets is not None:
        # The target of a window is looked up at the window's start position.
        indices = tf.data.Dataset.zip(
            (tf.data.Dataset.range(len(start_positions)), positions_ds)
        ).map(
            lambda i, positions: positions[i],
            num_parallel_calls=tf.data.experimental.AUTOTUNE,
        )
        target_ds = sequences_from_indices(
            targets, indices, start_index, end_index
        )
        dataset = tf.data.Dataset.zip((dataset, target_ds))
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    if batch_size is not None:
        if shuffle:
            # Shuffle locally at each iteration
            dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed)
        dataset = dataset.batch(batch_size)
    else:
        if shuffle:
            dataset = dataset.shuffle(buffer_size=1024, seed=seed)
    return dataset

In [3]:
def sequences_from_indices(array, indices_ds, start_index, end_index):
    """Gather elements of `array[start_index:end_index]` for each index set.

    Args:
        array: Indexable source array.
        indices_ds: Dataset whose elements are indices into the sliced array.
        start_index: First position of `array` to expose.
        end_index: Exclusive upper bound of the positions to expose.

    Returns:
        A dataset with one gathered element per element of `indices_ds`.
    """
    source_slice = array[start_index:end_index]

    def _gather(steps, inds):
        return tf.gather(steps, inds)

    # The slice is stored once and repeated so that it can be paired with
    # every element of the index dataset.
    repeated_source = tf.data.Dataset.from_tensors(source_slice).repeat()
    paired = tf.data.Dataset.zip((repeated_source, indices_ds))
    return paired.map(
        _gather,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )

Instantiate an array with 100 values from 0 to 99.
As a result each value corresponds to its own index position.
Create sequences of length 9 with a stride of 8 in batches of 4.
Each tensor now contains an array of 4 arrays (batch_size).
Each respective array is sliced to its first 8 values for display; the 9th value of a sequence is identical to the first value of the next sequence.
The first value of the first array is 0 (first index).
The second array starts with 8 (first index + sequence_stride).
The next array always starts with the last index + sequence stride (16,24,32...).

In [4]:
# With sequence_stride=8, the first 8 values of consecutive windows
# form back-to-back chunks.
s = slice(0, 8)
arr = np.arange(100)
ds = timeseries_dataset_from_array(
    data=arr,
    targets=None,
    sequence_length=9,
    sequence_stride=8,
    shuffle=False,
    batch_size=4,
)

# Print the first 8 columns of every batch and count the windows seen.
count = 0
for batch in ds:
    test = batch[:, s]
    print(test)
    count += test.shape[0]

print('# Rows over all Batches: ', count)
print('# Batches: ', len(ds))

tf.Tensor(
[[ 0  1  2  3  4  5  6  7]
 [ 8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23]
 [24 25 26 27 28 29 30 31]], shape=(4, 8), dtype=int32)
tf.Tensor(
[[32 33 34 35 36 37 38 39]
 [40 41 42 43 44 45 46 47]
 [48 49 50 51 52 53 54 55]
 [56 57 58 59 60 61 62 63]], shape=(4, 8), dtype=int32)
tf.Tensor(
[[64 65 66 67 68 69 70 71]
 [72 73 74 75 76 77 78 79]
 [80 81 82 83 84 85 86 87]
 [88 89 90 91 92 93 94 95]], shape=(4, 8), dtype=int32)
# Rows over all Batches:  12
# Batches:  3


In [5]:
from model_helper import ModelHelper

importing Jupyter notebook from model_helper.ipynb


In [6]:
df = pd.read_csv("C:/Users/WoodPecker/Documents/Privat/HTW-Master/Sem3/PA/HumanMobilityPredictionMA/ma_results/trips_with_zones_final.csv")

Only use the first 10000000 rows.

In [7]:
df = df.head(10000000)
df.head(10)

Unnamed: 0,medallion,pickup_week_day,pickup_hour,pickup_day,pickup_month,dropoff_week_day,dropoff_hour,dropoff_day,dropoff_month,pickup_location_id,dropoff_location_id
0,00005007A9F30E289E760362F69E4EAD,1,0,1,1,1,0,1,1,162.0,262.0
1,00005007A9F30E289E760362F69E4EAD,1,0,1,1,1,0,1,1,262.0,239.0
2,00005007A9F30E289E760362F69E4EAD,1,0,1,1,1,1,1,1,239.0,236.0
3,00005007A9F30E289E760362F69E4EAD,1,1,1,1,1,1,1,1,236.0,41.0
4,00005007A9F30E289E760362F69E4EAD,1,1,1,1,1,1,1,1,41.0,211.0
5,00005007A9F30E289E760362F69E4EAD,1,1,1,1,1,2,1,1,211.0,238.0
6,00005007A9F30E289E760362F69E4EAD,1,2,1,1,1,2,1,1,238.0,142.0
7,00005007A9F30E289E760362F69E4EAD,1,2,1,1,1,2,1,1,142.0,263.0
8,00005007A9F30E289E760362F69E4EAD,1,2,1,1,1,3,1,1,263.0,48.0
9,00005007A9F30E289E760362F69E4EAD,1,3,1,1,1,3,1,1,48.0,246.0


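Instantiate the ModelHelper with a window length of 129, i.e. input sequences of length 128 plus one target value (compare the dataset shapes printed further below).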

In [8]:
mh = ModelHelper(df, 129)

In [9]:
mh.df_to_location_sequence()
mh.df

Unnamed: 0,index,location_id,day,month,hour_sin,hour_cos,week_day_sin,week_day_cos,weekend
0,0,162.0,1,1,0.000000,1.000000e+00,0.781831,0.623490,0
1,12,230.0,1,1,0.707107,7.071068e-01,0.781831,0.623490,0
2,13,125.0,1,1,0.707107,7.071068e-01,0.781831,0.623490,0
3,15,48.0,1,1,0.866025,5.000000e-01,0.781831,0.623490,0
4,18,170.0,1,1,1.000000,6.123234e-17,0.781831,0.623490,0
...,...,...,...,...,...,...,...,...,...
13731996,7284341,161.0,26,1,-0.500000,-8.660254e-01,-0.974928,-0.222521,1
13731997,7284341,161.0,26,1,-0.500000,-8.660254e-01,-0.974928,-0.222521,1
13731998,7284342,132.0,26,1,-0.707107,-7.071068e-01,-0.974928,-0.222521,1
13731999,7284343,141.0,26,1,-0.866025,-5.000000e-01,-0.974928,-0.222521,1


Set the vocabulary size.

In [10]:
mh.set_target_column_name('location_id')
mh.set_vocab_size()
mh.vocab_size

264

Do a basic train/test/val split.

In [11]:
mh.basic_split_df()
mh.df_train

Unnamed: 0,index,location_id,day,month,hour_sin,hour_cos,week_day_sin,week_day_cos,weekend
0,0,162.0,1,1,0.000000,1.000000e+00,0.781831,0.623490,0
1,12,230.0,1,1,0.707107,7.071068e-01,0.781831,0.623490,0
2,13,125.0,1,1,0.707107,7.071068e-01,0.781831,0.623490,0
3,15,48.0,1,1,0.866025,5.000000e-01,0.781831,0.623490,0
4,18,170.0,1,1,1.000000,6.123234e-17,0.781831,0.623490,0
...,...,...,...,...,...,...,...,...,...
8788475,3683176,65.0,30,1,-0.965926,2.588190e-01,0.974928,-0.222521,0
8788476,3683177,229.0,30,1,-0.965926,2.588190e-01,0.974928,-0.222521,0
8788477,3683178,160.0,30,1,-0.965926,2.588190e-01,0.974928,-0.222521,0
8788478,3683179,83.0,30,1,-0.866025,5.000000e-01,0.974928,-0.222521,0


For simplicity, all columns but the location_id are dropped.

In [12]:
mh.drop_all_but_target()
mh.df_train

Unnamed: 0,location_id
0,162
1,230
2,125
3,48
4,170
...,...
8788475,65
8788476,229
8788477,160
8788478,83


Set the "batch_size" and instantiate the window generator.

In [13]:
BATCH_SIZE = 128
mh.set_batch_size(BATCH_SIZE)
mh.set_window_generator(['location_id'])

sequence_stride is set to 43 in order to create approximately 3 times as many sequences as with non-overlapping chunks (129 / 43 = 3).
This is still only 1/43rd of the number of rows that would be created when using sequence_stride=1.

In [14]:
SEQUENCE_STRIDE = 43
mh.make_windowed_dataset(SEQUENCE_STRIDE)
mh.train_dataset

<MapDataset shapes: ((128, 128, 1), (128, 1, 1)), types: (tf.float32, tf.float32)>

In [15]:
mh.train_dataset.element_spec

(TensorSpec(shape=(128, 128, 1), dtype=tf.float32, name=None),
 TensorSpec(shape=(128, 1, 1), dtype=tf.float32, name=None))

A complex model is defined that uses an embedding of the one-hot-encoded target features and a SequenceFeatures layer.
This is supposed to improve the prediction quality by a lot (see NYC Part 3 Evaluation).

In [16]:
EMBEDDING_DIM = 256

In [17]:
# Declare the dictionary for the places sequence as before.
# The input is keyed and named 'location_id', the same key the feature column
# below uses; it takes int32 values of shape (mh.sequence_length,) with a fixed
# batch size of BATCH_SIZE.
sequence_input = {
  'location_id': tf.keras.Input((mh.sequence_length,), dtype=tf.dtypes.int32, batch_size=BATCH_SIZE, name='location_id')
}

# Handling the categorical feature sequence using one-hot.
# The vocabulary list is the integer range 0 .. vocab_size - 1.
places_one_hot = feature_column.sequence_categorical_column_with_vocabulary_list(
  'location_id', [i for i in range(int(mh.vocab_size))])

# Embed the one-hot encoding into EMBEDDING_DIM dimensions
places_embed = feature_column.embedding_column(places_one_hot, EMBEDDING_DIM)

# Apply the SequenceFeatures layer to the input dictionary. The second return
# value is bound to a notebook-level name `sequence_length`, which is distinct
# from `mh.sequence_length`.
sequence_features, sequence_length = tf.keras.experimental.SequenceFeatures(places_embed)(sequence_input)
# Pin the static shape to (batch, time steps, embedding dimension)
sequence_features = tf.ensure_shape(sequence_features, (mh.batch_size, mh.sequence_length, EMBEDDING_DIM))

# First recurrent layer; return_sequences=True so the second GRU receives the
# full sequence.
# NOTE(review): stateful=True carries the hidden state over from one batch to
# the next - confirm this is intended for strided, independent windows.
# NOTE(review): input_shape is passed although the layer is called directly on
# a tensor - presumably redundant here; confirm.
gru1 = tf.keras.layers.GRU(256,
                           return_sequences=True,
                           input_shape=(BATCH_SIZE, mh.sequence_length, EMBEDDING_DIM),
                           stateful=True,
                           recurrent_initializer='glorot_uniform')(sequence_features)
# Second recurrent layer; without return_sequences it outputs only its last
# step.
# NOTE(review): the input_shape given here repeats the one of gru1, although
# this layer consumes gru1's 256-unit output - presumably ignored; confirm.
gru2 = tf.keras.layers.GRU(64,
                           input_shape=(BATCH_SIZE, mh.sequence_length, EMBEDDING_DIM),
                           stateful=True,
                           recurrent_initializer='glorot_uniform')(gru1)

# Earlier head variant with dropout, kept for reference:
#drop = tf.keras.layers.Dropout(0.3)(gru2)
#dense = tf.keras.layers.Dense(number_of_places, activation='softmax')(drop)

# One output unit per vocabulary entry, followed by a separate Softmax layer
# that turns the scores into probabilities.
dense = tf.keras.layers.Dense(mh.vocab_size)(gru2)
output = tf.keras.layers.Softmax()(dense)

model = tf.keras.Model(inputs=list(sequence_input.values()), outputs=output)

In [18]:
mh.assign_model(model)
mh.set_num_epochs(5)
mh.compile_model(optimizer_type=tf.keras.optimizers.Adam, learning_rate=0.002)
mh.fit_model(with_early_stopping=False)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\WoodPecker\Documents\Privat\HTW-Master\Sem3\PA\HumanMobilityPredictionMA\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\WoodPecker\AppData\Local\Temp\ipykernel_22716\16564970.py", line 4, in <module>
    mh.fit_model(with_early_stopping=False)
  File "<string>", line 7, in fit_model
  File "C:\Users\WoodPecker\Documents\Privat\HTW-Master\Sem3\PA\HumanMobilityPredictionMA\venv\lib\site-packages\tensorflow\python\keras\engine\training.py", line 103, in _method_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\WoodPecker\Documents\Privat\HTW-Master\Sem3\PA\HumanMobilityPredictionMA\venv\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1093, in fit
    tmp_logs = train_function(iterator)
  File "C:\Users\WoodPecker\Documents\Privat\HTW-Master\Sem3\PA\HumanMobilityPredictionMA\venv\lib\site-packages\te

TypeError: object of type 'NoneType' has no len()

In [None]:
mh.evaluate_model()

As assumed, when using more heterogeneous data (not only a small interpolated subset), the prediction quality increases.
This proves the point that the sliding window data interpolation is only useful in cases of missing data and cannot be applied in every case.
The prediction quality plummets when it is used to create a large dataset.
The interpolated sequencing does not help when the available data does not contain enough patterns for the network to draw conclusions from.

In [None]:
# Inspect a single batch of the training data by hand.
take = mh.train_dataset.take(1)
#predict y
#prediction = model.predict(take)

# Split the batch into individual (x, y) examples.
data = take.unbatch()
X_seq = []
Y_true_seq = []

for x,y in data:
    #print('<y>: ', y.numpy())
    # Only element [0][0] is kept, i.e. one scalar per example.
    # Assumes x and y each have at least two axes - TODO confirm against
    # mh.train_dataset.element_spec.
    X_seq.append(x.numpy()[0][0])
    Y_true_seq.append(y.numpy()[0][0])

print('X len: ', len(X_seq))
# `y` is the loop variable left over from the last iteration.
print('y Shape: ', y.shape)

print('Y_true_seq len: ', len(Y_true_seq))
print('X_seq len: ', len(X_seq))

In [None]:
# Test for SCCE: compute the sparse categorical cross-entropy by hand for the
# single batch held in `take`.

prediction = model.predict(take)

# from_logits=False: the predictions are treated as probabilities (the model
# defined above ends in a Softmax layer).
scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

# Y_true_seq holds the true values collected from the same batch in the
# previous cell.
scce(Y_true_seq, prediction).numpy()

In [None]:
x_y_seq = np.array([np.append(x[0], y[0][0]) for x, y in mh.train_dataset])
x_y_seq

In [None]:
len(x_y_seq)

In [None]:
# Collect every batch of the test dataset as NumPy arrays.
X_loc = []
Y = []
for x, y in mh.test_dataset:
    X_loc.append(x.numpy())
    Y.append(y.numpy())

# Stack the collected batches along a new leading axis.
# Presumably all batches share the same shape - verify for the last batch.
X_loc = np.array(X_loc)
Y = np.array(Y)

# Reshape to the sizes of the first three axes only.
# Assumes the stacked arrays are 4-D with a trailing axis of size 1 - TODO
# confirm against mh.test_dataset.element_spec.
X_loc = np.reshape(X_loc, (X_loc.shape[0], X_loc.shape[1], X_loc.shape[2]))
Y = np.reshape(Y, (Y.shape[0], Y.shape[1], Y.shape[2]))
print('X_loc.shape ', X_loc.shape)
print('Y.shape ', Y.shape)
# Append Y after X_loc along the third axis.
X_Y = np.concatenate((X_loc, Y), axis=2)
print('X_Y.shape: ', X_Y.shape)
X_Y[:10]

The manual evaluation of the results (prediction, scce) does confirm the conclusions.