In [None]:
%matplotlib inline

import h5py
import matplotlib.pyplot as plt
import numpy as np
np.set_printoptions(precision=4, suppress=True)
import pandas as pd
import xarray as xr

In [None]:
filepath = '../../data/datasets/ngsim_feature_trajectories.h5'
infile = h5py.File(filepath, 'r')

In [None]:
# `Dataset.value` was deprecated and later removed from h5py; indexing with
# `[()]` reads the whole dataset into a fresh in-memory NumPy array, so the
# extra np.copy is no longer needed.
x = infile['1'][()]

In [None]:
x.shape

In [None]:
feature_names = infile.attrs['feature_names']

In [None]:
feature_names

In [None]:
def compute_lengths(arr):
    """Return the unpadded length of every sample in a 3-D array.

    The values along axis 2 are summed for each (sample, timestep) pair. A
    sample's length is the index of its first timestep whose sum is exactly
    zero, or the full number of timesteps when no sum is zero.

    NOTE(review): presumably all-zero timesteps are padding; a real timestep
    whose values happen to sum to zero would also end the sample here.

    Args:
        arr: array-like convertible to a 3-D NumPy array.

    Returns:
        A list with one integer length per sample.
    """
    step_totals = np.array(arr).sum(axis=2)

    def _first_zero(totals):
        # index of the first zero-sum timestep, else the full row length
        zero_steps = np.where(totals == 0.)[0]
        return zero_steps[0] if len(zero_steps) > 0 else len(totals)

    return [_first_zero(totals) for totals in step_totals]


In [None]:
# Pool the unpadded values of one feature column across every trajectory into
# a single flat array so its distribution can be inspected.
lengths = compute_lengths(x)

fidx = 18
# NOTE(review): `censor` is not read by any live code in this cell. It was only
# used by earlier, now-dropped experiments that interpolated or filtered
# readings equal to this value; kept in case another cell relies on it.
censor = 100.
feats = [np.copy(x[i, :l, fidx]) for i, l in enumerate(lengths)]

z = np.concatenate(feats)
z.shape

In [None]:
plt.hist(z,100)

In [None]:
# NOTE(review): this redefines the identical `compute_lengths` from an earlier
# cell; the later definition silently shadows the earlier one.
def compute_lengths(arr):
    """Return the unpadded length of every sample in a 3-D array.

    The values along axis 2 are summed for each (sample, timestep) pair. A
    sample's length is the index of its first timestep whose sum is exactly
    zero, or the full number of timesteps when no sum is zero.

    Args:
        arr: array-like convertible to a 3-D NumPy array.

    Returns:
        A list with one integer length per sample.
    """
    step_totals = np.array(arr).sum(axis=2)

    def _first_zero(totals):
        # index of the first zero-sum timestep, else the full row length
        zero_steps = np.where(totals == 0.)[0]
        return zero_steps[0] if len(zero_steps) > 0 else len(totals)

    return [_first_zero(totals) for totals in step_totals]

def load_ngsim_trajectory_data(
        filepath,
        traj_key=None,
        feature_keys=[
            'velocity',
            'relative_offset',
            'relative_heading',
            'length',
            'width',
            'lane_curvature',
            'markerdist_left',
            'markerdist_right',
            'accel',
            'jerk',
            'turn_rate_frenet',
            'angular_rate_frenet'
        ],
        target_keys=[
            'lidar_10'
        ],
        binedges=[10,15,25,50],
        max_censor_ratio=.3,
        max_len=None,
        train_ratio=.9,
        max_samples=None,
        shuffle=True,
        normalize=True,
        censor=50.):
    """Load NGSIM feature trajectories and build a train / validation split.

    The HDF5 file is expected to hold one or more datasets indexed as
    (sample, timestep, feature) plus a file attribute 'feature_names' naming
    the entries of the last axis.

    Args:
        filepath: path of the HDF5 file to read.
        traj_key: name of a single dataset to load; when None, every dataset
            in the file is stacked along the sample axis.
        feature_keys: names of the input features, in output column order.
        target_keys: names of the target features; exactly one is supported.
        binedges: bin edges handed to np.digitize (right=True) to turn the
            target into integer class labels.
        max_censor_ratio: a sample is kept only if the fraction of its
            unpadded target values equal to `censor` is below this value.
        max_len: if given, trajectories are truncated to this many timesteps.
        train_ratio: fraction of the kept samples assigned to the train split.
        max_samples: if given, only the first `max_samples` kept samples are
            used (applied before shuffling).
        shuffle: whether to randomly permute the kept samples.
        normalize: whether to standardise each input feature to zero mean and
            unit standard deviation, computed over all kept samples and
            timesteps (padded timesteps included).
        censor: target value that marks a censored reading.

    Returns:
        dict with keys train_x, train_y, train_lengths, val_x, val_y,
        val_lengths, feature_names and target_names.

    Raises:
        AssertionError: if `target_keys` does not contain exactly one key.
        KeyError: if a requested feature or target name is not in the file.
    """
    # only a single target key implemented for now
    assert len(target_keys) == 1
    # Copy the key lists so the returned dict never aliases the mutable
    # default arguments.
    feature_keys = list(feature_keys)
    target_keys = list(target_keys)
    k = target_keys[0]

    # The context manager guarantees the file handle is closed; `[()]` replaces
    # the `Dataset.value` accessor that newer h5py versions no longer provide.
    with h5py.File(filepath, 'r') as infile:
        # select from the different roadways
        if traj_key is None:
            x = np.vstack([infile[key][()] for key in infile.keys()])
        else:
            x = infile[traj_key][()]
        # attribute strings may come back as bytes depending on how they
        # were stored, so normalise them to str for name lookup
        all_names = [
            name.decode('utf-8') if isinstance(name, bytes) else str(name)
            for name in infile.attrs['feature_names']
        ]

    # enforce max_len
    if max_len is not None:
        x = x[:, :max_len, :]

    # Name-based column selection with plain NumPy indexing (pd.Panel has been
    # removed from pandas).
    name_to_col = {name: col for col, name in enumerate(all_names)}
    missing = [key for key in feature_keys + [k] if key not in name_to_col]
    if missing:
        raise KeyError('unknown feature name(s): {}'.format(missing))
    feature_cols = [name_to_col[key] for key in feature_keys]

    features = x[:, :, feature_cols]
    targets = x[:, :, name_to_col[k]]
    lengths = np.array(compute_lengths(features))

    # remove samples with too many censored values
    valid_sample_idxs = []
    for j, l in enumerate(lengths):
        if l == 0:
            # nothing but padding: no ratio can be computed, so drop it
            continue
        censored_ratio = np.mean(targets[j, :l] == censor)
        if censored_ratio < max_censor_ratio:
            valid_sample_idxs.append(j)
    valid_sample_idxs = np.array(valid_sample_idxs, dtype=int)

    # debugging size
    if max_samples is not None:
        valid_sample_idxs = valid_sample_idxs[:max_samples]

    # shuffle
    if shuffle:
        permute_idxs = np.random.permutation(len(valid_sample_idxs))
        valid_sample_idxs = valid_sample_idxs[permute_idxs]

    lengths = lengths[valid_sample_idxs]
    x = features[valid_sample_idxs]

    # discretize the targets
    y = np.digitize(
        targets[valid_sample_idxs],
        binedges,
        right=True
    ).astype(int)

    # normalize features: statistics are taken over the sample and timestep
    # axes so every feature column is standardised on its own. (Reducing over
    # axes (1, 2) instead would mix all features within a sample.)
    # Not done in place so integer-typed inputs are promoted to float.
    if normalize and x.size > 0:
        x = x - np.mean(x, axis=(0, 1), keepdims=True)
        x = x / (np.std(x, axis=(0, 1), keepdims=True) + 1e-8)

    # train / val split
    train_idx = int(len(valid_sample_idxs) * train_ratio)

    data = dict(
        train_x=x[:train_idx],
        train_y=y[:train_idx],
        train_lengths=lengths[:train_idx],
        val_x=x[train_idx:],
        val_y=y[train_idx:],
        val_lengths=lengths[train_idx:],
        feature_names=feature_keys,
        target_names=target_keys,
    )

    return data

In [None]:
data_1 = load_ngsim_trajectory_data(filepath, target_keys=['lidar_1'])

In [None]:
data_5 = load_ngsim_trajectory_data(filepath, target_keys=['lidar_5'])

In [None]:
data_10 = load_ngsim_trajectory_data(filepath, target_keys=['lidar_10'])

In [None]:
def compute_changes(arr):
    """Return the fraction of adjacent element pairs that differ.

    Every row of `arr` is scanned for consecutive (previous, current) pairs;
    the result is the number of pairs whose two elements differ divided by
    the total number of pairs over all rows.

    Args:
        arr: iterable of sliceable sequences (e.g. a 2-D array or list of
            lists).

    Returns:
        The change ratio as a float, or 0.0 when there are no adjacent pairs
        at all (empty input, or every row shorter than two elements).
    """
    total, count = 0, 0
    for row in arr:
        for prev, cur in zip(row, row[1:]):
            total += 1
            if prev != cur:
                count += 1
    # no pairs means no changes; avoid dividing by zero
    if total == 0:
        return 0.0
    return count / total

In [None]:
print(compute_changes(data_1['train_y']))
print(compute_changes(data_5['train_y']))
print(compute_changes(data_10['train_y']))

In [None]:
# NOTE(review): `data` was never assigned anywhere in this notebook (only
# data_1 / data_5 / data_10 are), so this cell failed on a fresh kernel.
# Binding it to data_10 is an assumption — TODO confirm which dataset was meant.
data = data_10
print(len(data['train_y']))
print(len(data['train_x']))

In [None]:
for i in range(5):
    print('i: {} #: {}'.format(i, len(np.where(data['train_y'] == i)[0])))

In [None]:
np.size(data['train_y'])

In [None]:
10563 / 239238

In [None]:
# NOTE(review): this looks like a snippet pasted from a class method — it reads
# and writes `self.*`, and the last two lines still carry deeper indentation
# than the first, so the cell cannot run as-is at notebook top level. `tf` is
# also only imported in a later cell. The expression masks per-step matches
# between argmax(scores) and targets, then divides their sum by the summed
# lengths.
same = tf.equal(tf.cast(np.argmax(self.scores, axis=-1), tf.int32), self.targets)
        same = tf.cast(same, tf.float32) * self.sequence_mask
        self.acc = tf.reduce_sum(same) / tf.reduce_sum(tf.cast(self.lengths, tf.float32))

In [None]:
import tensorflow as tf

In [None]:

scores = np.array([[1,2],[2,1],[1,2]])
targets = np.array([1,0,0])
# Compare against the local `targets` (there is no `self` at notebook top
# level); both sides are cast to int32 so the dtypes match explicitly.
same = tf.equal(tf.cast(np.argmax(scores, axis=-1), tf.int32), tf.cast(targets, tf.int32))

In [None]:
import sklearn.metrics

In [None]:
t = [0,1,2]
p = [0,0,0]
print(sklearn.metrics.precision_recall_fscore_support(t, p, average='micro'))

In [None]:
list(range(2,5))+ [1,1,1]

In [None]:
list(np.random.randint(low=0, high=5, size=10))

In [None]:
def compute_batch_idxs(start, batch_size, size):
    """Return `batch_size` indices into a collection of `size` items.

    Indices run consecutively from `start`. If the batch would run past the
    end of the collection, the missing positions are filled with indices
    drawn uniformly at random from [0, size). If `start` is already at or
    past the end, the whole batch is drawn at random.

    Args:
        start: index of the first item of the batch.
        batch_size: number of indices to return.
        size: total number of items available.

    Returns:
        A list of indices.
    """
    def _random_fill(count):
        # `count` indices sampled with replacement from the full range
        return list(np.random.randint(low=0, high=size, size=count))

    if start >= size:
        return _random_fill(batch_size)

    stop = start + batch_size
    overflow = stop - size
    if overflow <= 0:
        return list(range(start, stop))

    return list(range(start, size)) + _random_fill(overflow)

In [None]:
compute_batch_idxs(7, 1, 8)

In [None]:
np.random.randint(0,2,10)

In [None]:
import collections

In [None]:
ctr = collections.Counter([1,2,3,4,4])
ctr.most_common(1)

In [57]:
import sklearn.dummy
import numpy as np
import sklearn.metrics

# Chance-level baseline: with the 'stratified' strategy the dummy classifier
# draws predictions at random according to the class distribution seen in fit.
# `strategy` is passed by keyword because current scikit-learn releases do not
# accept it positionally.
c = sklearn.dummy.DummyClassifier(strategy='stratified')
nclasses = 5
targets = np.random.randint(0,nclasses,size=1000)
# The dummy classifier ignores feature values, so the targets reshaped into a
# column serve as a placeholder input for both fit and predict.
c.fit(targets.reshape(-1,1), targets)
preds = c.predict(targets.reshape(-1,1))
sklearn.metrics.precision_recall_fscore_support(targets, preds, average=None)


(array([ 0.16766467,  0.22580645,  0.17708333,  0.21296296,  0.19230769]),
 array([ 0.15384615,  0.24019608,  0.17708333,  0.2081448 ,  0.19900498]),
 array([ 0.16045845,  0.2327791 ,  0.17708333,  0.21052632,  0.19559902]),
 array([182, 204, 192, 221, 201]))