In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import math
from numpy.random import default_rng
import time

from sklearn import preprocessing

In [2]:

def read_data_from_file(file_name, split=1):
    
    train_file = file_name
    df = pd.read_csv(train_file, sep=',')

    # Select a percentage from full dataset (used in debugging)
    df = df.head(math.floor(df.shape[0]*split))

    return df


def filter_data(df):
    
    # Select cars only
    df = df.loc[df['agent_type'] == "car"]
    return df

def filter_length(df):            
     
    # Select only trajectories with the same length=40 and in the same time phase
 
    g = df.groupby(["object_id"])
 
    g_equal_len = g.filter(lambda x: x['frame_id'].count() == 40) # This works because the recordings are done with a timestamp_max =4000   # and (x['timestamp_ms'].ge(100).any() and x['timestamp_ms'].le(4000).any()))

    return g_equal_len


In [3]:
def preprocess_dataframe(df):
    # Create a synthetic index variable for easier indexing and search
    df['object_id']=df['case_id'].astype(int).astype(str)+"-"+df['track_id'].astype(str)

    # Convert to unique object identifier 
    le = preprocessing.LabelEncoder()
    df['object_id'] = le.fit_transform(df['object_id'])

    # Select only vehicles
    df = filter_data(df)

    # Select only same length sequences
    df = filter_length(df)

    # Order by object_id
    group = df.groupby(["object_id"])
    cars = list()

    for ix,seq in group:
        sub_seq = seq.to_numpy()
        cars.append(sub_seq)

    cars = np.array(cars)
    cars = cars.reshape(-1,cars.shape[2])   
    df = pd.DataFrame(data = cars, columns = df.columns)

    df = df.iloc[0::2]  #sub-sampling to 5hz, skip one row

    # Add a sequential timestamp required by GluonTS library
    #timestamp = pd.date_range("2023-01-01", periods=df.shape[0], freq="S")
    #df['timestamp'] = timestamp

    # Keep only important features
    df = df.drop(columns=['track_id','timestamp_ms','agent_type','length','width','frame_id'])

    # Reorder columns
    df = df[['case_id', 'object_id','x','y', 'vx', 'vy', 'psi_rad']]

    return df


In [4]:
# Read train dataset
df = read_data_from_file('/content/drive/Othercomputers/My Laptop/github-repositories/diffusion-trajectory-forecasting/data/DR_USA_Intersection_MA/DR_USA_Intersection_MA.csv', split=1)
df = preprocess_dataframe(df)
df.to_csv('/content/drive/Othercomputers/My Laptop/github-repositories/diffusion-trajectory-forecasting/data/DR_USA_Intersection_MA/_train.csv',  index = False)

  for ix,seq in group:


In [5]:
# Read test dataset
df = read_data_from_file('/content/drive/Othercomputers/My Laptop/github-repositories/diffusion-trajectory-forecasting/data/DR_USA_Intersection_MA/DR_USA_Intersection_MA_val.csv', split=1)
df = preprocess_dataframe(df)
df.to_csv('/content/drive/Othercomputers/My Laptop/github-repositories/diffusion-trajectory-forecasting/data/DR_USA_Intersection_MA/_val.csv',  index = False)

  for ix,seq in group:
