In [1]:
path = "data/german_1/german_1.txt"
output_dir = "output/"

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [3]:
df = pd.read_csv(path, sep=';')
df.time = pd.to_datetime(df.time)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1867920 entries, 0 to 1867919
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   tag_id       object        
 1   time         datetime64[ns]
 2   x            float64       
 3   y            float64       
 4   description  object        
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 71.3+ MB


In [5]:
df.head()

Unnamed: 0,tag_id,time,x,y,description
0,0x00205FFF28DB,2019-08-31 19:58:06,39.97,21.8,Cart
1,0x00205FFF28DB,2019-08-31 19:58:02,40.76,18.68,Cart
2,0x00205FFF28DB,2019-08-31 19:57:45,40.76,5.42,Cart
3,0x00205FFF28DB,2019-08-31 19:57:41,40.76,3.39,Cart
4,0x00205FFF28DB,2019-08-31 19:57:10,40.76,2.75,Cart


In [6]:
df.duplicated(keep=False).sum()

376

In [7]:
df = df[~df.duplicated(keep="first")]

In [8]:
df.duplicated(subset=["tag_id","time"],keep="first").sum()

86185

In [9]:
df = df[~df.duplicated(subset=["tag_id","time"],keep="first")]

In [10]:
df.shape

(1781547, 5)

In [11]:
df = df.sort_values(by="time")

# Generating Trajectories

In [21]:
'''
Generate the  trajectories using the raw data

params:
Data: Dataframe, Default = None
resting_threshold: float, Default = inf, Threshold to determine when to start new trajectory after resting X min.
traj_min_len: int, Default = float, Minimum length of trajectories to be considered.

Note: The new trajectory start with the new day
'''
def generateTrajectories(data, resting_threshold = np.inf, traj_min_len = 0):
    data = data.sort_values(by="time")
    resting_threshold_sec = resting_threshold * 60 #convert minutes to seconds
    
    traj_id = 1
    traj = []
    curr_traj = [] #to make the current traj
    
    for tag_id in tqdm(data.tag_id.unique()):
        tag_id_data = data[data.tag_id == tag_id]
        
        start = True #To check if the new trajectory is starting or not
        curr_traj = [{"tag_id": tag_id_data.iloc[0]["tag_id"], "time":  tag_id_data.iloc[0]["time"], "x":  tag_id_data.iloc[0]["x"],
                            "y":  tag_id_data.iloc[0]["y"], "description":  tag_id_data.iloc[0]["description"], "traj_id": traj_id, "velocity": (0,0), "type": "start"}]
        
        for index in tqdm(range(1, tag_id_data.shape[0],1)):
            time_diff = (tag_id_data.iloc[index]["time"] - tag_id_data.iloc[index-1]["time"]).total_seconds() #time difference
            velocity_x = (tag_id_data.iloc[index]["x"] -  tag_id_data.iloc[index-1]["x"]) / time_diff #calculate the x velocity
            velocity_y = (tag_id_data.iloc[index]["y"] -  tag_id_data.iloc[index-1]["y"]) / time_diff #calculate the y velocity
                
            if (tag_id_data.iloc[index]["y"] < 0 or tag_id_data.iloc[index]["x"] < 0 ) or (tag_id_data.iloc[index-1]["time"].date() < tag_id_data.iloc[index]["time"].date()) or time_diff > resting_threshold_sec:  
                curr_traj[-1]["type"] = "end" #last one to end
                if len(curr_traj) >= traj_min_len:
                    traj_id += 1 #increment trajectory id
                    traj.extend(curr_traj)
                    
                curr_traj = [{"tag_id": tag_id_data.iloc[index]["tag_id"], "time":  tag_id_data.iloc[index]["time"], "x":  tag_id_data.iloc[index]["x"],
                            "y":  tag_id_data.iloc[index]["y"], "description":  tag_id_data.iloc[index]["description"], "traj_id": traj_id, "velocity": (0,0), "type": "start"}]
            else:
                curr_traj.append({"tag_id": tag_id_data.iloc[index]["tag_id"], "time":  tag_id_data.iloc[index]["time"], "x":  tag_id_data.iloc[index]["x"],
                            "y":  tag_id_data.iloc[index]["y"], "description":  tag_id_data.iloc[index]["description"], "traj_id": traj_id, "velocity": (velocity_x,velocity_y), 
                             "type": "intermediate"})
        curr_traj[-1]["type"] = "end" #last one to end
        if len(curr_traj) >= traj_min_len:
            traj.extend(curr_traj)
        
    csv = pd.DataFrame.from_dict(traj) #convert the dictionary to dataframe
    csv.to_csv(f"{output_dir}/tajectories_{resting_threshold}mins_{traj_min_len}len.csv", sep=";", index=False) #save the dataframe to file

In [22]:
generateTrajectories(df, 0.0833333, 20) #5 sec

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/79813 [00:00<?, ?it/s]

  0%|          | 0/23868 [00:00<?, ?it/s]

  0%|          | 0/20828 [00:00<?, ?it/s]

  0%|          | 0/66925 [00:00<?, ?it/s]

  0%|          | 0/76526 [00:00<?, ?it/s]

  0%|          | 0/78757 [00:00<?, ?it/s]

  0%|          | 0/43376 [00:00<?, ?it/s]

  0%|          | 0/26030 [00:00<?, ?it/s]

  0%|          | 0/63214 [00:00<?, ?it/s]

  0%|          | 0/70375 [00:00<?, ?it/s]

  0%|          | 0/53583 [00:00<?, ?it/s]

  0%|          | 0/46371 [00:00<?, ?it/s]

  0%|          | 0/39926 [00:00<?, ?it/s]

  0%|          | 0/65331 [00:00<?, ?it/s]

  0%|          | 0/61787 [00:00<?, ?it/s]

  0%|          | 0/35729 [00:00<?, ?it/s]

  0%|          | 0/62975 [00:00<?, ?it/s]

  0%|          | 0/81205 [00:00<?, ?it/s]

  0%|          | 0/60061 [00:00<?, ?it/s]

  0%|          | 0/12494 [00:00<?, ?it/s]

  0%|          | 0/16287 [00:00<?, ?it/s]

  0%|          | 0/45317 [00:00<?, ?it/s]

  0%|          | 0/65556 [00:00<?, ?it/s]

  0%|          | 0/67749 [00:00<?, ?it/s]

  0%|          | 0/25570 [00:00<?, ?it/s]

  0%|          | 0/23750 [00:00<?, ?it/s]

  0%|          | 0/11553 [00:00<?, ?it/s]

  0%|          | 0/52051 [00:00<?, ?it/s]

  0%|          | 0/40028 [00:00<?, ?it/s]

  0%|          | 0/12560 [00:00<?, ?it/s]

  0%|          | 0/25664 [00:00<?, ?it/s]

  0%|          | 0/24877 [00:00<?, ?it/s]

  0%|          | 0/671 [00:00<?, ?it/s]

  0%|          | 0/36956 [00:00<?, ?it/s]

  0%|          | 0/41311 [00:00<?, ?it/s]

  0%|          | 0/366 [00:00<?, ?it/s]

  0%|          | 0/68373 [00:00<?, ?it/s]

  0%|          | 0/66051 [00:00<?, ?it/s]

  0%|          | 0/31941 [00:00<?, ?it/s]

  0%|          | 0/52589 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/405 [00:00<?, ?it/s]

  0%|          | 0/2521 [00:00<?, ?it/s]

In [65]:
def interpolateData(df):
    df.time = pd.to_datetime(df.time)
    df = df.set_index("time")
    x = df.x.resample("1s").mean()
    y = df.y.resample("1s").mean()
    t = df.type = df.type.resample("1s").last()
    df = df.resample("1s").ffill()
    df.x = x
    df.y = y
    df.type = t
    df.type = df.type.fillna("intermediate")
    df = df.interpolate()
    return df.reset_index()

In [50]:
data = pd.read_csv("output/tajectories_0.0833333mins_20len.csv", sep=";")

In [74]:
pd.options.mode.chained_assignment = None
uniqueTrajectories = data.traj_id.unique()
interpolatedDataframe = pd.DataFrame(columns=data.columns)
for traj_id in tqdm(uniqueTrajectories[:]):
    interpolatedDataframe = pd.concat([interpolatedDataframe, interpolateData(data[data.traj_id == traj_id])],axis=0)

  0%|          | 0/16560 [00:00<?, ?it/s]

In [75]:
interpolatedDataframe

Unnamed: 0,tag_id,time,x,y,description,traj_id,velocity,type
0,0x002060000F45,2019-08-01 08:00:00,8.5600,10.4400,Cart,1,"(0, 0)",start
1,0x002060000F45,2019-08-01 08:00:01,7.5100,10.5050,Cart,1,"(0, 0)",intermediate
2,0x002060000F45,2019-08-01 08:00:02,6.4600,10.5700,Cart,1,"(-1.0500000000000003, 0.06500000000000039)",intermediate
3,0x002060000F45,2019-08-01 08:00:03,5.7900,10.5600,Cart,1,"(-0.6699999999999999, -0.009999999999999787)",intermediate
4,0x002060000F45,2019-08-01 08:00:04,4.8650,10.5450,Cart,1,"(-0.6699999999999999, -0.009999999999999787)",intermediate
...,...,...,...,...,...,...,...,...
27,0x002060003578,2019-08-31 10:10:12,15.2850,19.3450,Cart,16560,"(-0.4850000000000003, -0.13000000000000078)",intermediate
28,0x002060003578,2019-08-31 10:10:13,15.1925,19.2825,Cart,16560,"(-0.4850000000000003, -0.13000000000000078)",intermediate
29,0x002060003578,2019-08-31 10:10:14,15.1000,19.2200,Cart,16560,"(-0.09250000000000025, -0.0625)",intermediate
30,0x002060003578,2019-08-31 10:10:15,15.2100,19.1450,Cart,16560,"(-0.09250000000000025, -0.0625)",intermediate


In [76]:
temp = interpolatedDataframe.sort_values(by="time")
temp.to_csv("output/resampling_upscale_0.0833333mins_20len.csv",sep=";", index=False)

In [3]:
data = pd.read_csv("output/resampling_upscale_0.0833333mins_20len.csv",sep=";")

In [10]:
labels, idx = pd.factorize(data.time)

In [11]:
newDataframe = pd.DataFrame(columns=["Frame", "Ped_ID", "x","y"])

In [12]:
newDataframe.Frame = labels

In [13]:
newDataframe.Ped_ID = data.traj_id

In [14]:
newDataframe.x = data.x
newDataframe.y = data.y

In [17]:
newDataframe = newDataframe.sort_values(by="Frame")

In [19]:
newDataframe.to_csv("output/resampling_upscale_0.0833333mins_20len_FORMATED.csv",header=False, index=False, sep="\t")

In [20]:
newDataframe

Unnamed: 0,Frame,Ped_ID,x,y
0,0,1,8.560,10.440
1,1,1,7.510,10.505
2,2,1,6.460,10.570
3,3,1,5.790,10.560
4,4,1,4.865,10.545
...,...,...,...,...
801408,562304,15795,26.060,11.440
801409,562305,15795,27.110,11.800
801410,562306,15795,28.130,11.330
801411,562307,15795,28.575,11.290


In [35]:
filtered_data = newDataframe.groupby(by="Frame").filter(lambda x: x.Frame.count()>1)

In [37]:
filtered_data.to_csv("output/resampling_upscale_0.0833333mins_20len_FORMATED_FILTERED.csv",header=False, index=False, sep="\t")