In [1]:
import pandas as pd
# import dgl
import torch
import numpy as np
import math
# Encode categorical features
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [2]:
data_folder = '/data/home/umang/Trajectory_project/anomaly_traj_data/numosim/'
train_file_name = 'stay_points_train.parquet'
poi_filename = 'poi.parquet'
train_df = pd.read_parquet(f'{data_folder}/{train_file_name}')
poi_df = pd.read_parquet(f'{data_folder}/{poi_filename}')
train_poi_df = pd.merge(train_df, poi_df, on='poi_id')

In [3]:
# Convert all column names to lowercase
train_df.columns = train_df.columns.str.lower()

# Display the DataFrame to verify the changes
train_df.head()

Unnamed: 0,agent_id,poi_id,start_datetime,end_datetime
0,1,1518791,2024-01-01 00:00:00-08:00,2024-01-01 11:36:59-08:00
1,1,1912553,2024-01-01 11:51:32-08:00,2024-01-01 12:33:47-08:00
2,1,1518791,2024-01-01 12:48:23-08:00,2024-01-01 13:06:35-08:00
3,1,103611,2024-01-01 13:22:50-08:00,2024-01-01 14:05:50-08:00
4,1,1518791,2024-01-01 14:21:28-08:00,2024-01-01 15:22:44-08:00


In [4]:
train_df

Unnamed: 0,agent_id,poi_id,start_datetime,end_datetime
0,1,1518791,2024-01-01 00:00:00-08:00,2024-01-01 11:36:59-08:00
1,1,1912553,2024-01-01 11:51:32-08:00,2024-01-01 12:33:47-08:00
2,1,1518791,2024-01-01 12:48:23-08:00,2024-01-01 13:06:35-08:00
3,1,103611,2024-01-01 13:22:50-08:00,2024-01-01 14:05:50-08:00
4,1,1518791,2024-01-01 14:21:28-08:00,2024-01-01 15:22:44-08:00
...,...,...,...,...
17261433,200000,329705,2024-01-27 08:24:08-08:00,2024-01-27 09:24:42-08:00
17261434,200000,408965,2024-01-27 09:38:34-08:00,2024-01-27 10:19:04-08:00
17261435,200000,402449,2024-01-27 10:30:15-08:00,2024-01-27 13:07:01-08:00
17261436,200000,337944,2024-01-27 13:21:36-08:00,2024-01-27 14:09:46-08:00


In [5]:
train_poi_df.head()
# get the number of unique values in each column
cat_cols = ['latitude', 'longitude', 'poi_id', 'name']
cat_dims = [train_poi_df[col].nunique() for col in cat_cols]

In [6]:
cat_dims

[686406, 686408, 800485, 237362]

In [7]:
#Create a unique node index for each visit
train_poi_df = train_poi_df.reset_index(drop=True)
train_poi_df['node_idx'] = train_poi_df.index

In [8]:
train_poi_df

Unnamed: 0,agent_id,poi_id,start_datetime,end_datetime,name,latitude,longitude,act_types,node_idx
0,1,1518791,2024-01-01 00:00:00-08:00,2024-01-01 11:36:59-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",0
1,1,1518791,2024-01-01 12:48:23-08:00,2024-01-01 13:06:35-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",1
2,1,1518791,2024-01-01 14:21:28-08:00,2024-01-01 15:22:44-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",2
3,1,1518791,2024-01-01 17:32:54-08:00,2024-01-02 09:31:12-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",3
4,1,1518791,2024-01-02 13:51:05-08:00,2024-01-03 11:25:38-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",4
...,...,...,...,...,...,...,...,...,...
17261433,200000,2885338,2024-01-24 16:22:26-08:00,2024-01-24 19:28:16-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261433
17261434,200000,2885338,2024-01-24 21:06:29-08:00,2024-01-25 06:27:41-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261434
17261435,200000,2885338,2024-01-25 15:18:47-08:00,2024-01-26 06:53:26-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261435
17261436,200000,2885338,2024-01-26 17:56:49-08:00,2024-01-27 08:08:48-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261436


In [9]:
#Print corresponding names for each unique poi_id
poi_names = train_poi_df[['poi_id', 'name']].drop_duplicates()
poi_names



Unnamed: 0,poi_id,name
0,1518791,residence
1752,1912553,residence
2023,103611,Chevron
2324,247124,Roving Karaoke Studio
2418,251512,Dixie Hollywood
...,...,...
17261289,393516,Hastings Oaks Club House
17261290,858075,residence
17261355,156776,Petrozone gas
17261356,758742,residence


In [10]:
# get the number of poi_ids for each unique 'name'
poi_name_counts = train_poi_df.groupby('name')['poi_id'].nunique()

# print the number of poi_ids for each unique 'name' descending
poi_name_counts = poi_name_counts.sort_values(ascending=False)
poi_name_counts

name
residence                                514177
Starbucks                                   564
7-Eleven                                    528
Metabank                                    493
ATM                                         461
                                          ...  
Garden Caf at Pilgrim Place                   1
Garden Cafe at Norton Simon Museum            1
Garden Center                                 1
Garden Chapel (Faith Lutheran Church)         1
                                           1
Name: poi_id, Length: 237362, dtype: int64

In [11]:
# Take differnt entries with name "Starbucks" which have different poi_ids
x= train_poi_df.groupby('name').get_group('Starbucks')

# get unique poi_ids for x and print corresponding entries
x.groupby('poi_id').head(1)

Unnamed: 0,agent_id,poi_id,start_datetime,end_datetime,name,latitude,longitude,act_types,node_idx
10759,11,324157,2024-01-14 20:06:10-08:00,2024-01-14 20:56:53-08:00,Starbucks,34.150546,-118.073148,"[2, 7]",10759
65179,110,325737,2024-01-02 09:10:46-08:00,2024-01-02 10:24:06-08:00,Starbucks,34.125039,-118.057616,"[2, 7]",65179
93927,163,149511,2024-01-23 11:01:51-08:00,2024-01-23 13:23:03-08:00,Starbucks,33.987014,-118.225554,"[2, 7]",93927
109948,193,209026,2024-01-07 18:18:11-08:00,2024-01-07 20:18:19-08:00,Starbucks,34.308749,-118.431488,"[2, 7]",109948
214880,386,356742,2024-01-05 18:16:52-08:00,2024-01-05 19:35:53-08:00,Starbucks,33.902705,-118.055044,"[2, 7]",214880
...,...,...,...,...,...,...,...,...,...
17022681,193678,22641,2024-01-05 11:57:35-08:00,2024-01-05 13:24:42-08:00,Starbucks,33.759034,-117.989295,"[2, 7]",17022681
17034805,193996,276994,2024-01-03 13:06:37-08:00,2024-01-03 13:11:37-08:00,Starbucks,34.234422,-118.255933,"[2, 7]",17034805
17116313,196159,395865,2024-01-03 18:33:24-08:00,2024-01-03 18:38:24-08:00,Starbucks,34.170039,-118.111773,"[2, 7]",17116313
17126662,196434,418176,2024-01-01 16:29:41-08:00,2024-01-01 17:32:12-08:00,Starbucks,34.581061,-118.141162,"[2, 7]",17126662


In [12]:
#get all unique act_Types for train_poi_df
act_types = train_poi_df['act_types']
# Flatten and get unique values
unique_act_types = set(tuple(sorted(sublist)) for sublist in act_types)
unique_act_types

{(0, 2, 9, 15),
 (0, 2, 15),
 (1, 11, 15),
 (2,),
 (2, 3, 15),
 (2, 4, 15),
 (2, 5),
 (2, 5, 7),
 (2, 5, 7, 15),
 (2, 6, 14, 15),
 (2, 6, 15),
 (2, 7),
 (2, 7, 10, 15),
 (2, 8, 15),
 (2, 9),
 (2, 9, 10),
 (2, 9, 15),
 (2, 10, 15),
 (2, 12, 15),
 (2, 14, 15),
 (2, 15),
 (7,),
 (9, 10, 15),
 (9, 15),
 (10,),
 (13,),
 (14,),
 (14, 15)}

In [13]:
# #- Transportation = 0
# - 1 = Home
# - 2 = Work
# - 3 = School
# - 4 = ChildCare
# - 5 = BuyGoods
# - 6 = Services
# - 7 = EatOut
# - 8 = Errands
# - 9 = Recreation
# - 10 = Exercise
# - 11 = Visit
# - 12 = HealthCare
# - 13 = Religious
# - 14 = SomethingElse
# - 15 = DropOff

# Each of the element in 1 act type is one of these numbers. Each number has a corresponding category name.
# For example, 1 corresponds to 'Home', 2 corresponds to 'Work' and so on.

# Map unique act types to corresponding category names
list_of_categories = ['Transportation', 'Home', 'Work', 'School', 'ChildCare', 'BuyGoods', 'Services', 'EatOut', 'Errands', 'Recreation', 'Exercise', 'Visit', 'HealthCare', 'Religious', 'SomethingElse', 'DropOff']
# Instead of numbers replace it with cateogry names for unique act types
act_type_category = {}

for act_type in unique_act_types:
    act_type_category[act_type] = [list_of_categories[i] for i in act_type]

# 
act_type_category

{(2, 4, 15): ['Work', 'ChildCare', 'DropOff'],
 (0, 2, 9, 15): ['Transportation', 'Work', 'Recreation', 'DropOff'],
 (2,): ['Work'],
 (2, 7, 10, 15): ['Work', 'EatOut', 'Exercise', 'DropOff'],
 (2, 8, 15): ['Work', 'Errands', 'DropOff'],
 (14,): ['SomethingElse'],
 (2, 10, 15): ['Work', 'Exercise', 'DropOff'],
 (2, 5, 7): ['Work', 'BuyGoods', 'EatOut'],
 (2, 5): ['Work', 'BuyGoods'],
 (2, 12, 15): ['Work', 'HealthCare', 'DropOff'],
 (2, 3, 15): ['Work', 'School', 'DropOff'],
 (2, 14, 15): ['Work', 'SomethingElse', 'DropOff'],
 (9, 10, 15): ['Recreation', 'Exercise', 'DropOff'],
 (2, 9, 10): ['Work', 'Recreation', 'Exercise'],
 (2, 6, 14, 15): ['Work', 'Services', 'SomethingElse', 'DropOff'],
 (7,): ['EatOut'],
 (2, 5, 7, 15): ['Work', 'BuyGoods', 'EatOut', 'DropOff'],
 (10,): ['Exercise'],
 (13,): ['Religious'],
 (14, 15): ['SomethingElse', 'DropOff'],
 (2, 7): ['Work', 'EatOut'],
 (0, 2, 15): ['Transportation', 'Work', 'DropOff'],
 (1, 11, 15): ['Home', 'Visit', 'DropOff'],
 (2, 9, 15

In [14]:
# Get the number of unique poi_id for name 'residence'
poi_names[poi_names['name'] == 'residence']
# PRINT THE NUMBER OF UNIQUE POI_ID FOR NAME 'RESIDENCE'
poi_names[poi_names['name'] == 'residence'].shape[0]

514177

In [15]:
name_encoder = LabelEncoder()
train_poi_df['name_encoded'] = name_encoder.fit_transform(train_poi_df['name'])

agent_encoder = LabelEncoder()
train_poi_df['agent_id_encoded'] = agent_encoder.fit_transform(train_poi_df['agent_id'])

poi_encoder = LabelEncoder()
train_poi_df['poi_id_encoded'] = poi_encoder.fit_transform(train_poi_df['poi_id'])

In [16]:
train_poi_df

Unnamed: 0,agent_id,poi_id,start_datetime,end_datetime,name,latitude,longitude,act_types,node_idx,name_encoded,agent_id_encoded,poi_id_encoded
0,1,1518791,2024-01-01 00:00:00-08:00,2024-01-01 11:36:59-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",0,236159,0,501237
1,1,1518791,2024-01-01 12:48:23-08:00,2024-01-01 13:06:35-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",1,236159,0,501237
2,1,1518791,2024-01-01 14:21:28-08:00,2024-01-01 15:22:44-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",2,236159,0,501237
3,1,1518791,2024-01-01 17:32:54-08:00,2024-01-02 09:31:12-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",3,236159,0,501237
4,1,1518791,2024-01-02 13:51:05-08:00,2024-01-03 11:25:38-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",4,236159,0,501237
...,...,...,...,...,...,...,...,...,...,...,...,...
17261433,200000,2885338,2024-01-24 16:22:26-08:00,2024-01-24 19:28:16-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261433,236159,199999,791608
17261434,200000,2885338,2024-01-24 21:06:29-08:00,2024-01-25 06:27:41-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261434,236159,199999,791608
17261435,200000,2885338,2024-01-25 15:18:47-08:00,2024-01-26 06:53:26-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261435,236159,199999,791608
17261436,200000,2885338,2024-01-26 17:56:49-08:00,2024-01-27 08:08:48-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261436,236159,199999,791608


In [17]:
train_poi_df['duration'] = (train_poi_df['end_datetime'] - train_poi_df['start_datetime']).dt.total_seconds()
# Convert duration from seconds to hours
train_poi_df['duration_hour'] = (train_poi_df['duration'] / 3600).astype(int)
lat_mean = train_poi_df['latitude'].mean()
lat_std = train_poi_df['latitude'].std()
lon_mean = train_poi_df['longitude'].mean()
lon_std = train_poi_df['longitude'].std()
duration_mean = train_poi_df['duration_hour'].mean()
duration_std = train_poi_df['duration_hour'].std()  

In [18]:
#data = train_poi_df[:1000].copy()
data=train_poi_df.copy()

In [19]:
# for all unique act_types, create a crresponding hashmap
act_type_to_id = {tuple(sorted(act_type)): idx for idx, act_type in enumerate(unique_act_types)}
print(act_type_to_id)


{(2, 4, 15): 0, (0, 2, 9, 15): 1, (2,): 2, (2, 7, 10, 15): 3, (2, 8, 15): 4, (14,): 5, (2, 10, 15): 6, (2, 5, 7): 7, (2, 5): 8, (2, 12, 15): 9, (2, 3, 15): 10, (2, 14, 15): 11, (9, 10, 15): 12, (2, 9, 10): 13, (2, 6, 14, 15): 14, (7,): 15, (2, 5, 7, 15): 16, (10,): 17, (13,): 18, (14, 15): 19, (2, 7): 20, (0, 2, 15): 21, (1, 11, 15): 22, (2, 9, 15): 23, (9, 15): 24, (2, 9): 25, (2, 6, 15): 26, (2, 15): 27}


In [20]:
# Create a new column 'act_type_id' in the DataFrame
data['act_type_id'] = data['act_types'].apply(lambda x: act_type_to_id[tuple(sorted(x))])

In [21]:
data

Unnamed: 0,agent_id,poi_id,start_datetime,end_datetime,name,latitude,longitude,act_types,node_idx,name_encoded,agent_id_encoded,poi_id_encoded,duration,duration_hour,act_type_id
0,1,1518791,2024-01-01 00:00:00-08:00,2024-01-01 11:36:59-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",0,236159,0,501237,41819.0,11,22
1,1,1518791,2024-01-01 12:48:23-08:00,2024-01-01 13:06:35-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",1,236159,0,501237,1092.0,0,22
2,1,1518791,2024-01-01 14:21:28-08:00,2024-01-01 15:22:44-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",2,236159,0,501237,3676.0,1,22
3,1,1518791,2024-01-01 17:32:54-08:00,2024-01-02 09:31:12-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",3,236159,0,501237,57498.0,15,22
4,1,1518791,2024-01-02 13:51:05-08:00,2024-01-03 11:25:38-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",4,236159,0,501237,77673.0,21,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17261433,200000,2885338,2024-01-24 16:22:26-08:00,2024-01-24 19:28:16-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261433,236159,199999,791608,11150.0,3,22
17261434,200000,2885338,2024-01-24 21:06:29-08:00,2024-01-25 06:27:41-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261434,236159,199999,791608,33672.0,9,22
17261435,200000,2885338,2024-01-25 15:18:47-08:00,2024-01-26 06:53:26-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261435,236159,199999,791608,56079.0,15,22
17261436,200000,2885338,2024-01-26 17:56:49-08:00,2024-01-27 08:08:48-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261436,236159,199999,791608,51119.0,14,22


## MAKING MTM DATA

In [22]:
# # put agent id, poi id, latitude, longitude in observations

# # get all the unique agent ids
# unique_agent_ids = train_poi_df['agent_id'].unique()
# # for each unique agent id, get all corresponding poi ids, names, latitudes, longitudes and store it in the format of episode[observations]=shape (1001, 4), dtype float32

# # Initialize the observation dictionary
# observations = {}

# # Get unique agents
# agents = data['agent_id'].unique()

# # Iterate over each agent
# for agent_id in agents:
#     agent_data = data[data['agent_id'] == agent_id]
#     agent_id_append = np.full((len(agent_data), 1), agent_id)
    
#     # Extract POI IDs, names, latitudes, and longitudes
#     #poi_ids = agent_data['poi_id'].values
#     latitudes = agent_data['latitude'].values
#     longitudes = agent_data['longitude'].values
    
#     # get start and end datetimes
#     start_datetimes = agent_data['start_datetime'].values
#     end_datetimes = agent_data['end_datetime'].values

#     # get if weekday or weekend
#     is_weekday = agent_data['start_datetime'].dt.dayofweek < 5
#     is_weekday = is_weekday.astype(int).values.reshape(-1, 1)

#     # get the day of the week, time of day, minute of the hour for start and end datetimes
#     start_hours = agent_data['start_datetime'].dt.hour.values.reshape(-1, 1)
#     start_minutes = agent_data['start_datetime'].dt.minute.values.reshape(-1, 1)
#     end_hours = agent_data['end_datetime'].dt.hour.values.reshape(-1, 1)
#     end_minutes = agent_data['end_datetime'].dt.minute.values.reshape(-1, 1)
#     start_days = agent_data['start_datetime'].dt.dayofweek.values.reshape(-1, 1)
#     end_days = agent_data['end_datetime'].dt.dayofweek.values.reshape(-1, 1)

#     # Combine the extracted data into a single arrqaay
#     combined_data = np.column_stack((agent_id_append, is_weekday, start_days, end_days,
#                                       start_hours, start_minutes, end_hours, end_minutes))
        
#     # Store the combined data in the observations dictionary
#     observations[agent_id] = combined_data

# # # Print the observations
# # for agent_id, obs in observations.items():
# #     print(f"Agent ID: {agent_id}")
# #     print(f"Observations: shape {obs.shape}, dtype {obs.dtype}")
# #     print(obs)

In [None]:
# Initialize the observation dictionary
actions = {}

# Get unique agents
agents = data['agent_id'].unique()

# Group data by agent_id for more efficient processing
for agent_id, agent_data in tqdm(data.groupby('agent_id')):
    # Create agent_id column
    agent_id_append = np.full((len(agent_data), 1), agent_id)
    
    # Extract latitudes and longitudes
    latitudes = agent_data['latitude'].values.reshape(-1, 1)
    longitudes = agent_data['longitude'].values.reshape(-1, 1)
    
    # Get time-related features
    is_weekday = (agent_data['start_datetime'].dt.dayofweek < 5).astype(int).values.reshape(-1, 1)
    
    start_days = agent_data['start_datetime'].dt.dayofweek.values.reshape(-1, 1)
    end_days = agent_data['end_datetime'].dt.dayofweek.values.reshape(-1, 1)
    
    start_hours = agent_data['start_datetime'].dt.hour.values.reshape(-1, 1)
    start_minutes = agent_data['start_datetime'].dt.minute.values.reshape(-1, 1)
    
    end_hours = agent_data['end_datetime'].dt.hour.values.reshape(-1, 1)
    end_minutes = agent_data['end_datetime'].dt.minute.values.reshape(-1, 1)
    
    # Normalize spatial features (latitude and longitude)
    normalized_lat = ((latitudes - lat_mean) / lat_std).reshape(-1, 1)
    normalized_lon = ((longitudes - lon_mean) / lon_std).reshape(-1, 1)
    
    # Add duration feature
    normalized_duration = ((agent_data['duration_hour'].values - duration_mean) / duration_std).reshape(-1, 1)
    
    # Combine all features into a single array
    combined_data = np.hstack((
        agent_id_append, is_weekday, 
        start_days, end_days,
        start_hours, start_minutes, 
        end_hours, end_minutes,
        normalized_lat, normalized_lon,
        normalized_duration
    ))
    
    # Store the combined data
    actions[agent_id] = combined_data


 26%|██▋       | 52779/200000 [01:11<03:11, 769.43it/s]

In [None]:
# Get unique agents
agents = data['agent_id'].unique()

In [None]:
# Create actions using pandas groupby and apply for better performance
observations = {}

# Group data by agent_id and extract act_type_id column
grouped = data.groupby('agent_id')

# This is more efficient than looping through each agent
for agent_id, group in tqdm(grouped):
    observations[agent_id] = group['act_type_id'].values.reshape(-1, 1)

100%|██████████| 200000/200000 [00:13<00:00, 14296.86it/s]


In [None]:
actions

{1: array([[22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [22],
        [ 8],
        [ 8],
        [ 8],
        [ 8],
        [ 8],
        [ 8],
        [ 8],
        [ 8],
        [ 8],
        [ 8],
        [ 8],
        [ 8],
        [ 8],
        [23],
   

In [33]:
# # Iterate over each agent
# for agent_id in tqdm(agents):
#     agent_data = data[data['agent_id'] == agent_id]
    
#     # get activity types for actions
#     actions[agent_id] = agent_data['act_type_id']

In [34]:




# # Print the actions
# for agent_id, act in actions.items():
#     print(f"Agent ID: {agent_id}")
#     print(f"Actions: shape {act.shape}, dtype {act.dtype}")
#     print(act)


In [None]:
# # Get the length of the longest trajectory
# max_length = max(len(obs) for obs in observations.values())
# max_length

221

In [None]:
# # Pad the observations and actions to the maximum length, for the padded regions, use zeros and create attention masks for padded regions
# # Initialize dictionaries to store padded observations and actions
# padded_observations = {}
# padded_actions = {}

# # Initialize dictionaries to store attention masks
# attention_masks = {}

# # Iterate over each agent
# for agent_id in agents:
#     # Get the observations and actions for the current agent
#     obs = observations[agent_id]
#     act = actions[agent_id]
    
#     # Get the length of the current trajectory
#     length = len(obs)
    
#     # Pad the observations and actions to the maximum length
#     padded_obs = np.zeros((max_length, obs.shape[1]), dtype=np.float32)
#     padded_act = np.zeros((max_length, act.shape[1]), dtype=np.float32)
    
#     # Create an attention mask for the padded regions
#     mask = np.zeros(max_length, dtype=np.float32)
    
#     # Fill the padded observations and actions
#     padded_obs[:length] = obs
#     padded_act[:length] = act
#     mask[:length] = 1.0
    
#     # Store the padded observations, actions, and attention masks
#     padded_observations[agent_id] = padded_obs
#     padded_actions[agent_id] = padded_act
#     attention_masks[agent_id] = mask


In [37]:
# CHECK padded observations and actions and attention masks are all same shapes for all agents
# for agent_id, obs in padded_observations.items():
#     print(f"Agent ID: {agent_id}")
#     print(f"Padded Observations: shape {obs.shape}, dtype {obs.dtype}")
#     print(f"Padded Actions: shape {act.shape}, dtype {act.dtype}")
#     print(f"Attention Mask: shape {mask.shape}, dtype {mask.dtype}")

In [None]:
# attention_masks[1]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [None]:
# save the observations and actions in the buffer npz format
# Save the observations and actions
sub_dir='obs28_act1'
save_dir = f'/data/home/umang/Trajectory_project/anomaly_traj_data/haystac_anomaly_data1/saved_agent_episodes_new_chand_swapped/{sub_dir}'
import os
os.makedirs(save_dir, exist_ok=True)

In [43]:
# Combinedly store the observations and actions as 1 agent episode in the buffer npz format
for agent_id in agents:
    obs = observations[agent_id]
    act = actions[agent_id]
    att_mask = attention_masks[agent_id]
    np.savez(f'{save_dir}/agent_{agent_id}.npz', obs=obs, act=act, att_mask=att_mask)


In [44]:
# read the npz file
# Load the observations and actions
agent_id = 1
data_folder = f'{save_dir}/agent_{agent_id}.npz'
loaded_data = np.load(data_folder)
obs = loaded_data['obs']
act = loaded_data['act']
att_mask = loaded_data['att_mask']

print(f"Agent ID: {agent_id}")
print(f"Observations: shape {obs.shape}, dtype {obs.dtype}")
print(obs)
print(f"Actions: shape {act.shape}, dtype {act.dtype}")
print(act)
print(f"Attention Masks: shape {att_mask.shape}, dtype {att_mask.dtype}")
print(att_mask)

Agent ID: 1
Observations: shape (99, 11), dtype float64
[[ 1.          1.          0.         ...  0.0312492  -0.68595467
   0.51897599]
 [ 1.          1.          0.         ...  0.0312492  -0.68595467
  -0.92767252]
 [ 1.          1.          0.         ...  0.0312492  -0.68595467
  -0.79615902]
 ...
 [ 1.          1.          4.         ...  0.25630346 -0.51817141
  -0.53313202]
 [ 1.          0.          5.         ...  0.22859156 -0.37833994
  -0.79615902]
 [ 1.          0.          5.         ... -0.01180029 -0.28965514
  -0.92767252]]
Actions: shape (99, 1), dtype int64
[[22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [22]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [

In [46]:
# # max and min for each dimension of the node features
# max_features = G[0].ndata['attr'].max(dim=0)[0]
# min_features = G[0].ndata['attr'].min(dim=0)[0]

In [47]:
# max_features

NameError: name 'max_features' is not defined

In [None]:
min_features

In [None]:
# pick 3 random words from word2vec and get their embeddings    
import random
random_words = random.sample(list(reduced_embeddings.keys()), 3)
print(random_words)
# get the embeddings
random_embeddings = [reduced_embeddings[word] for word in random_words]

# check the distance between all 3 embeddings using euclidean distance
#from scipy.spatial.distance import cosine
distances = []
for i in range(3):
    for j in range(i+1, 3):
        distances.append(np.linalg.norm(random_embeddings[i] - random_embeddings[j]))
        #distances.append(cosine(random_embeddings[i], random_embeddings[j]))
print(distances)



In [None]:
# check which word in the dictionary is closest to the list of random embeddings
min_dist = 100
min_word = ''
for rand_embedding in random_embeddings:
    print(rand_embedding)
for rand_embedding in random_embeddings:
    rand_embedding_tensor = torch.tensor(rand_embedding)
    for word in reduced_embeddings.keys():
        embedding_tensor = torch.tensor(reduced_embeddings[word])

        # use cosine similarity to get the distance between the embeddings
        dist = torch.nn.functional.cosine_similarity(embedding_tensor, rand_embedding_tensor, dim=0)
        if dist < min_dist:
            min_dist = dist
            min_word = word
    print(min_dist, min_word)


In [None]:
from annoy import AnnoyIndex

# Prepare data
keys = list(embedding_tensor.keys())
dim = len(embedding_tensor[keys[0]])
index = AnnoyIndex(dim, 'euclidean')

for i, key in enumerate(keys):
    index.add_item(i, embedding_tensor[key])

# Build the index
index.build(10)

# Perform search
nearest_idx = index.get_nns_by_vector(search_vector, 1)[0]
closest_key = keys[nearest_idx]

print("Closest Vector Key:", closest_key)

In [None]:
# Get the number of files in the saved directory
import os
sub_dir='obs4_act1'
save_dir = f'/data/home/umang/Trajectory_project/anomaly_traj_data/haystac_anomaly_data1/saved_agent_episodes/{sub_dir}'
len(os.listdir(save_dir))

In [None]:
# Read the saved npz files
# get the file names with minimum and maximum rows (Stop points) in the observations
import numpy as np
min_rows = float('inf')
max_rows = 0
min_file = ''
max_file = ''
for file in os.listdir(save_dir):
    data = np.load(f'{save_dir}/{file}')
    obs = data['obs']
    if obs.shape[0] < min_rows:
        min_rows = obs.shape[0]
        min_file = file
    if obs.shape[0] > max_rows:
        max_rows = obs.shape[0]
        max_file = file

In [None]:
# print the file names with minimum and maximum rows
print(f"File with minimum rows: {min_file}, rows: {min_rows}")
print(f"File with maximum rows: {max_file}, rows: {max_rows}")

In [48]:
sample_saved_npz_path = "/data/home/umang/Trajectory_project/anomaly_traj_data/haystac_anomaly_data1/saved_agent_episodes_new_chand/obs7_act1/agent_100000.npz"
# Load the sample npz file
sample_data = np.load(sample_saved_npz_path)
sample_obs = sample_data['obs']
sample_act = sample_data['act']
sample_att_mask = sample_data['att_mask']

In [53]:
sample_obs

array([[ 1.00000000e+05,  0.00000000e+00,  5.00000000e+00,
         5.00000000e+00,  6.00000000e+00,  0.00000000e+00,
         8.00000000e+00,  7.00000000e+00, -1.12820037e+00,
         1.77360630e+00, -6.64645520e-01],
       [ 1.00000000e+05,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  7.00000000e+00,  5.40000000e+01,
         1.60000000e+01,  5.50000000e+01, -5.71557088e-01,
         1.23022855e+00,  2.55948988e-01],
       [ 1.00000000e+05,  1.00000000e+00,  1.00000000e+00,
         1.00000000e+00,  1.30000000e+01,  3.60000000e+01,
         1.40000000e+01,  2.50000000e+01, -5.71557088e-01,
         1.23022855e+00, -9.27672522e-01],
       [ 1.00000000e+05,  1.00000000e+00,  2.00000000e+00,
         2.00000000e+00,  7.00000000e+00,  1.10000000e+01,
         1.60000000e+01,  2.80000000e+01, -5.71557088e-01,
         1.23022855e+00,  2.55948988e-01],
       [ 1.00000000e+05,  1.00000000e+00,  3.00000000e+00,
         3.00000000e+00,  1.30000000e+01,  4.90000000e+01,
  

In [52]:
sample_act

array([[ 3],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [16],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [22],
       [20]])