In [4]:
import pandas as pd
# import dgl
import torch
import numpy as np
import math
# Encode categorical features
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [5]:
data_folder = '/data/home/umang/Trajectory_project/anomaly_traj_data/numosim/'
train_file_name = 'stay_points_train.parquet'
poi_filename = 'poi.parquet'
train_df = pd.read_parquet(f'{data_folder}/{train_file_name}')
poi_df = pd.read_parquet(f'{data_folder}/{poi_filename}')
train_poi_df = pd.merge(train_df, poi_df, on='poi_id')

In [6]:
# Convert all column names to lowercase
train_df.columns = train_df.columns.str.lower()

# Display the DataFrame to verify the changes
train_df.head()

Unnamed: 0,agent_id,poi_id,start_datetime,end_datetime
0,1,1518791,2024-01-01 00:00:00-08:00,2024-01-01 11:36:59-08:00
1,1,1912553,2024-01-01 11:51:32-08:00,2024-01-01 12:33:47-08:00
2,1,1518791,2024-01-01 12:48:23-08:00,2024-01-01 13:06:35-08:00
3,1,103611,2024-01-01 13:22:50-08:00,2024-01-01 14:05:50-08:00
4,1,1518791,2024-01-01 14:21:28-08:00,2024-01-01 15:22:44-08:00


In [7]:
train_df

Unnamed: 0,agent_id,poi_id,start_datetime,end_datetime
0,1,1518791,2024-01-01 00:00:00-08:00,2024-01-01 11:36:59-08:00
1,1,1912553,2024-01-01 11:51:32-08:00,2024-01-01 12:33:47-08:00
2,1,1518791,2024-01-01 12:48:23-08:00,2024-01-01 13:06:35-08:00
3,1,103611,2024-01-01 13:22:50-08:00,2024-01-01 14:05:50-08:00
4,1,1518791,2024-01-01 14:21:28-08:00,2024-01-01 15:22:44-08:00
...,...,...,...,...
17261433,200000,329705,2024-01-27 08:24:08-08:00,2024-01-27 09:24:42-08:00
17261434,200000,408965,2024-01-27 09:38:34-08:00,2024-01-27 10:19:04-08:00
17261435,200000,402449,2024-01-27 10:30:15-08:00,2024-01-27 13:07:01-08:00
17261436,200000,337944,2024-01-27 13:21:36-08:00,2024-01-27 14:09:46-08:00


In [8]:
train_poi_df.head()
# get the number of unique values in each column
cat_cols = ['latitude', 'longitude', 'poi_id', 'name']
cat_dims = [train_poi_df[col].nunique() for col in cat_cols]

In [9]:
cat_dims

[686406, 686408, 800485, 237362]

In [10]:
#Create a unique node index for each visit
train_poi_df = train_poi_df.reset_index(drop=True)
train_poi_df['node_idx'] = train_poi_df.index

In [11]:
train_poi_df

Unnamed: 0,agent_id,poi_id,start_datetime,end_datetime,name,latitude,longitude,act_types,node_idx
0,1,1518791,2024-01-01 00:00:00-08:00,2024-01-01 11:36:59-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",0
1,1,1518791,2024-01-01 12:48:23-08:00,2024-01-01 13:06:35-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",1
2,1,1518791,2024-01-01 14:21:28-08:00,2024-01-01 15:22:44-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",2
3,1,1518791,2024-01-01 17:32:54-08:00,2024-01-02 09:31:12-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",3
4,1,1518791,2024-01-02 13:51:05-08:00,2024-01-03 11:25:38-08:00,residence,34.041928,-118.338327,"[1, 11, 15]",4
...,...,...,...,...,...,...,...,...,...
17261433,200000,2885338,2024-01-24 16:22:26-08:00,2024-01-24 19:28:16-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261433
17261434,200000,2885338,2024-01-24 21:06:29-08:00,2024-01-25 06:27:41-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261434
17261435,200000,2885338,2024-01-25 15:18:47-08:00,2024-01-26 06:53:26-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261435
17261436,200000,2885338,2024-01-26 17:56:49-08:00,2024-01-27 08:08:48-08:00,residence,33.790039,-117.846742,"[1, 11, 15]",17261436


In [12]:
#Print corresponding names for each unique poi_id
poi_names = train_poi_df[['poi_id', 'name']].drop_duplicates()
poi_names



Unnamed: 0,poi_id,name
0,1518791,residence
1752,1912553,residence
2023,103611,Chevron
2324,247124,Roving Karaoke Studio
2418,251512,Dixie Hollywood
...,...,...
17261289,393516,Hastings Oaks Club House
17261290,858075,residence
17261355,156776,Petrozone gas
17261356,758742,residence


In [13]:
# get the number of poi_ids for each unique 'name'
poi_name_counts = train_poi_df.groupby('name')['poi_id'].nunique()

# print the number of poi_ids for each unique 'name' descending
poi_name_counts = poi_name_counts.sort_values(ascending=False)
poi_name_counts

name
residence                                514177
Starbucks                                   564
7-Eleven                                    528
Metabank                                    493
ATM                                         461
                                          ...  
Garden Caf at Pilgrim Place                   1
Garden Cafe at Norton Simon Museum            1
Garden Center                                 1
Garden Chapel (Faith Lutheran Church)         1
                                           1
Name: poi_id, Length: 237362, dtype: int64

In [14]:
# Take differnt entries with name "Starbucks" which have different poi_ids
x= train_poi_df.groupby('name').get_group('Starbucks')

# get unique poi_ids for x and print corresponding entries
x.groupby('poi_id').head(1)

Unnamed: 0,agent_id,poi_id,start_datetime,end_datetime,name,latitude,longitude,act_types,node_idx
10759,11,324157,2024-01-14 20:06:10-08:00,2024-01-14 20:56:53-08:00,Starbucks,34.150546,-118.073148,"[2, 7]",10759
65179,110,325737,2024-01-02 09:10:46-08:00,2024-01-02 10:24:06-08:00,Starbucks,34.125039,-118.057616,"[2, 7]",65179
93927,163,149511,2024-01-23 11:01:51-08:00,2024-01-23 13:23:03-08:00,Starbucks,33.987014,-118.225554,"[2, 7]",93927
109948,193,209026,2024-01-07 18:18:11-08:00,2024-01-07 20:18:19-08:00,Starbucks,34.308749,-118.431488,"[2, 7]",109948
214880,386,356742,2024-01-05 18:16:52-08:00,2024-01-05 19:35:53-08:00,Starbucks,33.902705,-118.055044,"[2, 7]",214880
...,...,...,...,...,...,...,...,...,...
17022681,193678,22641,2024-01-05 11:57:35-08:00,2024-01-05 13:24:42-08:00,Starbucks,33.759034,-117.989295,"[2, 7]",17022681
17034805,193996,276994,2024-01-03 13:06:37-08:00,2024-01-03 13:11:37-08:00,Starbucks,34.234422,-118.255933,"[2, 7]",17034805
17116313,196159,395865,2024-01-03 18:33:24-08:00,2024-01-03 18:38:24-08:00,Starbucks,34.170039,-118.111773,"[2, 7]",17116313
17126662,196434,418176,2024-01-01 16:29:41-08:00,2024-01-01 17:32:12-08:00,Starbucks,34.581061,-118.141162,"[2, 7]",17126662


In [15]:
#get all unique act_Types for train_poi_df
act_types = train_poi_df['act_types']
# Flatten and get unique values
unique_act_types = set(tuple(sorted(sublist)) for sublist in act_types)
unique_act_types

{(0, 2, 9, 15),
 (0, 2, 15),
 (1, 11, 15),
 (2,),
 (2, 3, 15),
 (2, 4, 15),
 (2, 5),
 (2, 5, 7),
 (2, 5, 7, 15),
 (2, 6, 14, 15),
 (2, 6, 15),
 (2, 7),
 (2, 7, 10, 15),
 (2, 8, 15),
 (2, 9),
 (2, 9, 10),
 (2, 9, 15),
 (2, 10, 15),
 (2, 12, 15),
 (2, 14, 15),
 (2, 15),
 (7,),
 (9, 10, 15),
 (9, 15),
 (10,),
 (13,),
 (14,),
 (14, 15)}

In [16]:
# For each unique_act_types, get possible unique name 
act_type_name = {}
for act_type in unique_act_types:
    act_type_name[act_type] = train_poi_df[train_poi_df['act_types'].apply(lambda x: set(x) == set(act_type))]['name'].unique()
act_type_name


KeyboardInterrupt: 

In [17]:
# #- Transportation = 0
# - 1 = Home
# - 2 = Work
# - 3 = School
# - 4 = ChildCare
# - 5 = BuyGoods
# - 6 = Services
# - 7 = EatOut
# - 8 = Errands
# - 9 = Recreation
# - 10 = Exercise
# - 11 = Visit
# - 12 = HealthCare
# - 13 = Religious
# - 14 = SomethingElse
# - 15 = DropOff

# Each of the element in 1 act type is one of these numbers. Each number has a corresponding category name.
# For example, 1 corresponds to 'Home', 2 corresponds to 'Work' and so on.

# Map unique act types to corresponding category names
list_of_categories = ['Transportation', 'Home', 'Work', 'School', 'ChildCare', 'BuyGoods', 'Services', 'EatOut', 'Errands', 'Recreation', 'Exercise', 'Visit', 'HealthCare', 'Religious', 'SomethingElse', 'DropOff']
# Instead of numbers replace it with cateogry names for unique act types
act_type_category = {}

for act_type in unique_act_types:
    act_type_category[act_type] = [list_of_categories[i] for i in act_type]

# 
act_type_category

{(2, 4, 15): ['Work', 'ChildCare', 'DropOff'],
 (0, 2, 9, 15): ['Transportation', 'Work', 'Recreation', 'DropOff'],
 (2,): ['Work'],
 (2, 7, 10, 15): ['Work', 'EatOut', 'Exercise', 'DropOff'],
 (2, 8, 15): ['Work', 'Errands', 'DropOff'],
 (14,): ['SomethingElse'],
 (2, 10, 15): ['Work', 'Exercise', 'DropOff'],
 (2, 5, 7): ['Work', 'BuyGoods', 'EatOut'],
 (2, 5): ['Work', 'BuyGoods'],
 (2, 12, 15): ['Work', 'HealthCare', 'DropOff'],
 (2, 3, 15): ['Work', 'School', 'DropOff'],
 (2, 14, 15): ['Work', 'SomethingElse', 'DropOff'],
 (9, 10, 15): ['Recreation', 'Exercise', 'DropOff'],
 (2, 9, 10): ['Work', 'Recreation', 'Exercise'],
 (2, 6, 14, 15): ['Work', 'Services', 'SomethingElse', 'DropOff'],
 (7,): ['EatOut'],
 (2, 5, 7, 15): ['Work', 'BuyGoods', 'EatOut', 'DropOff'],
 (10,): ['Exercise'],
 (13,): ['Religious'],
 (14, 15): ['SomethingElse', 'DropOff'],
 (2, 7): ['Work', 'EatOut'],
 (0, 2, 15): ['Transportation', 'Work', 'DropOff'],
 (1, 11, 15): ['Home', 'Visit', 'DropOff'],
 (2, 9, 15

In [18]:
# For the corresponding act type name, fetch the corresponding category name from act_type_category
# display the number of associated names with each category

for act_type in unique_act_types:
    print("act type category: ", act_type_category[act_type])
    print("number of associated poi names: ", len(act_type_name[act_type]))

act type category:  ['Work', 'ChildCare', 'DropOff']
number of associated poi names:  13977
act type category:  ['Transportation', 'Work', 'Recreation', 'DropOff']
number of associated poi names:  272
act type category:  ['Work']
number of associated poi names:  7504
act type category:  ['Work', 'EatOut', 'Exercise', 'DropOff']
number of associated poi names:  15606
act type category:  ['Work', 'Errands', 'DropOff']
number of associated poi names:  4171
act type category:  ['SomethingElse']
number of associated poi names:  170
act type category:  ['Work', 'Exercise', 'DropOff']


KeyError: (2, 10, 15)

In [19]:
# Create a one hot encoding for act types

In [20]:
# Get the number of unique poi_id for name 'residence'
poi_names[poi_names['name'] == 'residence']
# PRINT THE NUMBER OF UNIQUE POI_ID FOR NAME 'RESIDENCE'
poi_names[poi_names['name'] == 'residence'].shape[0]

514177

## whole dataset level encoding

In [21]:
#import gensim.downloader as api
# Download and load the model
# load a 100 dimension word2vec model 
#word2vec_model = api.load("glove-wiki-gigaword-100")

In [None]:
# Tokenize the poi names
tokenized_names = [name.lower().split() for name in train_poi_df['name']]

In [None]:
# train a 32 dimension word2vec model on train_poi_df['name']
# train the model on the tokenized names
from gensim.models import Word2Vec

# if name_word2vec.model exists load it
try:
    model = Word2Vec.load("name_word2vec.model")
    print("Loaded model")
except:      
    # Train Word2Vec model
    model = Word2Vec(
        tokenized_names,
        vector_size=32,  # Dimensionality of word vectors
        window=3,        # ontext window size
        min_count=1,     # Minimum count of occurrences for words
        workers=4,       # Number of threads for training
        sg=1             # Skip-gram model (set sg=0 for CBOW)
    )


    model.save("name_word2vec.model")

### Reduce Dimension of word2vec from 300 to 32

In [None]:
# from sklearn.decomposition import PCA
# import numpy as np

# # Extract word vectors from the Word2Vec model
# word_vectors = word2vec_model.vectors

# # Fit PCA to reduce dimensions to 32
# pca = PCA(n_components=32)
# reduced_word_vectors = pca.fit_transform(word_vectors)

# # Create a dictionary to map words to their reduced embeddings
# reduced_embeddings = {word: reduced_word_vectors[idx] for word, idx in word2vec_model.key_to_index.items()}

In [None]:
# import os

# save_dir='./saved_embeddings'
# # Create the directory if it does not exist
# if not os.path.exists(save_dir):
#    os.makedirs(save_dir)

# # save the reduced dimension dictionary
# import pickle
# with open(f'{save_dir}/reduced_embeddings.pkl', 'wb') as f:
#    pickle.dump(reduced_embeddings, f)

In [None]:
# # load the reduced embeddings pkl file
# import pickle
# save_dir='./saved_embeddings'
# with open(f'{save_dir}/reduced_embeddings.pkl', 'rb') as f:
#     reduced_embeddings = pickle.load(f)

In [None]:
# use the reduced embeddings to get the embedding for each name
def get_embedding_reduced(name):
    tokens = name.lower().split()
    embeddings = []
    for token in tokens:
        if token in reduced_embeddings:
            embeddings.append(reduced_embeddings[token])
        else:
            # Handle unknown tokens (e.g., use a zero vector or random vector)
            embeddings.append(np.zeros(32))
    # Average embeddings if multiple tokens
    embedding = np.mean(embeddings, axis=0)
    return embedding

In [None]:
import numpy as np
from gensim.models import Word2Vec

# Load the Word2Vec model
word2vec_model = Word2Vec.load("name_word2vec.model")

# Create a dictionary to map words to their embeddings
reduced_embeddings = {word: word2vec_model.wv[word] for word in word2vec_model.wv.index_to_key}

def get_embedding(name):
    tokens = name.lower().split()
    embeddings = []
    for token in tokens:
        if token in word2vec_model.wv:
            embeddings.append(word2vec_model.wv[token])
        else:
            # Handle unknown tokens (e.g., use a zero vector or random vector)
            embeddings.append(np.zeros(word2vec_model.vector_size))
    # Average embeddings if multiple tokens
    embedding = np.mean(embeddings, axis=0)
    return embedding

# Example usage
name = "residence"
embedding = get_embedding(name)
print(embedding)

In [None]:
name_encoder = LabelEncoder()
train_poi_df['name_encoded'] = name_encoder.fit_transform(train_poi_df['name'])

agent_encoder = LabelEncoder()
train_poi_df['agent_id_encoded'] = agent_encoder.fit_transform(train_poi_df['agent_id'])

poi_encoder = LabelEncoder()
train_poi_df['poi_id_encoded'] = poi_encoder.fit_transform(train_poi_df['poi_id'])

In [None]:
train_poi_df

In [22]:
train_poi_df['duration'] = (train_poi_df['end_datetime'] - train_poi_df['start_datetime']).dt.total_seconds()
# Convert duration from seconds to hours
train_poi_df['duration_hour'] = (train_poi_df['duration'] / 3600).astype(int)
lat_mean = train_poi_df['latitude'].mean()
lat_std = train_poi_df['latitude'].std()
lon_mean = train_poi_df['longitude'].mean()
lon_std = train_poi_df['longitude'].std()
duration_mean = train_poi_df['duration_hour'].mean()
duration_std = train_poi_df['duration_hour'].std()  

In [23]:
#data = train_poi_df[:1000].copy()
data=train_poi_df.copy()

In [24]:
# apply get embedding function to the name column
data['name_embedding'] = data['name'].apply(lambda x: get_embedding(x))

NameError: name 'get_embedding' is not defined

In [None]:
data.head()

In [None]:

# # Encoding 'act_types' using multi-hot encoding
# unique_act_types = set()
# for acts in data['act_types']:
#     unique_act_types.update(acts)
# unique_act_types = list(unique_act_types)
# act_type_to_idx = {act_type: idx for idx, act_type in enumerate(unique_act_types)}
# act_type_features = torch.zeros((len(data), len(unique_act_types)))
# for idx, acts in enumerate(data['act_types']):
#     for act in acts:
#         act_idx = act_type_to_idx[act]
#         act_type_features[idx, act_idx] = 1  # Multi-hot encoding


### save agent level graph

In [None]:
# We'll extract hour of day and day of week, and use cyclical encoding for hours
def encode_time(dt_series):
    # hours = dt_series.dt.hour + dt_series.dt.minute / 60.0
    hours = dt_series.dt.hour
    hours_norm = hours / 24.0 * 2 * math.pi  # Normalize to [0, 2pi]
    hours_sin = np.sin(hours_norm)
    hours_cos = np.cos(hours_norm)
    
    day_of_week = dt_series.dt.dayofweek  # Monday=0, Sunday=6
    day_of_week_onehot = np.eye(7)[day_of_week]
    day_norm = 2 * math.pi * day_of_week / 7.0
    day_sin = np.sin(day_norm)
    day_cos = np.cos(day_norm)
    return hours_sin, hours_cos, day_sin, day_cos

def get_features_per_agent(agent_df, lat_mean, lat_std, lon_mean, lon_std, duration_mean, duration_std):
    agent_df['latitude_norm'] = (agent_df['latitude'] - lat_mean) / lat_std
    agent_df['longitude_norm'] = (agent_df['longitude'] - lon_mean) / lon_std
    # agent_df['duration_norm'] = (agent_df['duration_hour'] - duration_mean) / duration_std

    # Encode start times
    start_hours_sin, start_hours_cos, day_sin, day_cos = encode_time(agent_df['start_datetime'])
    # Encode end times
    end_hours_sin, end_hours_cos, ay_sin, day_cos = encode_time(agent_df['end_datetime'])
    start_time_features = torch.tensor(
        np.column_stack([start_hours_sin, start_hours_cos]),
        dtype=torch.float32
    )
    end_time_features = torch.tensor(
        np.column_stack([end_hours_sin, end_hours_cos]),
        dtype=torch.float32
    )

    # Prepare node features
    # Numerical features
    latitude = torch.tensor(agent_df['latitude_norm'].values, dtype=torch.float32).unsqueeze(1)
    longitude = torch.tensor(agent_df['longitude_norm'].values, dtype=torch.float32).unsqueeze(1)
    duration = torch.tensor(agent_df['duration_hour'].values, dtype=torch.float32).unsqueeze(1)

    # Categorical features as embeddings (here we'll just use the encoded values directly)
    name_encoded = torch.tensor(agent_df['name_encoded'].values, dtype=torch.float32)
    # name_embedding = torch.tensor(agent_df['name_embedding'], dtype=torch.float32)
    name_embedding_values = agent_df['name_embedding'].apply(lambda x: np.array(x, dtype=np.float32) if isinstance(x, list) else x).values
    name_embedding = torch.tensor(np.stack(name_embedding_values), dtype=torch.float32)

    agent_id_encoded = torch.tensor(agent_df['agent_id_encoded'].values, dtype=torch.float32)
    poi_id_encoded = torch.tensor(agent_df['poi_id_encoded'].values, dtype=torch.float32)
    # Combine all node features
    agent_node_features = torch.cat([
        latitude,
        longitude,
        name_embedding,
        poi_id_encoded.unsqueeze(1).float(),
        start_time_features,
        duration,
    ], dim=1)

    return agent_node_features

In [None]:
import os
save_folder_name = 'mtm_all_features_stop_point_graphs_test'
save_dir = f'/data/home/umang/Trajectory_project/anomaly_traj_data/haystac_anomaly_data1/saved_graphs/{save_folder_name}'

if not os.path.exists(save_dir):
    os.makedirs(save_dir)
# save graph agent by agent
agents = data['agent_id'].unique()

# for agent_id in [1]:
for agent_id in tqdm(agents):
    edge_src = []
    edge_dst = []
    edge_time_diff = []
    agent_data = data[data['agent_id'] == agent_id].sort_values('start_datetime')
    
    # Get all features first
    agent_node_features = get_features_per_agent(agent_data, lat_mean, lat_std, lon_mean, lon_std, duration_mean, duration_std)
    
    # Create a mapping of feature tuple to node index
    feature_to_node_idx = {}
    node_indices = []
    current_node_idx = 0
    
    # First pass: assign node indices based on unique features
    for idx, features in enumerate(agent_node_features):
        # Convert features to tuple for hashability
        feature_tuple = tuple(features.tolist())
        if feature_tuple not in feature_to_node_idx:
            feature_to_node_idx[feature_tuple] = current_node_idx
            current_node_idx += 1
        node_indices.append(feature_to_node_idx[feature_tuple])
    
    # Create edges between consecutive visits
    start_times = agent_data['start_datetime'].tolist()
    for i in range(len(node_indices) - 1):
        src_idx = node_indices[i]
        dst_idx = node_indices[i + 1]
        time_diff = (start_times[i + 1] - start_times[i]).total_seconds()
        edge_src.append(src_idx)
        edge_dst.append(dst_idx)
        edge_time_diff.append(time_diff)

    # Get unique features for the graph
    unique_features = []
    seen_features = set()
    for features in agent_node_features:
        feature_tuple = tuple(features.tolist())
        if feature_tuple not in seen_features:
            seen_features.add(feature_tuple)
            unique_features.append(features)
    unique_features = torch.stack(unique_features)
    
    G = dgl.graph((edge_src, edge_dst), num_nodes=len(feature_to_node_idx))
    edge_time_diff_tensor = torch.tensor(edge_time_diff, dtype=torch.float32).unsqueeze(1)
    G.edata['time_diff'] = edge_time_diff_tensor
    G.ndata['attr'] = unique_features

    # save graph as dgl graph
    dgl.save_graphs(f'{save_dir}/agent_{agent_id}.dgl', G)

In [None]:
# read one graph
data_path = '/data/home/umang/Trajectory_project/anomaly_traj_data/haystac_anomaly_data1/saved_graphs'
data_name = 'mtm_all_features_stop_point_graphs_test'
agent_id = 1
data_folder = f'{data_path}/{data_name}/agent_{agent_id}.dgl'
G, _ = dgl.load_graphs(f'{data_folder}')
G[0].ndata['attr'].shape

In [None]:
G[0]

In [None]:
 #Steps TODO:

# 1. Load the graph.
# 2. Treat the graph as a trajectory of stop points.
# 3. Distinguish features as part of "observations" and "actions".
# 4. Store the agent movement as observations and actions in the buffer npz format.

In [None]:
# get all column names in train_poi_df
train_poi_df.columns
print(train_poi_df['agent_id'].nunique())  

## MAKING MTM DATA

In [None]:
# put agent id, poi id, latitude, longitude in observations

# get all the unique agent ids
unique_agent_ids = train_poi_df['agent_id'].unique()
# for each unique agent id, get all corresponding poi ids, names, latitudes, longitudes and store it in the format of episode[observations]=shape (1001, 4), dtype float32

# Initialize the observation dictionary
observations = {}

# Get unique agents
agents = data['agent_id'].unique()

# Iterate over each agent
for agent_id in agents:
    agent_data = data[data['agent_id'] == agent_id]
    agent_id_append = np.full((len(agent_data), 1), agent_id)
    # Extract POI IDs, names, latitudes, and longitudes
    #poi_ids = agent_data['poi_id'].values
    latitudes = agent_data['latitude'].values
    longitudes = agent_data['longitude'].values
    
    # get start and end datetimes
    start_datetimes = agent_data['start_datetime'].values
    end_datetimes = agent_data['end_datetime'].values

    # get if weekday or weekend
    is_weekday = agent_data['start_datetime'].dt.dayofweek < 5
    is_weekday = is_weekday.astype(int).values.reshape(-1, 1)

    # get the day of the week, time of day, minute of the hour for start and end datetimes
    start_hours = agent_data['start_datetime'].dt.hour.values.reshape(-1, 1)
    start_minutes = agent_data['start_datetime'].dt.minute.values.reshape(-1, 1)
    end_hours = agent_data['end_datetime'].dt.hour.values.reshape(-1, 1)
    end_minutes = agent_data['end_datetime'].dt.minute.values.reshape(-1, 1)
    start_days = agent_data['start_datetime'].dt.dayofweek.values.reshape(-1, 1)
    end_days = agent_data['end_datetime'].dt.dayofweek.values.reshape(-1, 1)

    # Combine the extracted data into a single array
    combined_data = np.column_stack((agent_id_append, is_weekday, start_days, end_days,
                                      start_hours, start_minutes, end_hours, end_minutes))
        
    # Store the combined data in the observations dictionary
    observations[agent_id] = combined_data

# Print the observations
# for agent_id, obs in observations.items():
#     print(f"Agent ID: {agent_id}")
#     print(f"Observations: shape {obs.shape}, dtype {obs.dtype}")
#     print(obs)

In [None]:
# Initialize the actions dictionary
actions = {}
# for each unique agent id, get all corresponding poi ids, names, latitudes, longitudes and store it in the format of episode[observations]=shape (1001, 4), dtype float32

# Get unique agents
agents = data['agent_id'].unique()

# Iterate over each agent
for agent_id in agents:
    agent_data = data[data['agent_id'] == agent_id]
    
    # get activity types for actions
    actions[agent_id] = agent_data['act_types'].apply(lambda x: np.array(x, dtype=np.float32)).values

# Print the actions
for agent_id, act in actions.items():
    print(f"Agent ID: {agent_id}")
    print(f"Actions: shape {act.shape}, dtype {act.dtype}")
    print(act)


In [None]:
# Get the length of the longest trajectory
max_length = max(len(obs) for obs in observations.values())
max_length

In [None]:
# Pad the observations and actions to the maximum length, for the padded regions, use zeros and create attention masks for padded regions
# Initialize dictionaries to store padded observations and actions
padded_observations = {}
padded_actions = {}

# Initialize dictionaries to store attention masks
attention_masks = {}

# Iterate over each agent
for agent_id in agents:
    # Get the observations and actions for the current agent
    obs = observations[agent_id]
    act = actions[agent_id]
    
    # Get the length of the current trajectory
    length = len(obs)
    
    # Pad the observations and actions to the maximum length
    padded_obs = np.zeros((max_length, obs.shape[1]), dtype=np.float32)
    padded_act = np.zeros((max_length, act.shape[1]), dtype=np.float32)
    
    # Create an attention mask for the padded regions
    mask = np.zeros(max_length, dtype=np.float32)
    
    # Fill the padded observations and actions
    padded_obs[:length] = obs
    padded_act[:length] = act
    mask[:length] = 1.0
    
    # Store the padded observations, actions, and attention masks
    padded_observations[agent_id] = padded_obs
    padded_actions[agent_id] = padded_act
    attention_masks[agent_id] = mask


In [None]:
# CHECK padded observations and actions and attention masks are all same shapes for all agents
# for agent_id, obs in padded_observations.items():
#     print(f"Agent ID: {agent_id}")
#     print(f"Padded Observations: shape {obs.shape}, dtype {obs.dtype}")
#     print(f"Padded Actions: shape {act.shape}, dtype {act.dtype}")
#     print(f"Attention Mask: shape {mask.shape}, dtype {mask.dtype}")

In [None]:
attention_masks[1]

In [None]:
# save the observations and actions in the buffer npz format
# Save the observations and actions
sub_dir='obs7_act1'
save_dir = f'/data/home/umang/Trajectory_project/anomaly_traj_data/haystac_anomaly_data1/saved_agent_episodes_new/{sub_dir}'
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Combinedly store the observations and actions as 1 agent episode in the buffer npz format
for agent_id in agents:
    obs = observations[agent_id]
    act = actions[agent_id]
    att_mask = attention_masks[agent_id]
    np.savez(f'{save_dir}/agent_{agent_id}.npz', obs=obs, act=act, att_mask=att_mask)


In [None]:
# read the npz file
# Load the observations and actions
agent_id = 1
data_folder = f'{save_dir}/agent_{agent_id}.npz'
loaded_data = np.load(data_folder)
obs = loaded_data['obs']
act = loaded_data['act']
att_mask = loaded_data['att_mask']

print(f"Agent ID: {agent_id}")
print(f"Observations: shape {obs.shape}, dtype {obs.dtype}")
print(obs)
print(f"Actions: shape {act.shape}, dtype {act.dtype}")
print(act)
print(f"Attention Masks: shape {att_mask.shape}, dtype {att_mask.dtype}")
print(att_mask)

In [None]:
# max and min for each dimension of the node features
max_features = G[0].ndata['attr'].max(dim=0)[0]
min_features = G[0].ndata['attr'].min(dim=0)[0]

In [None]:
max_features

In [None]:
min_features

In [None]:
# pick 3 random words from word2vec and get their embeddings    
import random
random_words = random.sample(list(reduced_embeddings.keys()), 3)
print(random_words)
# get the embeddings
random_embeddings = [reduced_embeddings[word] for word in random_words]

# check the distance between all 3 embeddings using euclidean distance
#from scipy.spatial.distance import cosine
distances = []
for i in range(3):
    for j in range(i+1, 3):
        distances.append(np.linalg.norm(random_embeddings[i] - random_embeddings[j]))
        #distances.append(cosine(random_embeddings[i], random_embeddings[j]))
print(distances)



In [None]:
# check which word in the dictionary is closest to the list of random embeddings
min_dist = 100
min_word = ''
for rand_embedding in random_embeddings:
    print(rand_embedding)
for rand_embedding in random_embeddings:
    rand_embedding_tensor = torch.tensor(rand_embedding)
    for word in reduced_embeddings.keys():
        embedding_tensor = torch.tensor(reduced_embeddings[word])

        # use cosine similarity to get the distance between the embeddings
        dist = torch.nn.functional.cosine_similarity(embedding_tensor, rand_embedding_tensor, dim=0)
        if dist < min_dist:
            min_dist = dist
            min_word = word
    print(min_dist, min_word)


In [None]:
from annoy import AnnoyIndex

# Prepare data
keys = list(embedding_tensor.keys())
dim = len(embedding_tensor[keys[0]])
index = AnnoyIndex(dim, 'euclidean')

for i, key in enumerate(keys):
    index.add_item(i, embedding_tensor[key])

# Build the index
index.build(10)

# Perform search
nearest_idx = index.get_nns_by_vector(search_vector, 1)[0]
closest_key = keys[nearest_idx]

print("Closest Vector Key:", closest_key)

In [None]:
# Get the number of files in the saved directory
import os
sub_dir='obs4_act1'
save_dir = f'/data/home/umang/Trajectory_project/anomaly_traj_data/haystac_anomaly_data1/saved_agent_episodes/{sub_dir}'
len(os.listdir(save_dir))

In [None]:
# Read the saved npz files
# get the file names with minimum and maximum rows (Stop points) in the observations
import numpy as np
min_rows = float('inf')
max_rows = 0
min_file = ''
max_file = ''
for file in os.listdir(save_dir):
    data = np.load(f'{save_dir}/{file}')
    obs = data['obs']
    if obs.shape[0] < min_rows:
        min_rows = obs.shape[0]
        min_file = file
    if obs.shape[0] > max_rows:
        max_rows = obs.shape[0]
        max_file = file

In [None]:
# print the file names with minimum and maximum rows
print(f"File with minimum rows: {min_file}, rows: {min_rows}")
print(f"File with maximum rows: {max_file}, rows: {max_rows}")