In [1]:
# Model name; it forms the first part of the results filename read later in the notebook.
base_model = "Llama-2-7b-longlora-32k-merged"
# Dataset identifier handed to `load_dataset`; the text after its last "/" is also
# used to build the results filename.
dataset_id = "w11wo/FourSquare-NYC-POI"

In [2]:
from datasets import load_dataset

# Load the dataset named by `dataset_id`; its splits are accessed by name further down.
dataset = load_dataset(path=dataset_id)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import re

# Materialise the "train" and "test" splits as pandas DataFrames.
train_df, test_df = (dataset[split].to_pandas() for split in ("train", "test"))

def get_user_id(x):
    """Return the user id embedded at the start of a trajectory prompt.

    Args:
        x: Prompt text expected to begin with
            ``"The following data is a trajectory of user <digits>:"``.

    Returns:
        The digits of the user id, as a string.

    Raises:
        ValueError: If ``x`` does not start with the expected header.
    """
    match = re.match(r"The following data is a trajectory of user (\d+):.*", x)
    if match is None:
        # Report the offending text rather than failing with an opaque
        # AttributeError from calling .group() on None.
        raise ValueError(f"cannot extract user id from prompt: {x[:80]!r}")
    return match.group(1)
# Tag every example in both splits with the user id parsed from its prompt.
for split_df in (train_df, test_df):
    split_df["user"] = split_df["inputs"].apply(get_user_id)

In [4]:
# Count training trajectories per user, most frequent first.
user_trajectories = train_df["user"].value_counts().sort_values(ascending=False)

# Bucket users into three activity tiers by trajectory count:
#   very_active - the n users with the most trajectories
#   inactive    - the n users with the fewest trajectories
#   normal      - everyone else
# where n is one third of the number of users, rounded down.
n = len(user_trajectories) // 3
very_active_users = user_trajectories.nlargest(n).index
inactive_users = user_trajectories.nsmallest(n).index

# Start every row as "normal", then overwrite the rows belonging to the two outer tiers.
train_df["user_activity"] = "normal"
for tier, tier_users in (("very_active", very_active_users), ("inactive", inactive_users)):
    train_df.loc[train_df["user"].isin(tier_users), "user_activity"] = tier

In [5]:
# Display the training frame, which now carries the "user" and "user_activity" columns.
train_df

Unnamed: 0,system_prompt,inputs,targets,llama_prompt,user,user_activity
0,You are user 0 and your basic information is a...,The following data is a trajectory of user 0: ...,"At 2012-06-11 14:05:49, user 0 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 0 and your bas...,0,inactive
1,You are user 0 and your basic information is a...,The following data is a trajectory of user 0: ...,"At 2012-07-04 21:00:14, user 0 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 0 and your bas...,0,inactive
2,You are user 0 and your basic information is a...,The following data is a trajectory of user 0: ...,"At 2012-04-14 19:15:07, user 0 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 0 and your bas...,0,inactive
3,You are user 0 and your basic information is a...,The following data is a trajectory of user 0: ...,"At 2012-04-22 13:33:27, user 0 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 0 and your bas...,0,inactive
4,You are user 1 and your basic information is a...,The following data is a trajectory of user 1: ...,"At 2012-05-12 11:18:58, user 1 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 1 and your bas...,1,normal
...,...,...,...,...,...,...
11017,You are user 1046 and your basic information i...,The following data is a trajectory of user 104...,"At 2012-10-27 01:47:42, user 1046 will visit P...",<s>[INST] <<SYS>>\nYou are user 1046 and your ...,1046,very_active
11018,You are user 1046 and your basic information i...,The following data is a trajectory of user 104...,"At 2012-11-07 09:37:41, user 1046 will visit P...",<s>[INST] <<SYS>>\nYou are user 1046 and your ...,1046,very_active
11019,You are user 1046 and your basic information i...,The following data is a trajectory of user 104...,"At 2012-04-26 10:16:06, user 1046 will visit P...",<s>[INST] <<SYS>>\nYou are user 1046 and your ...,1046,very_active
11020,You are user 1046 and your basic information i...,The following data is a trajectory of user 104...,"At 2012-04-28 17:06:45, user 1046 will visit P...",<s>[INST] <<SYS>>\nYou are user 1046 and your ...,1046,very_active


In [6]:
# Lookup table from user id to the activity tier assigned on the training split.
user2activity = {
    user: activity
    for user, activity in zip(train_df["user"], train_df["user_activity"])
}

In [7]:
def _activity_of(user):
    """Return the activity tier recorded for `user`; raises KeyError if the user is unknown."""
    return user2activity[user]


# Test rows inherit the tier their user was given on the training split.
test_df["user_activity"] = test_df["user"].apply(_activity_of)
test_df

Unnamed: 0,system_prompt,inputs,targets,llama_prompt,user,user_activity
0,You are user 1 and your basic information is a...,The following data is a trajectory of user 1: ...,"At 2013-01-03 10:13:27, user 1 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 1 and your bas...,1,normal
1,You are user 1 and your basic information is a...,The following data is a trajectory of user 1: ...,"At 2013-01-10 10:54:51, user 1 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 1 and your bas...,1,normal
2,You are user 1 and your basic information is a...,The following data is a trajectory of user 1: ...,"At 2013-02-13 14:16:40, user 1 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 1 and your bas...,1,normal
3,You are user 5 and your basic information is a...,The following data is a trajectory of user 5: ...,"At 2013-02-03 07:58:50, user 5 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 5 and your bas...,5,very_active
4,You are user 6 and your basic information is a...,The following data is a trajectory of user 6: ...,"At 2013-01-13 09:12:35, user 6 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 6 and your bas...,6,inactive
...,...,...,...,...,...,...
1424,You are user 1038 and your basic information i...,The following data is a trajectory of user 103...,"At 2013-02-01 06:42:40, user 1038 will visit P...",<s>[INST] <<SYS>>\nYou are user 1038 and your ...,1038,very_active
1425,You are user 1040 and your basic information i...,The following data is a trajectory of user 104...,"At 2013-01-08 10:19:08, user 1040 will visit P...",<s>[INST] <<SYS>>\nYou are user 1040 and your ...,1040,normal
1426,You are user 1040 and your basic information i...,The following data is a trajectory of user 104...,"At 2013-01-22 08:51:41, user 1040 will visit P...",<s>[INST] <<SYS>>\nYou are user 1040 and your ...,1040,normal
1427,You are user 1046 and your basic information i...,The following data is a trajectory of user 104...,"At 2013-01-04 17:15:09, user 1046 will visit P...",<s>[INST] <<SYS>>\nYou are user 1046 and your ...,1046,very_active


In [8]:
import json

# Keep only the text after the last "/" for use in the results filename.
# NOTE: this rebinds `dataset_id`, so from here on it no longer carries the part
# before the slash that was passed to `load_dataset` earlier.
dataset_id = dataset_id.rsplit("/", 1)[-1]

results_path = f"../results/{base_model}-{dataset_id}.json"
with open(results_path) as f:
    results = json.load(f)

# Attach the stored predictions and targets to the test rows
# (assignment fails if their lengths differ from the frame's).
for column, key in (("predictions", "predictions"), ("labels", "targets")):
    test_df[column] = results[key]

In [9]:
# Display the test frame, which now carries the "predictions" and "labels" columns.
test_df

Unnamed: 0,system_prompt,inputs,targets,llama_prompt,user,user_activity,predictions,labels
0,You are user 1 and your basic information is a...,The following data is a trajectory of user 1: ...,"At 2013-01-03 10:13:27, user 1 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 1 and your bas...,1,normal,697,2297
1,You are user 1 and your basic information is a...,The following data is a trajectory of user 1: ...,"At 2013-01-10 10:54:51, user 1 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 1 and your bas...,1,normal,3544,3544
2,You are user 1 and your basic information is a...,The following data is a trajectory of user 1: ...,"At 2013-02-13 14:16:40, user 1 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 1 and your bas...,1,normal,589,367
3,You are user 5 and your basic information is a...,The following data is a trajectory of user 5: ...,"At 2013-02-03 07:58:50, user 5 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 5 and your bas...,5,very_active,4309,4309
4,You are user 6 and your basic information is a...,The following data is a trajectory of user 6: ...,"At 2013-01-13 09:12:35, user 6 will visit POI ...",<s>[INST] <<SYS>>\nYou are user 6 and your bas...,6,inactive,1513,1513
...,...,...,...,...,...,...,...,...
1424,You are user 1038 and your basic information i...,The following data is a trajectory of user 103...,"At 2013-02-01 06:42:40, user 1038 will visit P...",<s>[INST] <<SYS>>\nYou are user 1038 and your ...,1038,very_active,356,356
1425,You are user 1040 and your basic information i...,The following data is a trajectory of user 104...,"At 2013-01-08 10:19:08, user 1040 will visit P...",<s>[INST] <<SYS>>\nYou are user 1040 and your ...,1040,normal,692,1002
1426,You are user 1040 and your basic information i...,The following data is a trajectory of user 104...,"At 2013-01-22 08:51:41, user 1040 will visit P...",<s>[INST] <<SYS>>\nYou are user 1040 and your ...,1040,normal,488,488
1427,You are user 1046 and your basic information i...,The following data is a trajectory of user 104...,"At 2013-01-04 17:15:09, user 1046 will visit P...",<s>[INST] <<SYS>>\nYou are user 1046 and your ...,1046,very_active,4399,4061


In [10]:
from sklearn.metrics import accuracy_score

# Accuracy of the predictions within each user-activity tier.
accuracy = {}

for user_activity in ("inactive", "normal", "very_active"):
    in_tier = test_df["user_activity"] == user_activity
    _test_df = test_df[in_tier]
    y_true = _test_df["labels"].to_list()
    y_pred = _test_df["predictions"].to_list()
    accuracy[user_activity] = accuracy_score(y_true, y_pred)

In [11]:
# Show the accuracy computed for each user-activity tier.
accuracy

{'inactive': 0.20426829268292682,
 'normal': 0.27205882352941174,
 'very_active': 0.27020506634499397}

In [14]:
# Trajectory length = number of occurrences of "user <id> visited POI id" in the prompt.
visit_pattern = re.compile(r"user \d+ visited POI id")
test_df["trajectory_length"] = test_df["inputs"].apply(lambda prompt: len(visit_pattern.findall(prompt)))

# Bucket test trajectories by length into short / middle / long.
# The cut-offs come from the n largest and n smallest lengths (n = a third of the
# rows, rounded down), but membership is decided by length *value*: every row whose
# length equals one of those values is labelled, so ties at a cut-off pull in extra
# rows and the three buckets need not be equal-sized.
n = len(test_df) // 3
long_trajectories = test_df["trajectory_length"].nlargest(n).values
short_trajectories = test_df["trajectory_length"].nsmallest(n).values

# "short" is applied last, so it wins if a length value appears in both sets.
test_df["trajectory_type"] = "middle"
for bucket, lengths in (("long", long_trajectories), ("short", short_trajectories)):
    test_df.loc[test_df["trajectory_length"].isin(lengths), "trajectory_type"] = bucket

In [15]:
# Accuracy of the predictions within each trajectory-length bucket.
accuracy = {}

for trajectory_type in ("short", "middle", "long"):
    in_bucket = test_df["trajectory_type"] == trajectory_type
    _test_df = test_df[in_bucket]
    y_true = _test_df["labels"].to_list()
    y_pred = _test_df["predictions"].to_list()
    accuracy[trajectory_type] = accuracy_score(y_true, y_pred)

In [16]:
# Show the accuracy computed for each trajectory-length bucket.
accuracy

{'short': 0.19633943427620631,
 'middle': 0.26848249027237353,
 'long': 0.3117338003502627}