In [1]:
import json
from pathlib import Path

# --- Configuration ---------------------------------------------------------
# Dataset to process; it names the sub-directory used under both roots below.
dataset = "nyc"

# Root directories, given relative to the notebook's working directory.
input_path = Path("../../LLM4POI/datasets")
json_path = Path("../data")

# Derived locations: the QA-pair files are read from `dataset_path`,
# the per-user profile JSON files from `profiles_path`.
dataset_path = input_path.joinpath(dataset, "preprocessed")
profiles_path = json_path.joinpath(dataset)

def get_user_profile(user_id: str, profiles_path: Path) -> dict:
    """Load the stored profile of a single user.

    Args:
        user_id: Identifier used to build the profile file name.
        profiles_path: Directory containing the ``user_profile_<user_id>.json``
            files.

    Returns:
        The parsed JSON content of the user's profile file.

    Raises:
        FileNotFoundError: If no profile file exists for ``user_id``.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    user_profile_file = profiles_path / f"user_profile_{user_id}.json"
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which would garble non-ASCII profile text on systems
    # whose default is not UTF-8.
    with user_profile_file.open(encoding="utf-8") as f:
        return json.load(f)

In [2]:
# Training pairs: a JSON file. Each record is later unpacked as
# process_qa_pair(**record), so it must provide "question" and "answer" keys.
with open(dataset_path / "train_qa_pairs_kqt.json", encoding="utf-8") as f:
    raw_data_train = json.load(f)

# Test pairs: a text file with one "<question>: ... <answer>: ..." record per line.
with open(dataset_path / "test_qa_pairs_kqt.txt", encoding="utf-8") as f:
    test_lines = f.read().splitlines()

# Split every line at the first answer marker. `partition` never raises, so a
# line without the marker (e.g. a blank line) is filtered out here instead of
# aborting the whole cell with an unpacking error; lines without a question
# marker are dropped as well.
raw_data_test = [
    {"question": question, "answer": answer}
    for question, marker, answer in (line.partition("<answer>:") for line in test_lines)
    if marker and "<question>:" in question
]

In [6]:
import re

# Template for one supervised fine-tuning example. It is filled via str.format
# with three fields: `profile` (the system prompt), `instruction` (the user
# turn) and `answer` (the expected completion).
# NOTE(review): the [INST] / <<SYS>> markers look like the Llama 2 chat format —
# confirm against the target model. The literal <s> / </s> may duplicate the
# special tokens a tokenizer adds on its own; verify before training.
LLAMA_PROMPT = """<s>[INST] <<SYS>>
{profile}
<</SYS>>

{instruction} [/INST] {answer} </s>"""

def create_system_prompt(user_profile: dict, user_id: str) -> str:
    """Render a user's profile as the text of the system prompt.

    Args:
        user_profile: Profile mapping with the keys ``attributes`` (exactly
            four values, used as age, gender, education and socio-economic
            status, in that order), ``traits``, ``preferences``, ``routines``
            (iterables of strings) and ``user_profile`` (a free-text
            description).
        user_id: Identifier inserted into the opening sentence.

    Returns:
        The assembled multi-line prompt text.
    """
    age, gender, education, socioeco = user_profile["attributes"]
    # Each list-valued field is rendered as a comma-separated enumeration.
    trait_text, preference_text, routine_text = (
        ", ".join(user_profile[field]) for field in ("traits", "preferences", "routines")
    )
    description = user_profile["user_profile"]

    prompt_lines = [
        f"You are user {user_id} and your basic information is as follows:",
        f"Age: {age}; Gender: {gender}; Education: {education}; SocioEco: {socioeco}.",
        f"You have the following traits: {trait_text}.",
        f"You have the following preferences: {preference_text}.",
        f"You have the following routines: {routine_text}.",
        f"{description}",
    ]
    return "\n".join(prompt_lines)

def process_qa_pair(question: str, answer: str) -> dict:
    """Turn one raw question/answer pair into a profile-conditioned record.

    The historical-data section of the question is dropped: the instruction
    consists of the current trajectory followed by the final "Which POI id ..."
    sentence. The user id is read from the trajectory header and used to load
    that user's profile from the module-level ``profiles_path``.

    Args:
        question: Raw question text, optionally carrying a ``"<question>: "``
            prefix.
        answer: Raw answer text, optionally carrying an ``"<answer>: "`` prefix.

    Returns:
        A dict with the keys ``system_prompt``, ``inputs``, ``targets`` and
        ``llama_prompt``.

    Raises:
        ValueError: If the question contains no "Which POI id ..." sentence, or
            if the trajectory does not start with the header naming the user.
    """
    # remove <answer> prefix
    answer = answer.replace("<answer>: ", "").strip()

    # remove <question> prefix and cut off the historical data
    question = question.replace("<question>: ", "")
    current_trajectory_prompt = question.split("There is also historical data:")[0].strip()

    # get instruction
    # NOTE(review): the final "." of the pattern is unescaped, so it matches any
    # character rather than only a literal period.
    question_match = re.search(
        r"(Given the data, At (.+?), Which POI id will user (\d+) visit\? Note that POI id is an integer in the range from 0 to (\d+).)",
        question,
    )
    if question_match is None:
        # Fail with the offending text instead of an AttributeError on None, so
        # a malformed record can be located in the input file.
        raise ValueError(
            f"No 'Which POI id will user ... visit?' sentence found; question ends with: {question[-200:]!r}"
        )
    instruction = current_trajectory_prompt + "\n" + question_match.group(1)

    # get user profile from id
    user_match = re.match(r"The following data is a trajectory of user (\d+):", current_trajectory_prompt)
    if user_match is None:
        raise ValueError(
            f"Trajectory does not start with the user header; it starts with: {current_trajectory_prompt[:200]!r}"
        )
    user_id = user_match.group(1)
    user_profile = get_user_profile(user_id, profiles_path)

    # create system prompt
    system_prompt = create_system_prompt(user_profile, user_id)

    # create llama SFT prompt
    llama_prompt = LLAMA_PROMPT.format(profile=system_prompt, instruction=instruction, answer=answer)

    return {"system_prompt": system_prompt, "inputs": instruction, "targets": answer, "llama_prompt": llama_prompt}

In [9]:
from tqdm.auto import tqdm

# Build the prompt records for both splits. The `desc` labels make the two
# progress bars distinguishable in the cell output.
train_data = [process_qa_pair(**datum) for datum in tqdm(raw_data_train, desc="train")]
test_data = [process_qa_pair(**datum) for datum in tqdm(raw_data_test, desc="test")]

  0%|          | 0/11022 [00:00<?, ?it/s]

100%|██████████| 11022/11022 [00:07<00:00, 1546.25it/s]
100%|██████████| 1429/1429 [00:01<00:00, 1397.46it/s]


In [10]:
train_data[0]

{'system_prompt': 'You are user 0 and your basic information is as follows:\nAge: adult; Gender: male; Education: college & beyond; SocioEco: middle.\nYou have the following traits: extroverted, agreeable, conscientious, emotionally stable, open.\nYou have the following preferences: socializing at bars, enjoying diverse cuisines, visiting coffee shops, exploring flea markets, watching movies.\nYou have the following routines: frequent visits to bars and restaurants, regular coffee outings, attending social events on weekends.\nUser 0 is an outgoing adult male, likely in his late 20s to early 30s, who enjoys an active social life. He frequently visits bars, and restaurants, and is open to trying various cuisines, as indicated by his choices of American, French, and burger joints. His preference for social settings suggests a high level of extroversion and agreeability. User 0 also appears conscientious, as he maintains a routine that involves balancing social activities, including atten

In [8]:
test_data[0]

{'system_prompt': 'You are user 1 and your basic information is as follows:\nAge: adult; Gender: male; Education: college & beyond; SocioEco: middle.\nYou have the following traits: extroverted, agreeable, conscientious, emotionally stable, open.\nYou have the following preferences: frequent coffee shops, enjoys gym workouts, likes dining out, shops at department stores, frequent subway user.\nYou have the following routines: visits gym regularly, stops by coffee shops after meals, shops at department stores on weekends, uses public transportation frequently.\nUser 1 is an outgoing male in his late 20s who actively engages with his community through visits to various local amenities. He frequents coffee shops and enjoys socializing over a cup of coffee, which suggests a preference for casual social environments and a love for good food and drink. Regular trips to the gym indicate a health-conscious lifestyle, coupled with a disciplined routine that highlights his conscientious nature. 