In [1]:
from pathlib import Path

# Dataset location: <data root>/<dataset name>.
path = Path("../data")
dataset_name = "tky"
dataset_path = path.joinpath(dataset_name)

# Gather the per-user profile files and order them by the integer that follows
# the last underscore in the file name (a plain lexicographic sort would put
# "user_profile_10" before "user_profile_2").
json_files = list(dataset_path.glob("user_profile_*.json"))
json_files.sort(key=lambda profile_file: int(profile_file.stem.rsplit("_", 1)[-1]))

# Preview the first few paths as the cell output.
json_files[:5]

[PosixPath('../data/tky/user_profile_1.json'),
 PosixPath('../data/tky/user_profile_2.json'),
 PosixPath('../data/tky/user_profile_3.json'),
 PosixPath('../data/tky/user_profile_4.json'),
 PosixPath('../data/tky/user_profile_5.json')]

In [2]:
import json

# Maps each user id (the text after the last underscore in the file name, kept
# as a string) to the system prompt assembled from that user's profile file.
user_profiles = {}

for json_file in json_files:
    user_id = json_file.stem.split("_")[-1]
    # JSON text is UTF-8 by specification; pin the encoding so decoding does not
    # depend on the platform's default locale.
    with open(json_file, "r", encoding="utf-8") as f:
        user_profile = json.load(f)

    # "attributes" must unpack into exactly four values in this order; anything
    # else raises ValueError here.
    age, gender, education, socioeco = user_profile["attributes"]
    # The list-valued fields are flattened into comma-separated text.
    traits = ", ".join(user_profile["traits"])
    preferences = ", ".join(user_profile["preferences"])
    routines = ", ".join(user_profile["routines"])
    # Free-text profile that is appended verbatim as the last prompt line.
    user_profile_str = user_profile["user_profile"]

    system_prompt = f"""You are user {user_id} and your basic information is as follows:
Age: {age}; Gender: {gender}; Education: {education}; SocioEco: {socioeco}.
You have the following traits: {traits}.
You have the following preferences: {preferences}.
You have the following routines: {routines}.
{user_profile_str}"""

    user_profiles[user_id] = system_prompt

In [3]:
# Position of each user id, following the iteration order of user_profiles.
user2idx = dict(zip(user_profiles, range(len(user_profiles))))
# Reverse lookup: position -> user id.
idx2user = dict(enumerate(user_profiles))

In [4]:
from sentence_transformers import SentenceTransformer

# Identifier of the sentence-embedding model handed to SentenceTransformer.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(embedding_model_name)

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Encode every system prompt, passing them in the iteration order of user_profiles.
# NOTE(review): batch_size=1 encodes one prompt per forward pass, which is slow;
# presumably chosen to limit memory use - confirm before raising it.
profile_texts = list(user_profiles.values())
user_profile_embeddings = model.encode(
    profile_texts,
    batch_size=1,
    show_progress_bar=True,
)

Batches: 100%|██████████| 2281/2281 [06:54<00:00,  5.51it/s]


In [6]:
import torch
from tqdm.auto import tqdm

# For every user: up to top_k (other_user_id, score) pairs, ordered from the
# highest to the lowest similarity score. Every user gets an entry, even when
# there are no other users to compare against.
user2similarity = {}
top_k = 100

for user_id in tqdm(user_profiles):
    current_user_idx = user2idx[user_id]
    user_profile_embedding = user_profile_embeddings[current_user_idx]
    # Similarity between this user's embedding and all embeddings (its own
    # included); the first row of the result is taken as the scores for this
    # single query embedding.
    similarity_scores = model.similarity(user_profile_embedding, user_profile_embeddings)[0]
    # Rank every candidate from most to least similar.
    scores, indices = torch.sort(similarity_scores, descending=True)

    # Only the user's own index is ever skipped, so the first top_k + 1 ranked
    # candidates are enough. Converting them to plain Python numbers once avoids
    # a .item() call on a 0-dim tensor for each candidate.
    candidate_scores = scores[: top_k + 1].tolist()
    candidate_indices = indices[: top_k + 1].tolist()

    neighbors = []
    for score, idx in zip(candidate_scores, candidate_indices):
        if len(neighbors) >= top_k:
            break
        if idx == current_user_idx:
            # Never list a user as their own neighbour.
            continue
        neighbors.append((str(idx2user[idx]), round(score, 4)))
    user2similarity[user_id] = neighbors

100%|██████████| 2281/2281 [00:30<00:00, 75.58it/s]


In [7]:
# Persist the neighbour lists next to the profile files of the same dataset.
similarity_path = dataset_path / "profile_similarities.json"
with similarity_path.open("w") as similarity_file:
    json.dump(user2similarity, similarity_file)