In [None]:
import os
import random
from typing import Dict, List, Any

In [None]:
# Seed the global RNG so every run of the notebook generates the same dataset.
random.seed(0)

# --- Dataset size ---------------------------------------------------------
# Users generated per (gender, age, seq_length) combination.
user_count_per_segment: int = 1000
# Item ids generated per (genre, year) combination; also the exclusive upper
# bound for the item ids sampled into sequences.
item_count_per_segment: int = 50
# Length(s) of the training sequence generated for each user.
seq_lengths: List[int] = [50]
# Length of the held-out test sequence generated for each user.
test_length: int = 20

# --- User attributes ------------------------------------------------------
genders: List[str] = ["M", "F"]
ages: List[int] = [20, 30, 40, 50, 60]

# --- Item attributes ------------------------------------------------------
# Alternative, finer-grained genre vocabularies (currently disabled):
# genres = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4", "F5"]
# genres = ["M1", "M2", "M3", "E1", "E2", "E3", "F1", "F2", "F3"]
# genres = ["M1", "M2", "M3", "M4", "M5", "E1", "E2", "E3", "E4", "E5", "F1", "F2", "F3", "F4", "F5"]
# NOTE: the per-gender genre weights used by the generation loop have three
# entries, so they only line up with this three-genre list.
genres: List[str] = ["M", "E", "F"]
# A user's favoured item year is `base_year - age`.
base_year: int = 2020
years: List[int] = [1960, 1970, 1980, 1990, 2000]

# --- Output containers (filled in by the generation loops) -----------------
# user name -> list of item names
raw_sequences: Dict[str, List[str]] = {}
test_raw_sequences: Dict[str, List[str]] = {}
# user name / item name -> metadata dict
users: Dict[str, Dict[str, Any]] = {}
items: Dict[str, Dict[str, Any]] = {}

def get_user_name(user_id: int, gender: str, age: int, seq_length: int) -> str:
    """Build the string key that identifies one generated user.

    The name has the form ``u_<user_id>_<gender>_<age>_<seq_length>_<gender><k>``
    where ``k`` is ``user_id % 5 + 1`` (a value from 1 to 5).
    """
    bucket = user_id % 5 + 1
    return f"u_{user_id}_{gender}_{age}_{seq_length}_{gender}{bucket}"

def get_item_name(item_id: int, genre: str, year: int) -> str:
    """Build the string key that identifies one item.

    The name has the form ``v_<item_id>_<genre>_<year>``, so the genre and
    year of an item can be read directly from its name.
    """
    return f"v_{item_id}_{genre}_{year}"

def _sample_item_sequence(genre_weight: List[float], year_weight: List[float], length: int) -> List[str]:
    """Draw ``length`` random item names for one user.

    Genres and years are sampled independently with replacement using the
    given relative weights (aligned positionally with ``genres`` and
    ``years``); item ids are sampled uniformly from
    ``0 .. item_count_per_segment - 1`` and then sorted ascending, so the
    resulting sequence is ordered by item id.

    The draw order (genres, then years, then item ids) is deliberate: it keeps
    the consumption of the seeded global RNG stable, so the generated dataset
    is reproducible.
    """
    genre_list = random.choices(genres, genre_weight, k=length)
    year_list = random.choices(years, year_weight, k=length)
    item_id_list = sorted(random.randint(0, item_count_per_segment - 1) for _ in range(length))
    return [
        get_item_name(item_id, genre, year)
        for item_id, genre, year in zip(item_id_list, genre_list, year_list)
    ]


for gender in genders:
    # Relative genre preferences, aligned with `genres` (three entries).
    # "M" users favour the first genre, all other users favour the last one.
    if gender == "M":
        # genre_weight = [0.200] * 3 + [0.100] * 3 + [0.033] * 3
        genre_weight = [0.600, 0.300, 0.100]
    else:
        # genre_weight = [0.033] * 3 + [0.100] * 3 + [0.200] * 3
        genre_weight = [0.100, 0.300, 0.600]

    for age in ages:
        # The year equal to `base_year - age` is five times as likely as any
        # other year. random.choices treats these as relative weights, so they
        # do not need to sum to 1.
        year_weight = [0.50 if year == base_year - age else 0.10 for year in years]

        for seq_length in seq_lengths:
            for user_id in range(user_count_per_segment):
                user_name = get_user_name(user_id, gender, age, seq_length)

                # user-metadata
                users[user_name] = {
                    "gender": gender,
                    "age": age,
                }

                # Train sequence first, then test sequence: this order must be
                # kept so the seeded RNG yields the same data as before.
                raw_sequences[user_name] = _sample_item_sequence(genre_weight, year_weight, seq_length)
                test_raw_sequences[user_name] = _sample_item_sequence(genre_weight, year_weight, test_length)

# Register metadata for the full item catalogue: every (genre, year, item_id)
# combination, i.e. every item name the sequence sampling can produce.
for genre in genres:
    for year in years:
        for item_id in range(item_count_per_segment):
            item_name = get_item_name(item_id, genre, year)
            items[item_name] = dict(genre=genre, year=year)

In [None]:
import pandas as pd

# Metadata tables: one row per user / item, indexed by the generated name.
user_df = pd.DataFrame(list(users.values()), index=list(users.keys()))
item_df = pd.DataFrame(list(items.values()), index=list(items.keys()))

# Sequence tables: one row per user, with the item names of the sequence
# joined by single spaces into one "sequence" column.
train_sequences = [" ".join(sequence) for sequence in raw_sequences.values()]
train_df = pd.DataFrame(train_sequences, index=list(raw_sequences.keys()), columns=["sequence"])
test_sequences = [" ".join(sequence) for sequence in test_raw_sequences.values()]
test_df = pd.DataFrame(test_sequences, index=list(test_raw_sequences.keys()), columns=["sequence"])

# The index name becomes the header of the first CSV column.
user_df.index.name = "user_id"
item_df.index.name = "item_id"
train_df.index.name = "user_id"
test_df.index.name = "user_id"

data_dir = "../data/toydata-paper/"
# to_csv raises OSError if the target directory is missing, so create it
# first (no-op when it already exists).
os.makedirs(data_dir, exist_ok=True)

user_df.to_csv(os.path.join(data_dir, "users.csv"))
item_df.to_csv(os.path.join(data_dir, "items.csv"))
train_df.to_csv(os.path.join(data_dir, "train.csv"))
test_df.to_csv(os.path.join(data_dir, "test.csv"))