In [None]:
import json
import pandas as pd

# The implementation by Ludewig et al. assumes music data to be of the format:
#
#         UserId	SessionId	ItemId	Time	ArtistId
#         27063	    1889046     2691760	1402997433	337496
#         27063	    1889046	    2691717	1402997784	337496
#         ...
#
# However, their implementation does not use the ArtistId column outside of
# the ArtistDiversity and ArtistCoherence evaluation metrics. We omit that
# column because we do not employ those metrics.

# Input files: `sessions` is indexed by session id (as a string) further down,
# and `user_ids` is iterated as user -> list of session ids.
SESSIONS_PATH = "Baselines/STABR/data/all_sessions.json"
USER_SESSIONS_PATH = "Baselines/STABR/data/user_ids_and_session_ids.json"

with open(SESSIONS_PATH, "r") as sessions_file:
    sessions = json.load(sessions_file)

with open(USER_SESSIONS_PATH, "r") as user_ids_file:
    user_ids = json.load(user_ids_file)

In [None]:
# Collect the first field of every entry in every session, cast to int, then
# report the total number of entries and the number of distinct values.
tracks = [int(entry[0]) for session in sessions.values() for entry in session]
print(len(tracks))
print(len(set(tracks)))

In [None]:
# Dump every interaction, unmodified, as one comma-separated row per entry.
with open("all_interactions.csv", "w") as out:
    out.write("UserId,SessionId,ItemId,Time\n")

    for user, session_ids in user_ids.items():
        # One row per entry of each of this user's sessions, in stored order.
        out.writelines(
            "{},{},{},{}\n".format(user, session_id, entry[0], entry[1])
            for session_id in session_ids
            for entry in sessions[str(session_id)]
        )
        print("Completed ", user)

In [None]:
import math

# Fraction of each user's sessions (taken in stored order) assigned to train;
# the remainder goes to test.
TRAIN_TEST_SPLIT = 0.7


def _write_session_rows(out, session_ids):
    """Write one tab-separated `SessionId ItemId Time` row per interaction.

    `out` is an open text file; `session_ids` are looked up in the global
    `sessions` mapping (keyed by the session id as a string).
    """
    for session_id in session_ids:
        for entry in sessions[str(session_id)]:
            # Timestamp.value is nanoseconds since the epoch -> whole seconds.
            timestamp = int(pd.to_datetime(entry[1]).value / 1e9)
            out.write("{}\t{}\t{}\n".format(session_id, int(entry[0]), timestamp))


# Both files are opened exactly once, *outside* the per-user loop. Opening them
# with mode "w" inside the loop truncated them on every iteration, so only the
# last user's sessions ended up on disk.
with open("preprocessed_train.csv", "w") as train_out, \
        open("preprocessed_test.csv", "w") as test_out:
    # The header uses the same tab delimiter as the data rows.
    # NOTE(review): confirm the downstream reader parses these files with sep="\t".
    train_out.write("SessionId\tItemId\tTime\n")
    test_out.write("SessionId\tItemId\tTime\n")

    for user, session_ids in user_ids.items():
        # The first ceil(70%) of the user's sessions go to train, the rest to test.
        split_index = math.ceil(TRAIN_TEST_SPLIT * len(session_ids))
        _write_session_rows(train_out, session_ids[:split_index])
        _write_session_rows(test_out, session_ids[split_index:])

        print("Completed ", user)

In [None]:
# Export all interactions as comma-separated rows with integer user/item ids
# and the time converted to whole seconds.
LUDEWIG_ROW = "{},{},{},{}\n"

with open("preprocessed_ludewig.csv", "w") as out:
    out.write("UserId,SessionId,ItemId,Time\n")

    for user, session_ids in user_ids.items():
        for session_id in session_ids:
            session_entries = sessions[str(session_id)]
            for entry in session_entries:
                # Timestamp.value is in nanoseconds; scale down to seconds.
                nanoseconds = pd.to_datetime(entry[1]).value
                timestamp = int(nanoseconds / 1e9)
                out.write(LUDEWIG_ROW.format(int(user), session_id, int(entry[0]), timestamp))
        print("Completed ", user)

In [None]:
# Alternative export that assigns fresh, consecutive session ids.
# NOTE(review): this writes to the same path as the previous cell and therefore
# overwrites its output -- confirm which of the two exports is the intended one.
# NOTE(review): the keys of `sessions` are used as session ids elsewhere in this
# notebook, yet they are written to the UserId column here -- confirm intent.
with open("preprocessed_ludewig.csv", "w") as out:
    out.write("UserId,SessionId,ItemId,Time\n")
    # One id per session: the counter advances once per session rather than
    # once per entry, so all interactions of a session share a SessionId
    # (previously every single row received its own SessionId).
    for session_id, (user, session) in enumerate(sessions.items()):
        for entry in session:
            # Timestamp.value is nanoseconds since the epoch -> whole seconds.
            timestamp = int(pd.to_datetime(entry[1]).value / 1e9)
            out.write("{},{},{},{}\n".format(int(user), session_id, int(entry[0]), timestamp))
        print("Done", user)