In [None]:
import pandas as pd
import json
from datetime import timedelta

In [None]:
# Load the three raw JSON exports into plain Python objects.

def _read_json(path):
    """Return the parsed content of the JSON file stored at ``path``."""
    with open(path, "r") as source:
        return json.load(source)

sessions = _read_json("./data/raw_data/all_sessions.json")
users = _read_json("./data/raw_data/user_ids_and_session_ids.json")
tracks = _read_json("./data/raw_data/trackidsandstringtags.json")

In [None]:
# Load the dataframes written by an earlier run (JSON with orient="index").
# NOTE: the "create dataframes" cell rebinds users_df, sessions_df and track_df,
# and the "create tag dataframe" cell rebinds tags_df.

def _read_frame(path):
    """Read one index-oriented JSON export into a DataFrame."""
    with open(path, "r") as source:
        return pd.read_json(source, orient="index")

sessions_df = _read_frame("./data/sessions_10sessions_2plays_tf.json")
users_df = _read_frame("./data/users_10sessions_2plays_tf.json")
track_df = _read_frame("./data/tracks_10sessions_2plays_tf.json")
tags_df = _read_frame("./data/tags_10sessions_2plays_tf.json")

In [None]:
# Build the raw dataframes. Keys coming out of json.load are strings, so they
# are cast to int before being used as ids.

users_df = pd.DataFrame(
    [[int(user_id), user_sessions] for user_id, user_sessions in users.items()],
    columns=["user_id", "sessions"],
).set_index("user_id")

sessions_df = pd.DataFrame(
    [[int(session_id), session] for session_id, session in sessions.items()],
    columns=["session_id", "session"],
).set_index("session_id")

# track_df keeps its default index; track_id stays a regular column.
track_df = pd.DataFrame(
    [[int(track_id), tags] for track_id, tags in tracks.items()],
    columns=["track_id", "tags"],
)

In [None]:
# Keep only sessions that have at least 5 track entries, no entry containing
# None, and no entry whose value at position 3 equals 1.

def _is_usable_session(session):
    """Return True when ``session`` passes all three filters, checked in order."""
    if len(session) < 5:
        return False
    if any(None in entry for entry in session):
        return False
    return not any(entry[3] == 1 for entry in session)

# astype(bool) keeps the mask boolean even when sessions_df is empty.
sessions_df = sessions_df[sessions_df["session"].apply(_is_usable_session).astype(bool)]

In [None]:
# Map every track id to its tag list; used to remove sessions that start with
# a track that has no tags (tensorflow compatibility).

track_id_to_tags_dict = dict(zip(track_df["track_id"].tolist(), track_df["tags"].tolist()))

def find_first_track_no_tags(session):
    """Return True when the first track of ``session`` has no truthy tag.

    The track id is read from position 0 of the first session entry, cast to
    int and looked up in ``track_id_to_tags_dict``.
    """
    first_track_id = int(session[0][0])
    return not any(track_id_to_tags_dict[first_track_id])

# Drop the sessions flagged by find_first_track_no_tags.
_starts_untagged = sessions_df["session"].apply(find_first_track_no_tags)
sessions_df = sessions_df[_starts_untagged.eq(False)]

In [None]:
# Count how often each track is played across all remaining sessions.

track_ids = track_df["track_id"].tolist()
track_plays = [
    int(track[0])
    for session in sessions_df["session"].tolist()
    for track in session
]
initial_plays = [0] * len(track_ids)
track_plays_dict = dict.fromkeys(track_ids, 0)

# Raises KeyError when a session references a track id that is not in track_df.
for played_id in track_plays:
    track_plays_dict[played_id] = track_plays_dict[played_id] + 1

track_df["plays"] = track_df["track_id"].apply(lambda track_id: track_plays_dict[track_id])

In [None]:
# Keep only the tracks that were played more than once.

_played_repeatedly = track_df["plays"] > 1
track_df = track_df.loc[_played_repeatedly]

In [None]:
# Build a lookup from every integer in [0, largest valid track id] to a flag
# telling whether that id belongs to a track kept in track_df.

valid_ids = track_df["track_id"].tolist()
idxs = list(range(max(valid_ids) + 1))

valid_dict = dict.fromkeys(idxs, False)
valid_dict.update(dict.fromkeys(valid_ids, True))

In [None]:
# remove sessions with single play songs

def contains_invalid(session):
    """Return True when ``session`` holds at least one track that is not valid.

    A track is valid only when ``valid_dict`` maps its id (position 0 of the
    track entry, cast to int) to True. ``valid_dict`` only has keys up to the
    largest valid track id, so an id missing from it is treated as invalid
    instead of raising ``KeyError``.

    Args:
        session: iterable of track entries.

    Returns:
        bool: True if any track is invalid, False otherwise (also for an
        empty session).
    """
    return any(not valid_dict.get(int(track[0]), False) for track in session)

# Drop the sessions flagged by contains_invalid.
_has_invalid_track = sessions_df["session"].apply(contains_invalid)
sessions_df = sessions_df[_has_invalid_track.eq(False)]

In [None]:
# Remove, for every user, the session ids that are no longer in sessions_df.

session_ids = sessions_df.index.values
# Membership tests against the ndarray scan the whole array on every lookup;
# a set built once keeps each lookup constant-time.
_surviving_session_ids = set(session_ids.tolist())

def remove_invalid_sessions(sessions):
    """Return the ids from ``sessions`` that are still present in ``sessions_df``.

    Args:
        sessions: iterable of session ids belonging to one user.

    Returns:
        list: the surviving ids, in their original order.
    """
    return [session_id for session_id in sessions if session_id in _surviving_session_ids]

users_df["clean_sessions"] = users_df["sessions"].apply(remove_invalid_sessions)

In [None]:
# Drop users whose clean_sessions list is empty.

_clean_session_counts = users_df["clean_sessions"].apply(len)
_users_without_sessions = users_df[_clean_session_counts == 0].index
users_df.drop(_users_without_sessions, inplace=True)

In [None]:
# create subset of user sessions

def get_six_month_history(sessions):
    """Cut a user's session list at the first session starting too late.

    The start of a session is the value at position 1 of its first entry in
    ``sessions_df``, parsed with ``pd.to_datetime``. Sessions are scanned in
    list order; the list is truncated just before the first session that
    starts more than 183 days after the first one.
    NOTE(review): presumably ``sessions`` is ordered chronologically - confirm.

    A one-element list is returned unchanged without touching ``sessions_df``.
    """
    if len(sessions) == 1:
        return sessions

    def first_timestamp(session_id):
        # Timestamp stored with the first track entry of the session.
        return pd.to_datetime(sessions_df.loc[session_id]["session"][0][1])

    window = timedelta(days=183)
    origin = first_timestamp(sessions[0])
    for position, session_id in enumerate(sessions[1:], start=1):
        if first_timestamp(session_id) - origin > window:
            return sessions[:position]
    return sessions

# Truncate every user's clean session list with get_six_month_history.
_clean_sessions = users_df["clean_sessions"]
users_df["sessions_subset"] = _clean_sessions.apply(get_six_month_history)

In [None]:
# Number of sessions in each user's subset.

users_df["session_count"] = users_df["sessions_subset"].apply(len)

In [None]:
# Keep only users with at least 10 sessions in their subset.

_has_enough_sessions = users_df["session_count"] >= 10
users_df = users_df.loc[_has_enough_sessions]

In [None]:
# Sample 200 users to create a smaller dataset. The fixed random_state makes
# the sample, and every file derived from it below, reproducible across
# kernel restarts; without it each run saves a different dataset.

users_df = users_df.sample(n=200, random_state=42)

In [None]:
# Keep only the sessions listed in some user's sessions_subset.

subset_sessions = [
    session_id
    for user_sessions in users_df["sessions_subset"].tolist()
    for session_id in user_sessions
]
# .copy() gives an independent frame, like the drop-based version did.
subset_sessions_df = sessions_df.loc[sessions_df.index.isin(subset_sessions)].copy()

In [None]:
# Keep only the tracks that occur in at least one subset session.

valid_songs = sorted({
    int(track[0])
    for session in subset_sessions_df["session"].tolist()
    for track in session
})
# .copy() gives an independent frame, like the drop-based version did.
subset_track_df = track_df.loc[track_df["track_id"].isin(valid_songs)].copy()

In [None]:
# Order the tracks by id, discard the old index and renumber the rows from 1.

subset_track_df = subset_track_df.sort_values("track_id").reset_index(drop=True)
subset_track_df.index += 1

In [None]:
# Create the tag dictionary. Ids start at 1 and follow the order in which the
# tags first appear in subset_track_df. Numbering from a set of strings would
# change between interpreter runs (string hashes are randomized per process),
# so the saved tag ids would not be reproducible.

total_tags = [tag for track_tags in subset_track_df["tags"].tolist() for tag in track_tags]
# dict.fromkeys removes duplicates while preserving first-appearance order.
unique_tags = list(dict.fromkeys(total_tags))

tags_dict = {tag: tag_id for tag_id, tag in enumerate(unique_tags, start=1)}

def get_tag_ids(tags):
    """Translate a list of tags into their ids by looking each up in ``tags_dict``."""
    return [tags_dict[tag] for tag in tags]

# Replace each track's tags with their ids (not idempotent: run once per kernel).
_tag_lists = subset_track_df["tags"]
subset_track_df["tags"] = _tag_lists.apply(get_tag_ids)

In [None]:
# Create the tag dataframe: one row per tag with its integer id.

tags_df = pd.DataFrame(
    [(tag, int(tag_id)) for tag, tag_id in tags_dict.items()],
    columns=["tag", "tag_id"],
)

In [None]:
# Map each track id to its index label in subset_track_df.
track_idxs = subset_track_df.index.tolist()
track_ids = subset_track_df["track_id"].tolist()

track_idxs_dict = dict(zip(track_ids, track_idxs))

In [None]:
# Map each track index label (track_idxs from the previous cell) to its tag list.
track_tags = subset_track_df["tags"].tolist()

track_tags_dict = dict(zip(track_idxs, track_tags))

In [None]:
# get track idxs from ids

def get_idxs(session):
    """Return the ``track_idxs_dict`` value for every track entry of ``session``.

    The track id is taken from position 0 of each entry and cast to int; all
    ids are converted first, then looked up.
    """
    ids_in_session = list(map(lambda track: int(track[0]), session))
    return list(map(lambda track_id: track_idxs_dict[track_id], ids_in_session))

# Store the track indices of every session in a new column.
_raw_sessions = subset_sessions_df["session"]
subset_sessions_df["track_idxs"] = _raw_sessions.apply(get_idxs)

In [None]:
# get tag idxs from ids

def get_tag_idxs(track_idxs):
    """Return one zero-padded tag-id list per track index.

    The tag lists come from ``track_tags_dict``. Every returned list is a new
    list, right-padded with 0 up to the length of the longest one, so the
    lists stored in ``track_tags_dict`` are left untouched.
    """
    tag_lists = [track_tags_dict[idx] for idx in track_idxs]
    width = max(len(tags) for tags in tag_lists)  # most tags on a single track
    return [tags + [0] * (width - len(tags)) for tags in tag_lists]

# Store the padded tag ids of every session in a new column.
_session_track_idxs = subset_sessions_df["track_idxs"]
subset_sessions_df["tags_idxs"] = _session_track_idxs.apply(get_tag_idxs)

In [None]:
# get skip features

def get_skip(session, skip_threshold=0.9):
    """Encode every track of ``session`` as skipped (2) or not skipped (1).

    Args:
        session: iterable of track entries; the value at position 2 of each
            entry is compared against ``skip_threshold``.
            NOTE(review): presumably the played fraction of the track - confirm.
        skip_threshold: entries whose value at position 2 is strictly below
            this are encoded as 2. Defaults to 0.9.

    Returns:
        list[int]: one code per track, in session order.
    """
    return [2 if track[2] < skip_threshold else 1 for track in session]

# Store the skip codes of every session in a new column.
_sessions_for_skips = subset_sessions_df["session"]
subset_sessions_df["skips"] = _sessions_for_skips.apply(get_skip)

In [None]:
# get skip and gap features

def get_gap_skip(session, skip_threshold=0.9, gap_threshold=30):
    """Encode every track of ``session`` as skip (2), gap (3) or neither (1).

    The skip check takes priority: position 4 of an entry is only read when
    the entry is not a skip.

    Args:
        session: iterable of track entries; position 2 is compared against
            ``skip_threshold`` and position 4 against ``gap_threshold``.
            NOTE(review): units of both values are not visible here - confirm.
        skip_threshold: value at position 2 strictly below this gives code 2.
            Defaults to 0.9.
        gap_threshold: value at position 4 at or above this gives code 3.
            Defaults to 30.

    Returns:
        list[int]: one code per track, in session order.
    """
    def encode(track):
        if track[2] < skip_threshold:
            return 2
        if track[4] >= gap_threshold:
            return 3
        return 1

    return [encode(track) for track in session]

# Store the combined skip/gap codes of every session in a new column.
_sessions_for_actions = subset_sessions_df["session"]
subset_sessions_df["action"] = _sessions_for_actions.apply(get_gap_skip)

In [None]:
# Build session histories: each session id maps to the ids that come before
# it in the same user's sessions_subset (empty list for the first one).

user_sessions_list = users_df["sessions_subset"].tolist()
history_dict = {}

for user_sessions in user_sessions_list:
    for position, session_id in enumerate(user_sessions):
        history_dict[session_id] = user_sessions[:position]

def _history_for(row):
    """Look up the history of the session whose id is the row's index label."""
    return history_dict[row.name]

subset_sessions_df["history"] = subset_sessions_df.apply(_history_for, axis=1)

In [None]:
# The raw session entries are no longer needed once the features are derived.
subset_sessions_df.drop(columns="session", inplace=True)

In [None]:
# Drop the intermediate user columns one at a time, in the same order as before.
for _column in ("sessions", "clean_sessions", "session_count"):
    users_df.drop(columns=_column, inplace=True)

In [None]:
# Only the combined "action" codes are saved; the plain skip codes are dropped.
subset_sessions_df.drop(columns="skips", inplace=True)

In [None]:
# Save the processed frames as index-oriented JSON, in a fixed order.

for _frame, _path in (
    (users_df, "./data/skips_and_gaps/users_10sessions_2plays_skipsgaps_tf.json"),
    (subset_sessions_df, "./data/skips_and_gaps/sessions_10sessions_2plays_skipsgaps_tf.json"),
    (subset_track_df, "./data/skips_and_gaps/tracks_10sessions_2plays_skipsgaps_tf.json"),
    (tags_df, "./data/skips_and_gaps/tags_10sessions_2plays_skipsgaps_tf.json"),
):
    _frame.to_json(_path, orient="index")