In [None]:
import os
import sys
import csv
import numpy as np
import pandas as pd
from datetime import datetime

# Text encoding passed to every raw-file read below.
encoding = "ISO-8859-1"

# Raw input files, all located in ../data relative to the working directory.
data_dir = os.path.join("..", "data")
(
    artists_tsv,
    tags_tsv,
    user_artists_tsv,
    user_friends_tsv,
    user_taggedartists_tsv,
) = (
    os.path.join(data_dir, file_name)
    for file_name in (
        "artists.dat",
        "tags.dat",
        "user_artists.dat",
        "user_friends.dat",
        "user_taggedartists-timestamps.dat",
    )
)

In [None]:
# Artist table; the raw file is tab-separated.
artists_df = pd.read_csv(artists_tsv, sep="\t", encoding=encoding)
artists_df.head()

In [None]:
# Tag table; the raw file is tab-separated.
tags_df = pd.read_csv(tags_tsv, sep="\t", encoding=encoding)
tags_df.head()

In [None]:
# Undirected user-user edges (friendships).
friends_df = pd.read_csv(user_friends_tsv, sep="\t", encoding=encoding)
friends_df.head()

In [None]:
# Directed user -> item (artist) edges; the weight column is the listening count.
listened_df = pd.read_csv(user_artists_tsv, sep="\t", encoding=encoding)
listened_df.head()

In [None]:
# Directed user-tag-item edges, each with a timestamp in milliseconds from epoch.
assigned_df = pd.read_csv(user_taggedartists_tsv, sep="\t", encoding=encoding)
assigned_df.head()

In [None]:
# Sanity check: render a sample epoch value (in seconds) as a local datetime.
sample_epoch_s = 1238536800
datetime.fromtimestamp(sample_epoch_s)

In [None]:
# Distinct original artist IDs, in order of first appearance, as plain Python values.
unique_artist_values = artists_df["id"].unique()
artists_ids = unique_artist_values.tolist()
(len(artists_ids), max(artists_ids))

In [None]:
# Distinct original tag IDs, in order of first appearance, as plain Python values.
unique_tag_values = tags_df["tagID"].unique()
tag_ids = unique_tag_values.tolist()
(len(tag_ids), max(tag_ids))

In [None]:
# Collect every original user ID that appears in any raw table, not only in
# the friendship edges. A user present only in the listening or tagging data
# would otherwise be missing from the ID mapping, map to NaN, and break the
# integer cast when the edge lists are exported.
user_id_columns = [
    friends_df["userID"],
    friends_df["friendID"],
    listened_df["userID"],
    assigned_df["userID"],
]
# Sorted so that the global IDs assigned from this sequence follow a
# well-defined order instead of set iteration order.
user_ids = sorted(
    {user_id for column in user_id_columns for user_id in column.unique().tolist()}
)
len(user_ids), max(user_ids)

# Create mappings from user/artist/tag IDs to global node IDs

In [None]:
# Node counts per type; num_users and num_artists serve as offsets when
# global node IDs are assigned.
num_users = len(user_ids)
num_artists = len(artists_ids)
num_tags = len(tag_ids)

In [None]:
# Global node ID layout: users occupy [0, num_users), artists follow
# immediately after, and tags come after the artists.
user_id2global = {orig_id: global_id for global_id, orig_id in enumerate(user_ids)}

artist_id2global = {
    orig_id: global_id
    for global_id, orig_id in enumerate(artists_ids, start=num_users)
}

tag_id2global = {
    orig_id: global_id
    for global_id, orig_id in enumerate(tag_ids, start=num_users + num_artists)
}

In [None]:
# Attach global node IDs to every table. Series.map leaves NaN wherever an
# original ID has no entry in the corresponding mapping.
artists_df["global_id"] = artists_df["id"].map(artist_id2global)
tags_df["global_tagID"] = tags_df["tagID"].map(tag_id2global)

friends_df["global_userID"] = friends_df["userID"].map(user_id2global)
friends_df["global_friendID"] = friends_df["friendID"].map(user_id2global)

listened_df["global_userID"] = listened_df["userID"].map(user_id2global)
listened_df["global_artistID"] = listened_df["artistID"].map(artist_id2global)

# Keep only tag assignments whose artist has a global ID. The explicit .copy()
# makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning.
assigned_df = assigned_df[assigned_df["artistID"].isin(artist_id2global.keys())].copy()
assigned_df["global_userID"] = assigned_df["userID"].map(user_id2global)
assigned_df["global_artistID"] = assigned_df["artistID"].map(artist_id2global)
assigned_df["global_tagID"] = assigned_df["tagID"].map(tag_id2global)
assigned_df["timestamp_s"] = assigned_df["timestamp"] // 1000  # ms -> s from epoch

assigned_df.head()

In [None]:
# Rows where any global ID failed to map. assigned_df has already been
# restricted to artists with a global ID, so looking at the artist column alone
# can never flag anything; unmapped users or tags are what this check can
# actually catch. An empty frame means every assignment is fully mapped.
global_id_cols = ["global_userID", "global_artistID", "global_tagID"]
nan_df = assigned_df[assigned_df[global_id_cols].isna().any(axis=1)]
nan_df

# Write node and edge list CSV files with global node IDs

In [None]:
# Node-list outputs (one CSV per node type), written next to the raw data.
users_csv, artists_csv, tags_csv = (
    os.path.join(data_dir, file_name)
    for file_name in ("users.csv", "artists.csv", "tags.csv")
)

# Edge-list outputs.
user_artists_csv, user_friends_csv, user_taggedartists_csv = (
    os.path.join(data_dir, file_name)
    for file_name in ("user_artists.csv", "user_friends.csv", "user_taggedartists.csv")
)

In [None]:
# User node list: global ID alongside the original ID, in mapping order.
orig_user_ids = list(user_id2global)
global_user_ids = [int(global_id) for global_id in user_id2global.values()]
user_id_df = pd.DataFrame(dict(global_id=global_user_ids, orig_id=orig_user_ids))
user_id_df.to_csv(users_csv, index=False)
user_id_df.head()

In [None]:
# Artist node list: global ID alongside the original ID, both cast to int.
artist_id_df = (
    artists_df.loc[:, ["global_id", "id"]]
    .rename(columns={"id": "orig_id"})
    .astype(int)
)
artist_id_df.to_csv(artists_csv, index=False)
artist_id_df.head()

In [None]:
# Tag node list: global ID alongside the original ID, both cast to int.
tag_column_names = {"global_tagID": "global_id", "tagID": "orig_id"}
tag_id_df = (
    tags_df.loc[:, ["global_tagID", "tagID"]]
    .rename(columns=tag_column_names)
    .astype(int)
)
tag_id_df.to_csv(tags_csv, index=False)
tag_id_df.head()

In [None]:
# User -> artist edge list with the listening weight, expressed in global IDs.
listen_column_names = {"global_userID": "user_id", "global_artistID": "artist_id"}
user_artists_df = (
    listened_df.loc[:, ["global_userID", "global_artistID", "weight"]]
    .rename(columns=listen_column_names)
    .astype(int)
)
user_artists_df.to_csv(user_artists_csv, index=False)
user_artists_df.head()

In [None]:
# User-user friendship edge list, expressed in global IDs.
friend_column_names = {"global_userID": "user_id", "global_friendID": "friend_id"}
user_friends_df = (
    friends_df.loc[:, ["global_userID", "global_friendID"]]
    .rename(columns=friend_column_names)
    .astype(int)
)
user_friends_df.to_csv(user_friends_csv, index=False)
user_friends_df.head()

In [None]:
# User-tag-artist edge list in global IDs, with the timestamp in seconds.
# The dict's insertion order fixes both the selected columns and their order.
tagging_column_names = {
    "global_userID": "user_id",
    "global_tagID": "tag_id",
    "global_artistID": "artist_id",
    "timestamp_s": "timestamp",
}
user_taggedartists_df = assigned_df[list(tagging_column_names)].rename(
    columns=tagging_column_names
)
# Report per column whether any value is missing before the integer cast.
print(user_taggedartists_df.isnull().any())
user_taggedartists_df.astype(int).to_csv(user_taggedartists_csv, index=False)
user_taggedartists_df.head()