In [None]:
import re
import csv
import glob
import math
import json
import pandas as pd
from datetime import datetime

# The purpose of this script is to read through all files containing
# augmented rows and create two distinct dictionaries: mbid_rows and
# normal_rows depending on whether an mbid is available for the song.
# There should be 1,505,514 unique tracks and thus augmented rows in 
# total. We then iterate through all listening events of the 1K data
# set and extract sessions. This is done on a user-by-user basis: We
# take all events associated with each user and iterate through them.
# If more than 7200 seconds (120 minutes, as per the paper) have passed
# between two events, we consider them part of two distinct sessions.

## Last.fm (CSV)

In [None]:
def replace_all(text, dic):
    """Apply every ``old -> new`` substitution in ``dic`` to ``text``.

    Substitutions are applied one after another in the dictionary's
    iteration order, so a later pair also sees the output of earlier ones.

    Args:
        text: The string to rewrite.
        dic: Mapping of substring to replacement.

    Returns:
        The rewritten string.
    """
    for needle in dic:
        text = text.replace(needle, dic[needle])
    return text

# Build two lookup tables from data/lastfm.csv:
#   all_tags: tag name -> integer tag id (ids start at 1 and are handed
#             out in order of first appearance)
#   lastfm:   track id (last CSV column) -> metadata of that track, with
#             its tags stored as tag ids
all_tags = {}
tag_counter = 1
lastfm = {}
lastfm_index = 1

with open("data/lastfm.csv", "r", encoding="utf-8") as source:
    # Characters stripped from the stringified tag columns.
    remove = {"[": "", "'": "" , "]": "", '"': ""}
    reader = csv.reader(source, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        total_listeners, total_playcount, duration_lastfm = row[0], row[1], row[2]

        # Everything from the fourth column onwards is stringified and cut
        # at the first "https://" so that the URL is not picked up as a tag.
        # This could be solved by changing the way the lastfm spider writes
        # to disk, but until then these steps are necessary.
        raw_tags = replace_all(str(row[3:]).split("https://")[0], remove).split(", ")
        tags = [cleaned for cleaned in (piece.strip() for piece in raw_tags) if cleaned]

        # Translate tag names to ids, registering unseen tags on the fly.
        temp_tags = []
        for tag in tags:
            tag_id = all_tags.get(tag)
            if tag_id is None:
                tag_id = tag_counter
                all_tags[tag] = tag_id
                tag_counter += 1
            temp_tags.append(tag_id)

        track_id = row[-1]
        record = {"track_id": track_id,
                  "lastfm_index": lastfm_index,
                  "total_listeners": total_listeners,
                  "total_playcount": total_playcount,
                  "duration_lastfm": duration_lastfm,
                  "tags": temp_tags}
        lastfm[track_id] = record
        lastfm_index += 1

In [None]:
# Persist the tag name -> tag id mapping as JSON.
# (json is already imported in the notebook's first cell.)
with open("tag_ids.json", "w") as tag_file:
    json.dump(all_tags, tag_file)

In [None]:
# Reload the Last.fm metadata from combinedlastfm.json and rebuild the tag
# vocabulary from it. This replaces any `lastfm` / `all_tags` built by an
# earlier cell. Tag ids start at 1 and follow order of first appearance.
all_tags = {}
tag_counter = 1

with open("combinedlastfm.json", "r") as source:
    lastfm = json.load(source)

# Swap each track's tag names for their integer ids (in place).
for values in lastfm.values():
    track_tags = []
    for tag in values["tags"]:
        if tag in all_tags:
            track_tags.append(all_tags[tag])
            continue
        # First time this tag is seen: give it the next free id.
        all_tags[tag] = tag_counter
        track_tags.append(tag_counter)
        tag_counter += 1

    values["tags"] = track_tags

## Spotify

In [None]:
# Column names applied positionally to every row of data/spotify.csv.
keys = [
    "release_date", "spotify_id", "track_id",
    "acousticness", "danceability", "energy",
    "instrumentalness", "pitch_class", "liveness",
    "loudness", "mode", "speechiness", "tempo",
    "time_signature", "valence", "duration_spotify",
]

# spotify: track id -> row contents plus a 1-based running "spotify_index".
spotify = {}
spotify_index = 1
with open("data/spotify.csv", "r") as csvfile:
    for row in csv.reader(csvfile, delimiter=","):
        content = dict(zip(keys, row), spotify_index=spotify_index)
        spotify[content["track_id"]] = content
        spotify_index += 1

## Unique Tracks

We read the unique-tracks.csv file, which contains all unique tracks extracted from the listening events.

In [None]:
# Load every row of unique-tracks.csv and index it by "<artist><track>".
# The row position in the file doubles as the track id.
with open("../../Downloads/unique-tracks.csv", "r", encoding="utf-8") as csvfile:
    raw_tracks = list(csv.reader(csvfile))

unique_tracks = {}
for index, entry in enumerate(raw_tracks):
    track_id = index
    # Strip BOTH halves of the key. The listening-event cell calls
    # find_track_id() with stripped artist and track names, so a key built
    # from an unstripped artist name could never be matched.
    artist_name = entry[0].strip()
    track_name = entry[2].strip()
    key = "{}{}".format(artist_name, track_name)
    # Rows that collapse to the same key overwrite each other; the last
    # row wins (compare the "Raw" / "Unique" counts printed afterwards).
    unique_tracks[key] = {"track_id": track_id,
                          "track_name": track_name,
                          "artist_name": artist_name,
                          "key": key}

In [None]:
# Compare the number of rows read with the number of distinct keys kept.
print("Raw {} Unique {}".format(len(raw_tracks), len(unique_tracks)))

## Listening Events

In [None]:
def find_track_id(artist_name, track_name):
    """Look up the id of a track in ``unique_tracks``.

    Args:
        artist_name: Artist name, concatenated as-is into the lookup key.
        track_name: Track name, concatenated as-is into the lookup key.

    Returns:
        The track id as a string, or ``None`` when the artist/track pair
        is not present in ``unique_tracks``.
    """
    lookup_key = "{}{}".format(artist_name, track_name)
    if lookup_key not in unique_tracks:
        # Some tracks are missing per https://github.com/x775/SW10/issues
        return None
    return str(unique_tracks[lookup_key]["track_id"])
        
            
# Read the listening events of the 1K dataset (roughly 19 million events).
# Every event is stored under its user and annotated with the track_id of
# the song it refers to (None when the song cannot be resolved).
source = "../../Downloads/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv"

# users: numeric user id -> list of event dicts, in file order.
users = {}
ukeys = ["user_id", "timestamp", "artist_mbid",
         "artist_name", "mbid_track", "track_name"]
with open(source, "r", encoding="utf-8") as data:
    reader = csv.reader(data, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        # The first column is "<prefix>_<number>"; keep the numeric part.
        user_id = int(row[0].split("_")[1])

        event = dict(zip(ukeys, row))
        users.setdefault(user_id, []).append(event)

        # Resolve the track id from the stripped artist and track names.
        track_id = find_track_id(event["artist_name"].strip(),
                                 event["track_name"].strip())
        event["track_id"] = track_id or None

In [None]:
# Persist the per-user event lists as JSON.
# (json is already imported in the notebook's first cell.)
with open("processedusers.json", "w") as users_file:
    json.dump(users, users_file)

## Sessions

In [None]:
def find_average_duration(track_id):
    """Combine the Last.fm and Spotify durations of a track.

    A duration value is only used when it is truthy and longer than one
    character. When both sources provide a usable value the result is
    their mean, rounded up; when only one does, that value is returned.

    Args:
        track_id: Key into ``lastfm`` (and, if present, ``spotify``).

    Returns:
        The duration as an int, or ``None`` if neither source has a
        usable value.

    Raises:
        KeyError: If ``track_id`` is not present in ``lastfm``.
    """
    def _usable(raw):
        # Discard empty values and single-character values.
        return raw if raw and len(raw) > 1 else None

    lastfm_duration = _usable(lastfm[track_id]["duration_lastfm"])

    spotify_duration = None
    if track_id in spotify:
        spotify_duration = _usable(spotify[track_id]["duration_spotify"])

    if lastfm_duration and spotify_duration:
        return math.ceil((int(lastfm_duration) + int(spotify_duration)) / 2)

    only_source = lastfm_duration or spotify_duration
    return int(only_source) if only_source else None

In [None]:
# Rebinds the name `datetime` to the module (the first cell imported the
# class under the same name), so `datetime.timedelta` is available.
import datetime

# Maximum pause, in seconds, between two events of the same session
# (120 minutes).
cutoff = 2 * 60 * 60
# A track starting at most this many seconds after the previous one ended
# still counts as a continuous stream rather than a gap.
allowed_fade = 5

In [None]:
# Split every user's listening history into sessions.
#
# Outputs:
#   sessions:      session id -> list of
#                  (track_id, timestamp, percentage_played, track_missing, gap)
#                  tuples, where `gap` is the pause in seconds between the
#                  end of the previous track and the start of this one.
#   userssessions: position of the user in `users` (enumerate index, not
#                  the user id) -> list of that user's session ids.
#
# Session ids are global and increase in the order sessions are closed.
sessions = {}
SESSION_ID = 0
total = len(users)
userssessions = {}
for index, user in enumerate(users):
    prev_gap = 0

    # Obtain events in chronological order.
    # NOTE(review): assumes the source file lists each user's events
    # newest-first, so that reversing yields oldest-first -- confirm.
    user_events_r = list(reversed(users[user]))

    # Grab the timestamp from the first event.
    prev_timestamp = pd.to_datetime(user_events_r[0]["timestamp"])

    # Compute the duration of the first track.
    try:
        average_duration = find_average_duration(str(user_events_r[0]["track_id"]))
        prev_track_missing = 0
    except KeyError:
        # If we receive a KeyError when attempting to find the duration, it
        # means that the song in question does not exist as outlined here:
        # https://github.com/x775/SW10/issues. We thus set missing = 1.
        average_duration = 0
        prev_track_missing = 1

    # Durations are divided by 1000 to obtain seconds. If average_duration
    # is None (no usable duration), we set the duration to 0.
    prev_track_duration = round(average_duration / 1000) if average_duration else 0

    # Prepare the first session of the user. Entries start out as
    # (track_id, timestamp) and are completed once the next event is seen.
    current_session = [(user_events_r[0]["track_id"],
                        user_events_r[0]["timestamp"])]

    user_sessions = []
    for event in user_events_r[1:]:
        gap = 0
        duration = 0

        try:
            duration = find_average_duration(str(event["track_id"]))
            current_track_missing = 0
        except KeyError:
            current_track_missing = 1

        timestamp = pd.to_datetime(event["timestamp"])

        # If the current song has been played after a break of more than the
        # duration of the previous song plus the specified cutoff (default
        # 7200 seconds / 120 minutes), we consider the song part of a new
        # session. pd.Timedelta is used so this cell does not depend on
        # which object the name `datetime` is currently bound to.
        session_deadline = prev_timestamp + pd.Timedelta(seconds=prev_track_duration + cutoff)
        if timestamp > session_deadline:
            # We assume that the previous song was completed in full.
            current_session[-1] = current_session[-1] + (1.0, prev_track_missing, prev_gap)
            sessions[SESSION_ID] = current_session
            user_sessions.append(SESSION_ID)
            # We prepare a new session.
            current_session = [(event["track_id"], event["timestamp"])]
            SESSION_ID += 1
        else:
            # Seconds between current and previous track starting.
            # total_seconds() is used instead of .seconds, which drops the
            # days component and maps negative deltas to values just below
            # 86400. Out-of-order events are clamped to a difference of 0.
            difference = max(int((timestamp - prev_timestamp).total_seconds()), 0)

            if prev_track_duration > 0:
                if difference >= prev_track_duration:
                    # The previous track had time to finish.
                    percentage_played = 1.0
                    abs_seconds = difference - prev_track_duration
                    if abs_seconds > allowed_fade:
                        # In case of a gap (i.e. pause and/or skipped track(s));
                        # anything within allowed_fade is a continuous stream.
                        gap = abs_seconds
                else:
                    # In case of track not played in full. The ratio is
                    # below 1 here because difference < prev_track_duration.
                    percentage_played = round(difference / prev_track_duration, 2)
            else:
                # Unknown duration: treat the track as fully played and the
                # whole interval as gap.
                percentage_played = 1.0
                gap = difference

            # Update the percentage_played of the previous entry.
            current_session[-1] = current_session[-1] + (percentage_played, prev_track_missing, prev_gap,)
            # Append the current entry to the current session.
            current_session.append((event["track_id"], event["timestamp"]),)

        prev_gap = gap
        prev_timestamp = timestamp
        prev_track_duration = round(duration / 1000) if duration else 0
        prev_track_missing = current_track_missing

    # Close the user's final session. Without this step the last session of
    # every user would never be stored and its last entry would stay an
    # incomplete 2-tuple. As with any session end, the last track is assumed
    # to have been played in full.
    current_session[-1] = current_session[-1] + (1.0, prev_track_missing, prev_gap)
    sessions[SESSION_ID] = current_session
    user_sessions.append(SESSION_ID)
    SESSION_ID += 1

    userssessions[index] = user_sessions
    print("Completed {}/{}".format(index, total))

In [None]:
# Persist the session id -> session entries mapping as JSON.
with open("sessions_28march.json", "w") as sessions_file:
    json.dump(sessions, sessions_file)

In [None]:
# Map every track id to the list of tags stored for it in `lastfm`.
songstags = {value["track_id"]: value["tags"] for value in lastfm.values()}

In [None]:
# Persist the track id -> tags mapping as JSON.
with open("track_ids_and_tag_ids.json", "w") as songstags_file:
    json.dump(songstags, songstags_file)

In [None]:
# Persist the per-user lists of session ids as JSON.
with open("user_sessions.json", "w") as user_sessions_file:
    json.dump(userssessions, user_sessions_file)

In [None]:
# Load the sessions stored in data/sessions_complete.json.
# (json is already imported in the notebook's first cell.)
with open("data/sessions_complete.json", "r") as source:
    complete_sessions = json.load(source)

In [None]:
# Debug aid: report every session entry that contains the value 770512,
# together with the key of the session it belongs to.
# NOTE(review): the membership test uses the integer 770512; if ids in
# sessions_complete.json are stored as strings this never matches -- confirm.
for key, session in complete_sessions.items():
    for entry in session:
        if 770512 not in entry:
            continue
        print("found something")
        print(entry)
        print(key)

In [None]:
# Peek at the first ten user ids.
list(users)[:10]