## Preparation

We start by loading our generated .txt files. 

We are interested in also extracting timestamps such that we can compute skips and gaps.

In [2]:
import json
import pandas as pd
from pathlib import Path

In [3]:
raw_training = []
raw_testing = []
for path in Path("data").rglob("*.txt"):
    if "train" in str(path): 
        raw_training.append(path) 
    else:
        raw_testing.append(path)

In [3]:
raw_training

[PosixPath('data/30music-200ks_train_full.4.txt'),
 PosixPath('data/30music-200ks_train_full.3.txt'),
 PosixPath('data/30music-200ks_train_full.2.txt'),
 PosixPath('data/30music-200ks_train_full.0.txt'),
 PosixPath('data/30music-200ks_train_full.1.txt')]

In [4]:
raw_testing

[PosixPath('data/30music-200ks_test.4.txt'),
 PosixPath('data/30music-200ks_test.3.txt'),
 PosixPath('data/30music-200ks_test.2.txt'),
 PosixPath('data/30music-200ks_test.0.txt'),
 PosixPath('data/30music-200ks_test.1.txt')]

We read the raw .idomaar files.

This allows us to match tracks and tags for later.

In [3]:
all_tracks = {}
with open("/Users/jo/Documents/ThirtyMusic/entities/tracks.idomaar", "r", encoding="utf-8") as source:
    for i, line in enumerate(source):
        _, track_id, _, response, extended = line.split("\t")
        response = json.loads(response)
        extended = json.loads(extended)
        try:
            tag_ids = [tag["id"] for tag in extended["tags"]]
        except TypeError:
            tag_ids = []
        all_tracks[track_id] = {"track_id": track_id,
                                "duration": response["duration"],
                                "artist_id": extended["artists"][0]["id"],
                                "playcount": response["playcount"],
                                "tag_ids": tag_ids
                               }

In [4]:
all_tags = []
with open("/Users/jo/Documents/ThirtyMusic/entities/tags.idomaar", "r", encoding="utf-8") as source:
    for i, line in enumerate(source):
        all_tags.append(line.split("/t")[1])

In [7]:
tracks = []
for track, data in all_tracks.items():
    tracks.append(int(data["track_id"]))

print(len(set(tracks)))

4519105


We then proceed to check number of users in the respective training and test files.

In [8]:
for i in range(5):
    print("Testing for", i)
    with open(raw_training[i], "r") as source:
        train_lines = source.readlines()[1:]
    with open(raw_testing[i], "r") as source:
        test_lines = source.readlines()[1:]

    # Extract all train items.
    train_users = set()
    train_items = set()
    train_artists = set()
    for line in train_lines:
        user_id, session_id, item_id, timestamp, artist_id = line.split("\t")
        train_users.add(user_id)
        train_items.add(item_id)
        train_artists.add(artist_id)

    # Extract all test items.
    test_users = set()
    test_items = set()
    test_artists = set()
    for line in test_lines:
        user_id, session_id, item_id, timestamp, artist_id = line.split("\t")
        test_users.add(user_id)
        test_items.add(item_id)
        test_artists.add(artist_id)
    
    print("Users")
    print(len(train_users))
    print(len(test_users))
    print("Intersection: {} ({})".format(len(train_users.intersection(test_users)), 
                                         100 * (len(train_users.intersection(test_users)) / len(test_users))))
    print("Items")
    print(len(train_items))
    print(len(test_items))
    print("Intersection: {} ({})".format(len(train_items.intersection(test_items)), 
                                         100 * (len(train_items.intersection(test_items)) / len(test_items))))
    print("Artists")
    print(len(train_artists))
    print(len(test_artists))
    print("Intersection: {} ({})".format(len(train_artists.intersection(test_artists)), 
                                         100 * (len(train_artists.intersection(test_artists)) / len(test_artists))))

    print("\n")

Testing for 0
Users
11115
1544
Intersection: 889 (57.57772020725389)
Items
218665
20318
Intersection: 20318 (100.0)
Artists
33063
5040
Intersection: 5040 (100.0)


Testing for 1
Users
9334
973
Intersection: 582 (59.815005138746145)
Items
207245
15146
Intersection: 15146 (100.0)
Artists
31529
3991
Intersection: 3991 (100.0)


Testing for 2
Users
9025
852
Intersection: 516 (60.56338028169014)
Items
200469
14472
Intersection: 14472 (100.0)
Artists
30353
3743
Intersection: 3743 (100.0)


Testing for 3
Users
9580
932
Intersection: 578 (62.01716738197425)
Items
216054
16305
Intersection: 16305 (100.0)
Artists
31366
4116
Intersection: 4116 (100.0)


Testing for 4
Users
9337
862
Intersection: 535 (62.064965197215784)
Items
210736
14026
Intersection: 14026 (100.0)
Artists
30947
3586
Intersection: 3586 (100.0)




We then proceeed to create our actual session files. This is a combination of the existing .idomaar data preprocessing file and the combine.ipynb file from the augmented lastfm1k-set.

In [4]:
import math
from datetime import datetime

In [45]:
def return_duration(track_id):
    try:
        duration = all_tracks[str(track_id)]["duration"]
        if duration and duration > 0:
            return round(duration / 1000)
        return 0
    except KeyError:
        print("Encountered keyerror with", track_id)
        return 0

def return_timestamp(timestamp):
    return datetime.utcfromtimestamp(int(timestamp))\
                   .strftime('%Y-%m-%d %H:%M:%S')


allowed_fade = 5
all_files = [(raw_training, "training"), (raw_testing, "testing")]
for files, version in all_files:
    # We first iterate through the training files, then testing.
    for file_index, file_name in enumerate(files):
        # We start by creating a users-dictionary of all users and their
        # associated listening events in the current file. Note that the
        # train/test split has already been compute through session-rec.
        uids_sids = {}
        if "train" in str(file_name):
            file_index = str(file_name).split("full.")[1].replace(".txt","")
        else:
            file_index = str(file_name).split("test.")[1].replace(".txt","")

        with open(file_name, "r") as source:
            for line in source.readlines()[1:]:
                user_id, session_id, _, _, _ = [int(e) for e in line.split("\t")]
                if user_id in uids_sids:
                    if session_id not in uids_sids[user_id]:
                        uids_sids[user_id].append(session_id)
                else:
                    uids_sids[user_id] = [session_id]

        with open("data/user_ids_and_session_ids_{}_{}.json"\
                  .format(version, file_index), "w") as out:
            json.dump(uids_sids, out)
            print("Created user_id and session_id dictionary for", file_name)    
        
        # We continue by extracting all sessions and their events.
        raw_items = set()
        sessions = {}
        with open(file_name, "r") as source:
            for line in source.readlines()[1:]:
                _, session_id, item_id, timestamp, _ = [int(e) for e in line.split("\t")]
                raw_items.add(item_id)
                if session_id in sessions:
                    if (item_id, timestamp) not in sessions[session_id]:
                        sessions[session_id].append((item_id, timestamp))
                else:
                    sessions[session_id] = [(item_id, timestamp)]

        # We now proceed to separating the different sessions
        # and computing the internal skips and gaps using the
        # duration values obtained from the .idomaar files.
        updated_sessions = {}
        for session_id, session_values in sessions.items():
            prev_gap = 0
            prev_track_duration = return_duration(session_values[0][0])
            prev_timestamp = pd.to_datetime(return_timestamp(session_values[0][1]))
            prev_track_missing = 0  # to match lastfm1k set.
            
            current_session = [(session_values[0][0], session_values[0][1])]
            
            for event in session_values[1:]:
                track_id = int(event[0])
                timestamp = int(event[1])

                gap = 0
                duration = return_duration(track_id)
                current_track_missing = 0 # to match lastfm1k set.
                timestamp = pd.to_datetime(return_timestamp(timestamp))
                # Seconds between current and previous track starting.
                difference = (timestamp - prev_timestamp).seconds

                if prev_track_duration > 0:
                    if difference >= prev_track_duration:
                        abs_seconds = difference - prev_track_duration
                        if abs_seconds <= allowed_fade:
                            # In case of continuous stream.
                            percentage_played = 1.0
                        elif abs_seconds >= allowed_fade:
                            # In case of a gap (i.e. pause and/or skipped track(s))
                            percentage_played = 1.0
                            gap = abs_seconds
                    else:
                        # In case of track not played in full.
                        percentage_played = round(difference / prev_track_duration, 2)
                        if percentage_played > 1:
                            percentage_played = 1.0
                else:
                    percentage_played = 1.0
                    gap = 0

                # Update the percentage_played of the previous entry.
                current_session[-1] = current_session[-1] + (percentage_played, 
                                                             prev_track_missing, 
                                                             prev_gap,)
                # Append the current entry to the current session.
                current_session.append((event[0], event[1]),)

                # Update values
                prev_gap = gap
                prev_timestamp = timestamp
                prev_track_duration = round(duration / 1000) if duration else 0
                prev_track_missing = current_track_missing

            # Add the last song, assume it has been completed in full and has no gap.
            current_session[-1] = current_session[-1] + (1.0, prev_track_missing, prev_gap,)
            
            # Add the updated session information.
            updated_sessions[session_id] = current_session
        
        with open("data/augmented_sessions_{}_{}.json"\
                  .format(version, file_index), "w") as out:
            json.dump(updated_sessions, out)
            print("Finished {} set {}".format(file_name, file_index))

Created user_id and session_id dictionary for data/30music-200ks_train_full.4.txt
Created user_id and session_id dictionary for data/30music-200ks_train_full.3.txt
Created user_id and session_id dictionary for data/30music-200ks_train_full.2.txt
Created user_id and session_id dictionary for data/30music-200ks_train_full.0.txt
Created user_id and session_id dictionary for data/30music-200ks_train_full.1.txt
Created user_id and session_id dictionary for data/30music-200ks_test.4.txt
Created user_id and session_id dictionary for data/30music-200ks_test.3.txt
Created user_id and session_id dictionary for data/30music-200ks_test.2.txt
Created user_id and session_id dictionary for data/30music-200ks_test.0.txt
Created user_id and session_id dictionary for data/30music-200ks_test.1.txt


## Verification

Before proceeding, we check that we still have the same intersection values.

In [44]:
PATH = "data/augmented_sessions_"
for i in range(5):
    with open("{}training_{}.json".format(PATH, i), "r") as source:
        processed_training = json.load(source)

    with open("{}testing_{}.json".format(PATH, i), "r") as source:
        processed_testing = json.load(source)

    processed_training_items = set()
    for _, values in processed_training.items():
        for entry in values:
            processed_training_items.add(entry[0])

    processed_testing_items = set()
    for _, values in processed_testing.items():
        for entry in values:
            processed_testing_items.add(entry[0])

    print("Parsed ", i)
    print(len(processed_training_items))
    print(len(processed_testing_items))
    print("Intersection: {} ({})"\
          .format(len(processed_training_items.intersection(processed_testing_items)), 
                  100 * (len(processed_training_items.intersection(processed_testing_items))\
                  / len(processed_testing_items))))

Parsed  0
216054
16305
Intersection: 16305 (100.0)
Parsed  1
210736
14026
Intersection: 14026 (100.0)
Parsed  2
200469
14472
Intersection: 14472 (100.0)
Parsed  3
207245
15146
Intersection: 15146 (100.0)
Parsed  4
218665
20318
Intersection: 20318 (100.0)


## Conversion

At this point we have data of format


```
{"5730": [[1402838, 1418824941, 1.0, 0, 0], ... 
```

We are interested in achieving the following files:

* sessions.json 

`id: {"track_idxs": [...], "tags_idxs": [[...], [...],...], "skips": [...]})`

* tags.json
* tracks.json
* users.json

Additionally, we want to reset the ids such that they are in proper order.


In [5]:
prepared_training = []
prepared_testing = []
for path in Path("data/intermediatejson").rglob("*.json"):
    if "augmented" not in str(path):
        continue
    if "train" in str(path): 
        prepared_training.append(path) 
    else:
        prepared_testing.append(path)

In [5]:
prepared_testing = ['data/intermediatejson/augmented_sessions_testing_0.json',
                    'data/intermediatejson/augmented_sessions_testing_1.json',
                    'data/intermediatejson/augmented_sessions_testing_2.json',
                    'data/intermediatejson/augmented_sessions_testing_3.json',
                    'data/intermediatejson/augmented_sessions_testing_4.json']

In [6]:
prepared_training = ['data/intermediatejson/augmented_sessions_training_0.json',
                     'data/intermediatejson/augmented_sessions_training_1.json',
                     'data/intermediatejson/augmented_sessions_training_2.json',
                     'data/intermediatejson/augmented_sessions_training_3.json',
                     'data/intermediatejson/augmented_sessions_training_4.json']

In [7]:
updated = []
all_prepared = prepared_training + prepared_testing
for prepared in all_prepared:
    version = str(prepared).split("_sessions_")[1].split(".")[0]
    with open(prepared, "r") as source:
        data = json.load(source)
    
    # We start by extracting track_idxs, we then add tags_idxs
    # before combining all skip values. At this point we have
    # everything in the same dictionary. 
    updated_dict = {}
    for session_id, values in data.items():
        track_idxs = [entry[0] for entry in values]
        tags_idxs = []
        for track in track_idxs:
            track_tags = all_tracks[str(track)]["tag_ids"]
            track_tags += [0] * (5 - len(track_tags))
            tags_idxs.append(track_tags)
        skips = [entry[2] for entry in values]
        updated_dict[session_id] = {"raw_values": values,
                                    "track_idxs": track_idxs,
                                    "tags_idxs": tags_idxs,
                                    "skips": skips}
        
    # Add the updated dictionary to a temporary list.
    updated.append((version, updated_dict))
    print("Added ", version)

Added  training_0
Added  training_1
Added  training_2
Added  training_3
Added  training_4
Added  testing_0
Added  testing_1
Added  testing_2
Added  testing_3
Added  testing_4


In [9]:
processed_training = updated[:5]
procsssed_testing = updated[5:]

for i in range(5):
    p_training = processed_training[i][1]
    p_testing = procsssed_testing[i][1]

    train_items = set()
    for _, values in p_training.items():
        for track in values["track_idxs"]:
            train_items.add(track)

    test_items = set()
    for _, values in p_testing.items():
        for track in values["track_idxs"]:
            test_items.add(track)
    
    print("Items")
    print(len(train_items))
    print(len(test_items))
    print("Intersection: {} ({})".format(len(train_items.intersection(test_items)), 
                                         100 * (len(train_items.intersection(test_items)) / len(test_items))))

Items
216054
16305
Intersection: 16305 (100.0)
Items
210736
14026
Intersection: 14026 (100.0)
Items
200469
14472
Intersection: 14472 (100.0)
Items
207245
15146
Intersection: 15146 (100.0)
Items
218665
20318
Intersection: 20318 (100.0)


In [8]:
for i in range(5):
    matched = []
    refreshed_ids = []
    for entry in updated:
        if str(i) in entry[0]:
            matched.append(entry)
    
    reordered_tracks = {}
    reordered_tracks_id = 1
    reordered_tags = {}
    reordered_tags_id = 1
    for match in matched:
        if "training" in match[0]:
            for key, value in match[1].items():
                # Add key to reordered sessions.
                #if key not in reordered_sessions:
                #    reordered_sessions[key] = reordered_sessions_id
                #    reordered_sessions_id += 1

                # Iterate all tracks associated with current session.
                for track in value["track_idxs"]:
                    if track not in reordered_tracks:
                        reordered_tracks[track] = reordered_tracks_id
                        reordered_tracks_id += 1
                        
                # Iterate all tags associated with current session.
                for tags in value["tags_idxs"]:
                    for tag in tags:
                        # We wish to skip 0 as this is our padding.
                        if tag not in reordered_tags and tag != 0:
                            reordered_tags[tag] = reordered_tags_id
                            reordered_tags_id += 1
                            
            print("Extracted IDs from ", match[0]) 
    
    # We now reorder the IDs in both the training and testing splits.
    for match in matched:
        new_data = {}
        for key, value in match[1].items():
            # We grab the new ID for tracks: All tracks from test exist in train.
            new_tracks = [reordered_tracks[track] for track in value["track_idxs"]]
            new_tags = []
            for tags in value["tags_idxs"]:
                new_tags.append([tag if tag == 0 else reordered_tags[tag] 
                                for tag in tags])

            new_skips = []
            for skip in value["skips"]:
                if skip > 0.9:
                    new_skips.append(1)
                else:
                    new_skips.append(2)

            new_data[int(key)] = {"track_idxs": new_tracks,
                                  "tags_idxs": new_tags,
                                  "skips": new_skips}
        
        with open("sessions_30music_{}.json".format(match[0]), "w") as out:
            json.dump(new_data, out)
            print("Refreshed for ", match[0])
        
        refreshed_ids.append((match[0], new_data))

    for version, values in refreshed_ids:
        all_tags = set()
        for key, entry in values.items():
            for tags in entry["tags_idxs"]:
                for tag in tags:
                    all_tags.add(tag)
        print(version, len(all_tags))
    
    for version, values in refreshed_ids:
        all_tracks = set()
        for key, entry in values.items():
            for track in entry["track_idxs"]:
                all_tracks.add(track)
        print(version, len(all_tracks))


Extracted IDs from  training_0
Refreshed for  training_0
Refreshed for  testing_0
training_0 54170
testing_0 7620
training_0 216054
testing_0 16305
Extracted IDs from  training_1
Refreshed for  training_1
Refreshed for  testing_1
training_1 52441
testing_1 6898
training_1 210736
testing_1 14026
Extracted IDs from  training_2
Refreshed for  training_2
Refreshed for  testing_2
training_2 50504
testing_2 6682
training_2 200469
testing_2 14472
Extracted IDs from  training_3
Refreshed for  training_3
Refreshed for  testing_3
training_3 51153
testing_3 7108
training_3 207245
testing_3 15146
Extracted IDs from  training_4
Refreshed for  training_4
Refreshed for  testing_4
training_4 52616
testing_4 8607
training_4 218665
testing_4 20318


In [10]:
reordered_tracks = {}
reordered_tracks_id = 1
reordered_tags = {}
reordered_tags_id = 1
#reordered_sessions = {}
#reordered_sessions_id = 1
# We only read in for training as the training data holds
# 100% of the tracks and tags the testing data does. 
for entry in updated:
    if "training" in entry[0]:
        for key, value in entry[1].items():
            # Add key to reordered sessions.
            #if key not in reordered_sessions:
            #    reordered_sessions[key] = reordered_sessions_id
            #    reordered_sessions_id += 1

            # Iterate all tracks associated with current session.
            for track in value["track_idxs"]:
                if track not in reordered_tracks:
                    reordered_tracks[track] = reordered_tracks_id
                    reordered_tracks_id += 1
                    
            # Iterate all tags associated with current session.
            for tags in value["tags_idxs"]:
                for tag in tags:
                    # We wish to skip 0 as this is our padding.
                    if tag not in reordered_tags and tag != 0:
                        reordered_tags[tag] = reordered_tags_id
                        reordered_tags_id += 1
                        
        print("Extracted IDs from ", entry[0])

Extracted IDs from  training_0
Extracted IDs from  training_1
Extracted IDs from  training_2
Extracted IDs from  training_3
Extracted IDs from  training_4


In [13]:
refreshed_ids = []
for entry in updated:
    new_data = {}
    for key, value in entry[1].items():
        # We grab the new ID for tracks: All tracks from test exist in train.
        new_tracks = [reordered_tracks[track] for track in value["track_idxs"]]
        new_tags = []
        for tags in value["tags_idxs"]:
            new_tags.append([tag if tag == 0 else reordered_tags[tag] 
                             for tag in tags])

        new_skips = []
        for skip in value["skips"]:
            if skip > 0.9:
                new_skips.append(1)
            else:
                new_skips.append(2)

        new_data[int(key)] = {"track_idxs": new_tracks,
                              "tags_idxs": new_tags,
                              "skips": new_skips}
    
    with open("sessions_30music_{}.json".format(entry[0]), "w") as out:
        json.dump(new_data, out)
        print("Refreshed for ", entry[0])
    
    refreshed_ids.append((entry[0], new_data))

Refreshed for  training_0
Refreshed for  training_1
Refreshed for  training_2
Refreshed for  training_3
Refreshed for  training_4
Refreshed for  testing_0
Refreshed for  testing_1
Refreshed for  testing_2
Refreshed for  testing_3
Refreshed for  testing_4


In [16]:
for version, values in refreshed_ids:
    all_tags = set()
    for key, entry in values.items():
        for tags in entry["tags_idxs"]:
            for tag in tags:
                all_tags.add(tag)

    print(version, len(all_tags))

training_0 54170
testing_0 54170
training_1 52441
testing_1 52441
training_2 50504
testing_2 50504
training_3 51153
testing_3 51153
training_4 52616
testing_4 52616


In [17]:
for version, values in refreshed_ids:
    all_tracks = set()
    for key, entry in values.items():
        for track in entry["track_idxs"]:
            all_tracks.add(track)

    print(version, len(all_tracks))


training_0 216054
testing_0 216054
training_1 210736
testing_1 210736
training_2 200469
testing_2 200469
training_3 207245
testing_3 207245
training_4 218665
testing_4 218665


In [32]:
prepared_training = []
prepared_testing = []
for path in Path("data").rglob("user_ids_*.json"):
    new_format = {}
    with open(path, "r") as source:
        data = json.load(source)
    
    for key, values in data.items():
        new_format[key] = {"session_subset": values}
    
    with open(path, "w") as out:
        json.dump(new_format, out)