In [1]:
import csv
import json
import pandas as pd
from pathlib import Path

## Preparation
We start by loading our generated .txt files.

In [2]:
raw_training = []
raw_testing = []
for path in Path("data").rglob("*.txt"):
    if "train" in str(path): 
        raw_training.append(path) 
    else:
        raw_testing.append(path)

In [3]:
raw_training

[PosixPath('data/30music-200ks_train_full.4.txt'),
 PosixPath('data/30music-200ks_train_full.3.txt'),
 PosixPath('data/30music-200ks_train_full.2.txt'),
 PosixPath('data/30music-200ks_train_full.0.txt'),
 PosixPath('data/30music-200ks_train_full.1.txt')]

In [4]:
raw_testing

[PosixPath('data/30music-200ks_test.4.txt'),
 PosixPath('data/30music-200ks_test.3.txt'),
 PosixPath('data/30music-200ks_test.2.txt'),
 PosixPath('data/30music-200ks_test.0.txt'),
 PosixPath('data/30music-200ks_test.1.txt')]

We then proceed to loading the raw .idomaar files.

This way, we can match the tracks and grab tags for later.


In [6]:
all_tracks = {}
#with open("H:\\ThirtyMusic\\entities\\tracks.idomaar", "r", encoding="utf-8") as source:
with open("/Users/jo/Documents/ThirtyMusic/entities/tracks.idomaar", "r", encoding="utf-8") as source:
    for i, line in enumerate(source):
        _, track_id, _, response, extended = line.split("\t")
        response = json.loads(response)
        extended = json.loads(extended)
        try:
            tag_ids = [tag["id"] for tag in extended["tags"]]
        except TypeError:
            tag_ids = []
        all_tracks[track_id] = {"track_id": track_id,
                                "duration": response["duration"],
                                "artist_id": extended["artists"][0]["id"],
                                "playcount": response["playcount"],
                                "tag_ids": tag_ids
                               }

In [7]:
all_tags = []
#with open("H:\\ThirtyMusic\\entities\\tags.idomaar", "r", encoding="utf-8") as source:
with open("/Users/jo/Documents/ThirtyMusic/entities/tags.idomaar", "r", encoding="utf-8") as source:
    for i, line in enumerate(source):
        all_tags.append(line.split("/t")[1])

print(len(set(all_tags)))

256278


In [9]:
tracks = []
for track, data in all_tracks.items():
    tracks.append(int(data["track_id"]))

print(len(set(tracks)))

4519105


We then proceed to check number of users in the respective training and test files.

In [8]:
for i in range(5):
    print("Testing for", i)
    with open(raw_training[i], "r") as source:
        train_lines = source.readlines()[1:]
    with open(raw_testing[i], "r") as source:
        test_lines = source.readlines()[1:]

    # Extract all train items.
    train_users = set()
    train_items = set()
    train_artists = set()
    for line in train_lines:
        user_id, session_id, item_id, timestamp, artist_id = line.split("\t")
        train_users.add(user_id)
        train_items.add(item_id)
        train_artists.add(artist_id)

    # Extract all test items.
    test_users = set()
    test_items = set()
    test_artists = set()
    for line in test_lines:
        user_id, session_id, item_id, timestamp, artist_id = line.split("\t")
        test_users.add(user_id)
        test_items.add(item_id)
        test_artists.add(artist_id)
    
    print("Users")
    print(len(train_users))
    print(len(test_users))
    print("Intersection: {} ({})".format(len(train_users.intersection(test_users)), 
                                         100 * (len(train_users.intersection(test_users)) / len(test_users))))
    print("Items")
    print(len(train_items))
    print(len(test_items))
    print("Intersection: {} ({})".format(len(train_items.intersection(test_items)), 
                                         100 * (len(train_items.intersection(test_items)) / len(test_items))))
    print("Artists")
    print(len(train_artists))
    print(len(test_artists))
    print("Intersection: {} ({})".format(len(train_artists.intersection(test_artists)), 
                                         100 * (len(train_artists.intersection(test_artists)) / len(test_artists))))

    print("\n")

Testing for 0
Users
9580
932
Intersection: 578 (62.01716738197425)
Items
216054
16305
Intersection: 16305 (100.0)
Artists
31366
4116
Intersection: 4116 (100.0)


Testing for 1
Users
9337
862
Intersection: 535 (62.064965197215784)
Items
210736
14026
Intersection: 14026 (100.0)
Artists
30947
3586
Intersection: 3586 (100.0)


Testing for 2
Users
9025
852
Intersection: 516 (60.56338028169014)
Items
200469
14472
Intersection: 14472 (100.0)
Artists
30353
3743
Intersection: 3743 (100.0)


Testing for 3
Users
9334
973
Intersection: 582 (59.815005138746145)
Items
207245
15146
Intersection: 15146 (100.0)
Artists
31529
3991
Intersection: 3991 (100.0)


Testing for 4
Users
11115
1544
Intersection: 889 (57.57772020725389)
Items
218665
20318
Intersection: 20318 (100.0)
Artists
33063
5040
Intersection: 5040 (100.0)




In other words: Whilst not all test users are in the training sets (about 60% are), all songs and artists are. 

In [9]:
options = [(raw_training, "training"), (raw_testing, "testing")]

for files, name in options:
    for index, split in enumerate(files):
        sessions = {}
        with open(split, "r") as source:
            lines = source.readlines()[1:]
            for line in lines:
                user_id, session_id, item_id, timestamp, artist_id = line.split("\t")
                track_tags = all_tracks[item_id]["tag_ids"]
                track_tags += [0] * (5 - len(track_tags))
                if session_id in sessions:
                    sessions[session_id]["track_idxs"].append(item_id)
                    sessions[session_id]["tags_idxs"].append(track_tags)
                else:
                    sessions[session_id] = {"track_idxs": [item_id],
                                            "tags_idxs": [track_tags],
                                            "user_id": user_id}
                    
        with open("data/{}_sessions_{}.json".format(name, index), "w") as out:
            json.dump(sessions, out)
        
        print("Dumped {} of {}".format(index, name))

Dumped 0 of training
Dumped 1 of training
Dumped 2 of training
Dumped 3 of training
Dumped 4 of training
Dumped 0 of testing
Dumped 1 of testing
Dumped 2 of testing
Dumped 3 of testing
Dumped 4 of testing


In [10]:
for i in range(5):
    with open("data/training_sessions_{}.json".format(i), "r") as source:
        processed_training = json.load(source)

    with open("data/testing_sessions_{}.json".format(i), "r") as source:
        processed_testing = json.load(source)

    processed_training_users = set()
    for _, value in processed_training.items():
        processed_training_users.add(value["user_id"])

    processed_testing_users = set()
    for _, value in processed_testing.items():
        processed_testing_users.add(value["user_id"])

    print("Parsed ", i)
    print(len(processed_training_users))
    print(len(processed_testing_users))
    print("Intersection: {} ({})".format(len(processed_training_users.intersection(processed_testing_users)), 
                                            100 * (len(processed_training_users.intersection(processed_testing_users)) 
                                                / len(processed_testing_users))))

Parsed  0
9580
932
Intersection: 578 (62.01716738197425)
Parsed  1
9337
862
Intersection: 535 (62.064965197215784)
Parsed  2
9025
852
Intersection: 516 (60.56338028169014)
Parsed  3
9334
973
Intersection: 582 (59.815005138746145)
Parsed  4
11115
1544
Intersection: 889 (57.57772020725389)


At this point, the %s match. We assume everything is in order.

## Conversion

We proceed to convert to the format expected by STABR.

In [13]:
prepared_training = []
prepared_testing = []
for path in Path("data").rglob("*.json"):
    if "updated" in str(path):
        continue
    if "train" in str(path): 
        prepared_training.append(path) 
    else:
        prepared_testing.append(path)

In [14]:
prepared_training

[PosixPath('data/training_sessions_2.json'),
 PosixPath('data/training_sessions_3.json'),
 PosixPath('data/training_sessions_4.json'),
 PosixPath('data/training_sessions_0.json'),
 PosixPath('data/training_sessions_1.json')]

In [15]:
prepared_testing

[PosixPath('data/testing_sessions_4.json'),
 PosixPath('data/testing_sessions_2.json'),
 PosixPath('data/testing_sessions_3.json'),
 PosixPath('data/testing_sessions_0.json'),
 PosixPath('data/testing_sessions_1.json')]

In [16]:
with open(prepared_testing[0], "r") as source:
    data = json.load(source)

print(data)

, 205245, 11242], [190991, 11056, 72354, 18645, 112911], [190991, 11242, 115355, 11056, 18645], [190991, 11056, 103394, 72354, 190986], [190991, 11056, 205245, 115355, 18645], [190991, 205245, 18645, 11242, 115355], [190991, 205245, 115355, 14177, 72354], [190991, 249028, 72354, 11242, 205245], [115684, 111866, 115355, 27136, 1831], [191251, 70618, 116047, 1831, 444], [115355, 115752, 11056, 159268, 205245], [194413, 194264, 0, 0, 0], [198998, 107401, 11234, 11154, 259294], [154646, 251199, 0, 0, 0], [444, 0, 0, 0, 0], [190255, 144283, 108691, 205245, 83064], [218701, 115532, 84597, 21937, 170061]], 'user_id': '30447'}, '2724133': {'track_idxs': ['1513374', '235939', '2072901', '1802726', '1041125'], 'tags_idxs': [[189631, 35060, 44217, 235545, 235543], [144192, 189631, 151816, 81211, 103281], [80314, 223168, 188123, 1877, 265130], [189631, 26715, 81223, 89467, 1854], [226723, 145110, 156167, 144192, 444]], 'user_id': '30562'}, '2725822': {'track_idxs': ['2380514', '2320227', '2320010'

In [14]:
reordered_tracks = {}
reordered_tracks_id = 1
reordered_tags = {}
reordered_tags_id = 1
reordered_sessions = {}
reordered_sessions_id = 1
# We only read in for training as the training data holds
# 100% of the tracks and tags the testing data does. 
for entry in prepared_training:
    with open(entry, "r") as source:
        data = json.load(source)
        for key, value in data.items():
            # Add key to reordered sessions.
            if key not in reordered_sessions:
                reordered_sessions[key] = reordered_sessions_id
                reordered_sessions_id += 1

            # Iterate all tracks associated with current session.
            for track in value["track_idxs"]:
                if track not in reordered_tracks:
                    reordered_tracks[track] = reordered_tracks_id
                    reordered_tracks_id += 1
                    
            # Iterate all tags associated with current session.
            for tags in value["tags_idxs"]:
                for tag in tags:
                    # We wish to skip 0 as this is our padding.
                    if tag not in reordered_tags and tag != 0:
                        reordered_tags[tag] = reordered_tags_id
                        reordered_tags_id += 1

In [16]:
prepared_options = [(prepared_training, "training"), 
                    (prepared_testing, "testing")]

for files, name in prepared_options:
    for index, entry in enumerate(files):
        with open(entry, "r") as source:
            data = json.load(source)
            new_data = {}
            for key, value in data.items():
                # We grab the new ID for tracks: All tracks from test exist in train.
                new_tracks = [reordered_tracks[track] for track in value["track_idxs"]]
                new_tags = []
                for tags in value["tags_idxs"]:
                    new_tags.append([tag if tag == 0 else reordered_tags[tag] 
                                     for tag in tags])
                
                new_data[int(key)] = {"track_idxs": new_tracks,
                                      "tags_idxs": new_tags,
                                      "user_id": value["user_id"]}
            
            with open("w_updated_{}_sessions_{}.json".format(name, index), "w") as out:
                json.dump(new_data, out)

## Verification

We proced to verify that we have attained the correct splits post reodering and processing.

In [18]:
for i in range(5):
    with open("data/updated_training_sessions_{}.json".format(i), "r") as source:
        processed_training = json.load(source)

    with open("data/updated_testing_sessions_{}.json".format(i), "r") as source:
        processed_testing = json.load(source)

    processed_training_users = set()
    for _, value in processed_training.items():
        processed_training_users.add(value["user_id"])

    processed_testing_users = set()
    for _, value in processed_testing.items():
        processed_testing_users.add(value["user_id"])

    print("Parsed ", i)
    print(len(processed_training_users))
    print(len(processed_testing_users))
    print("Intersection: {} ({})".format(len(processed_training_users.intersection(processed_testing_users)), 
                                            100 * (len(processed_training_users.intersection(processed_testing_users)) 
                                                / len(processed_testing_users))))

Parsed  0
9580
932
Intersection: 578 (62.01716738197425)
Parsed  1
9337
862
Intersection: 535 (62.064965197215784)
Parsed  2
9025
852
Intersection: 516 (60.56338028169014)
Parsed  3
9334
973
Intersection: 582 (59.815005138746145)
Parsed  4
11115
1544
Intersection: 889 (57.57772020725389)


## Analysis

We need the total number of tags and songs for the training.

In [30]:
updated_training = []
updated_testing = []
for path in Path("data").rglob("*.json"):
    if "updated_train" in str(path): 
        updated_training.append(path)
    elif "updated_test" in str(path):
        updated_testing.append(path)

In [32]:
all_tags = []
for entry in updated_testing:
    with open(entry, "r") as source:
        data = json.load(source)
        for key, value in data.items():
            for tags in value["tags_idxs"]:
                for tag in tags:
                    all_tags.append(tag)
                    
for entry in updated_training:
    with open(entry, "r") as source:
        data = json.load(source)
        for key, value in data.items():
            for tags in value["tags_idxs"]:
                for tag in tags:
                    all_tags.append(tag)

print(len(set(all_tags)))

83128
