In [5]:
import sqlite3

In [6]:
# Open the SQLite database file queried by the cells below; `conn` is the
# single connection object those query cells reuse.
conn = sqlite3.connect('lastfm_tags.db')

In [7]:
print("We get all tags and the number of tracks they're applied to")
# One row per tag: (tag ROWID, tag text, number of tid_tag rows for that tag).
sql = (
    "SELECT tid_tag.tag, tags.tag, COUNT(tid_tag.tid) "
    "FROM tid_tag, tags "
    "WHERE tid_tag.tag=tags.ROWID "
    "GROUP BY tags.tag"
)
res = conn.execute(sql)
tag_list = res.fetchall()
# Most frequently applied tags first (the count is the third column).
tag_list.sort(key=lambda row: row[2], reverse=True)
print('after sorting...')
# Slice instead of indexing 0..9 so that a database with fewer than ten
# tags prints what it has rather than raising IndexError.
for row in tag_list[:10]:
    print(row)
print('...')
print('(total number of unique tags: %d)' % len(tag_list))

We get all tags and the number of tracks they're applied to
after sorting...
(95, 'rock', 101071)
(5, 'pop', 69159)
(96, 'alternative', 55777)
(38, 'indie', 48175)
(138, 'electronic', 46270)
(70, 'female vocalists', 42565)
(98, 'favorites', 39921)
(238, 'Love', 34901)
(86, 'dance', 33618)
(322, '00s', 31432)
...
(total number of unique tags: 522366)


In [8]:
# Tag ROWIDs (first column of each row) of the first 100 entries of the
# count-sorted tag_list, kept as strings so they can be joined with commas.
top_100_tag_list = [str(row[0]) for row in tag_list[:100]]
print(','.join(top_100_tag_list))

95,5,96,38,138,70,98,238,86,322,648,347,34,269,914,40,324,196,1,158,97,380,481,75,379,230,45,41,764,10,201,730,492,3,916,15,356,39,382,16,150,37,1177,42,6,69,527,147,545,151,58,595,136,92,2,734,148,55,36,733,189,239,544,2182,889,197,365,1125,43,1650,47,460,686,1033,195,2139,891,923,46,500,367,561,203,83,483,611,1832,149,264,464,945,237,462,2218,1719,308,1537,765,649,617


In [39]:
print('We get all tracks with tag(s) in top 100')
# Comma-separated tag ROWIDs spliced into the IN (...) list below.
tag_id_csv = ','.join(top_100_tag_list)
sql = (
    "SELECT DISTINCT(tids.tid) FROM tid_tag, tids "
    "WHERE tid_tag.tid = tids.ROWID AND tid_tag.tag in (%s)"
) % tag_id_csv
res = conn.execute(sql)
# Each fetched row is a 1-tuple holding a track ID string.
tracks_with_top_100_tag = res.fetchall()


We get all tracks with tag(s) in top 100


In [40]:
# Preview the first rows. Slicing (rather than indexing 0..9) keeps the cell
# from raising IndexError when the query returned fewer than ten tracks.
for row in tracks_with_top_100_tag[:10]:
    print(row)
print('...')
print('(total number of track IDs: %d)' % len(tracks_with_top_100_tag))

('TRCIJFJ128F425C189',)
('TRCMOXY12903CDFD0D',)
('TRCMZXV128E07943F4',)
('TRCDFKU128F930879F',)
('TRCQMCQ128F92D328C',)
('TRJJKGE12903CAAFB8',)
('TRJMVSL128F427EAB0',)
('TRJBCIW128F42694CA',)
('TRJDDFL128F1459B06',)
('TRJDPQC128F92F993E',)
...
(total number of track IDs: 369047)


In [41]:
# Unwrap the 1-tuples returned by fetchall() into plain track-ID strings.
# Entries that are already strings are kept as they are, so re-running this
# cell is harmless; the unguarded `t[0]` would otherwise take the first
# character of each ID and silently turn every entry into 'T'.
tracks_with_top_100_tag = [
    row if isinstance(row, str) else row[0]
    for row in tracks_with_top_100_tag
]

In [46]:
# Sort the track IDs in place (default string ordering).
tracks_with_top_100_tag.sort()

In [48]:
# Show the last 20 IDs of the sorted list as a quick sanity check.
tracks_with_top_100_tag[-20:]

['TRZZZFV128F4259A2B',
 'TRZZZHD128F1475047',
 'TRZZZHL128F9329CFB',
 'TRZZZHM128F42B1DC8',
 'TRZZZJX12903CC66C0',
 'TRZZZKN128F92F81D6',
 'TRZZZMF128F424484C',
 'TRZZZMH128EF34A5F7',
 'TRZZZMY128F426D7A2',
 'TRZZZOW128F4248475',
 'TRZZZQM128F424DBA1',
 'TRZZZRJ128F42819AF',
 'TRZZZUK128F92E3C60',
 'TRZZZUM128F424188C',
 'TRZZZYG128F92EF836',
 'TRZZZYR128F92F0796',
 'TRZZZYV128F92E996D',
 'TRZZZYX128F92D32C6',
 'TRZZZZD128F4236844',
 'TRZZZZZ12903D05E3A']

In [53]:
# Persist the track IDs to a text file, one ID per line.
with open('track_id_with_top_100_tag.txt', 'w') as fd:
    for track_id in tracks_with_top_100_tag:
        # write() is the call for a single string. writelines() expects an
        # iterable of lines and only produced the same file by iterating the
        # string one character at a time.
        fd.write(track_id + '\n')

In [64]:
# Number of (tag ROWID, tag text, count) rows fetched, i.e. distinct tags.
len(tag_list)

522366

In [66]:
# Keep only the tag text (second column) of every row, in tag_list order.
tag_only_list = [row[1] for row in tag_list]

In [83]:
# Case-sensitive membership check: is the exact tag text 'Awesome' present?
'Awesome' in tag_only_list

True

In [15]:
# Display the first 100 rows of the count-sorted tag list.
tag_list[:100]

[(95, 'rock', 101071),
 (5, 'pop', 69159),
 (96, 'alternative', 55777),
 (38, 'indie', 48175),
 (138, 'electronic', 46270),
 (70, 'female vocalists', 42565),
 (98, 'favorites', 39921),
 (238, 'Love', 34901),
 (86, 'dance', 33618),
 (322, '00s', 31432),
 (648, 'alternative rock', 30334),
 (347, 'jazz', 30152),
 (34, 'beautiful', 29421),
 (269, 'singer-songwriter', 27910),
 (914, 'metal', 27430),
 (40, 'chillout', 27276),
 (324, 'male vocalists', 27269),
 (196, 'Awesome', 26248),
 (1, 'classic rock', 25771),
 (158, 'soul', 24702),
 (97, 'indie rock', 24619),
 (380, 'Mellow', 24356),
 (481, 'electronica', 24087),
 (75, '80s', 23492),
 (379, 'folk', 23492),
 (230, 'british', 23033),
 (45, '90s', 23018),
 (41, 'chill', 22746),
 (764, 'american', 22694),
 (10, 'instrumental', 21837),
 (201, 'punk', 21203),
 (730, 'oldies', 20979),
 (492, 'seen live', 20705),
 (3, 'blues', 20474),
 (916, 'hard rock', 20241),
 (15, 'cool', 19581),
 (356, 'Favorite', 18864),
 (39, 'ambient', 17982),
 (382, 'aco

In [46]:
print('We retrieve all tags for a single track')

# The track ID is bound through a `?` placeholder rather than spliced into the
# statement inside double quotes: in SQL, double quotes denote identifiers, and
# the old form relied on SQLite's legacy fallback of treating an unknown
# double-quoted identifier as a string literal.
# The IN (...) list is still interpolated; it contains only the tag ROWIDs
# collected in top_100_tag_list from an earlier query.
sql = (
    'SELECT tags.tag FROM tids, tid_tag, tags '
    'WHERE tid_tag.tid = tids.ROWID AND tags.ROWID = tid_tag.tag '
    'AND tids.tid = ? AND tid_tag.tag in (%s)' % ','.join(top_100_tag_list)
)

res = conn.execute(sql, ('TRAAAEF128F4273421',))

We retrieve all tags for a single track


In [47]:
# Flatten the fetched 1-tuples into a plain list of tag strings.
[row[0] for row in res.fetchall()]

['80s', 'new wave']

In [1]:
import tables
import hdf5_getters as GETTERS
import os
import string
import glob

In [2]:
def read_target_track_id_list(path='new_complet_msd_ids.txt'):
    """Read the comma-separated target track IDs from a text file.

    Per the original comment these are the track ids with top 100 tags.

    Args:
        path: File to read. Defaults to the file name the notebook has
            always used, so existing zero-argument calls behave as before.

    Returns:
        A list of track-ID strings in file order. Surrounding whitespace
        (for example a trailing newline on the last ID) is stripped and
        empty fields (empty file, trailing comma) are dropped, so every
        returned item is a non-empty ID.
    """
    with open(path, 'r') as fd:
        data = fd.read()

    stripped_fields = (field.strip() for field in data.split(','))
    return [track_id for track_id in stripped_fields if track_id]

def put_target_track_into_buckets(target_track_id_list):
    """Group track IDs by their third character (index 2).

    Args:
        target_track_id_list: Iterable of track-ID strings.

    Returns:
        A dict with one key per uppercase ASCII letter (always 26 keys).
        Each value is the list of IDs whose character at index 2 equals
        that letter, in input order. IDs whose third character is not an
        uppercase ASCII letter, or that are shorter than three characters,
        are placed in no bucket.
    """
    buckets = {upper_letter: [] for upper_letter in string.ascii_uppercase}

    # Single pass over the input instead of one filter() pass per letter.
    for track_id in target_track_id_list:
        # Slicing yields '' for IDs shorter than three characters, where
        # track_id[2] would raise IndexError.
        bucket = buckets.get(track_id[2:3])
        if bucket is not None:
            bucket.append(track_id)

    return buckets

In [32]:
# Load the target IDs and report how many were read.
target_track_id_list = read_target_track_id_list()
print(len(target_track_id_list))

# Order the freshly loaded list in place.
target_track_id_list.sort()

# Bucket the IDs and report the number of buckets.
target_tracks_buckets = put_target_track_into_buckets(target_track_id_list)
print(len(target_tracks_buckets))


71473
26


In [9]:
# Relies on `conn` and `top_100_tag_list` created by earlier cells.
# The statement is the same for every track, so it is built once. The track
# ID is bound through a `?` placeholder instead of being spliced in between
# double quotes (SQL's identifier quoting, which SQLite only accepts as a
# string literal through a legacy fallback).
sql = (
    'SELECT tags.tag FROM tids, tid_tag, tags '
    'WHERE tid_tag.tid = tids.ROWID AND tags.ROWID = tid_tag.tag '
    'AND tids.tid = ? AND tid_tag.tag in (%s)' % ','.join(top_100_tag_list)
)

for idx, entry in enumerate(target_track_id_list):
    if idx % 2000 == 0:
        print('processed %d tracks' % idx)

    # Entries are plain ID strings on the first run and dicts once this cell
    # has replaced them, so accept both to keep the cell safe to re-run.
    track_id = entry['track_id'] if isinstance(entry, dict) else entry

    res = conn.execute(sql, (track_id,))
    track_tag_list = [row[0] for row in res.fetchall()]
    if not track_tag_list:
        print('Warning: track %s has no tag' % (track_id))

    # Replace the entry in place with a record carrying the ID and its tags;
    # later cells read the 'track_id' and 'tags' keys.
    target_track_id_list[idx] = {'track_id': track_id, 'tags': track_tag_list}

processed 0 tracks
processed 2000 tracks
processed 4000 tracks
processed 6000 tracks
processed 8000 tracks
processed 10000 tracks
processed 12000 tracks
processed 14000 tracks
processed 16000 tracks
processed 18000 tracks
processed 20000 tracks
processed 22000 tracks
processed 24000 tracks
processed 26000 tracks
processed 28000 tracks
processed 30000 tracks
processed 32000 tracks
processed 34000 tracks
processed 36000 tracks
processed 38000 tracks
processed 40000 tracks
processed 42000 tracks
processed 44000 tracks
processed 46000 tracks
processed 48000 tracks
processed 50000 tracks
processed 52000 tracks
processed 54000 tracks
processed 56000 tracks
processed 58000 tracks
processed 60000 tracks
processed 62000 tracks
processed 64000 tracks
processed 66000 tracks
processed 68000 tracks
processed 70000 tracks


In [10]:
# Inspect the last record to confirm its {'track_id', 'tags'} structure.
target_track_id_list[-1]

{'tags': ['pop',
  'female vocalist',
  'chillout',
  'Hip-Hop',
  'hip hop',
  '90s',
  'rnb',
  'female vocalists',
  'favorites',
  'soul',
  'Ballad',
  'Love',
  'sad',
  'USA',
  '00s',
  'Mellow',
  'pop rock',
  'romantic',
  'american',
  'female'],
 'track_id': 'TRZZZYV128F92E996D'}

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

In [12]:
# One set of tag names per track, in the same order as target_track_id_list.
track_tags_list = [set(track['tags']) for track in target_track_id_list]

In [13]:
# Sanity check: one tag set per track record is expected.
print(len(track_tags_list))

71473


In [14]:
# Fit the binarizer on the per-track tag sets and encode them in one call.
# track_labels is indexed per track by later cells (track_labels[idx]).
# NOTE(review): the column order of track_labels presumably follows
# mlb.classes_ -- confirm against the scikit-learn documentation.
mlb = MultiLabelBinarizer()
track_labels = mlb.fit_transform(track_tags_list)

In [31]:
# Number of distinct tag classes the binarizer learned.
mlb.classes_.size

100

In [15]:
# Only the last expression of a cell is displayed, so the len() value on the
# next line is computed and discarded.
len(track_labels)
# Show the first ten label rows.
track_labels[:10]

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 0,

In [23]:
# Maps track ID -> {'tags': [...], 'labels': [...]}.
track_labels_tags_dict = {}

for idx, track_info in enumerate(target_track_id_list):
    label_row = track_labels[idx]

    # Attach the label vector to the per-track record as a plain list.
    track_info['labels'] = label_row.tolist()

    # The lookup table gets its own, separately built copy of the labels.
    track_labels_tags_dict[track_info['track_id']] = {
        'tags': track_info['tags'],
        'labels': label_row.tolist(),
    }

In [25]:
# Number of distinct track IDs used as keys in the lookup table.
len(track_labels_tags_dict)

71473

In [18]:
# Number of per-track records, for comparison with the dict size above.
len(target_track_id_list)

71473

In [26]:
import json

# Exploratory lookup: print the signature and docstring of json.dump.
# NOTE(review): leftover interactive help; candidate for removal once the
# notebook is finalised.
help(json.dump)

Help on function dump in module json:

dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, default=None, sort_keys=False, **kw)
    Serialize ``obj`` as a JSON formatted stream to ``fp`` (a
    ``.write()``-supporting file-like object).
    
    If ``skipkeys`` is true then ``dict`` keys that are not basic types
    (``str``, ``int``, ``float``, ``bool``, ``None``) will be skipped
    instead of raising a ``TypeError``.
    
    If ``ensure_ascii`` is false, then the strings written to ``fp`` can
    contain non-ASCII characters if they appear in strings contained in
    ``obj``. Otherwise, all such characters are escaped in JSON strings.
    
    If ``check_circular`` is false, then the circular reference check
    for container types will be skipped and a circular reference will
    result in an ``OverflowError`` (or worse).
    
    If ``allow_nan`` is false, then it will be a ``ValueError`` to
    serialize o

In [27]:
# Serialise the track ID -> {'tags', 'labels'} table to a JSON file.
# NOTE(review): only the dict is written; the binarizer's class order is not
# saved here, so a consumer of the 'labels' vectors presumably needs
# mlb.classes_ from elsewhere -- confirm.
with open('track_tags_labels.json', 'w') as fd:
    json.dump(track_labels_tags_dict, fd)

In [29]:
# Display the lookup table.
# NOTE(review): this renders the entire dict (71473 entries per the len()
# cell above); consider showing a single entry instead.
track_labels_tags_dict

{'TRTLKZB128F92ED0C4': {'labels': [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'tags': ['alternative', 'metal', 'hardcore', 'male vocalist']},
 'TRVBZHU128F146367C': {'labels': [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,

In [37]:
from keras import metrics, losses