In [55]:
import pandas as pd
import numpy as np
import os
from fma_code import utils
import filtering_utils

pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

%matplotlib inline

In [56]:
TRACKS_PATH = os.path.join(filtering_utils.DS_PATH, "tracks.csv")
GENRES_PATH = os.path.join(filtering_utils.DS_PATH, 'genres.csv')

In [57]:
genres = utils.load(GENRES_PATH)
genres

Unnamed: 0_level_0,#tracks,parent,title,top_level
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,8693,38,Avant-Garde,38
2,5271,0,International,2
3,1752,0,Blues,3
4,4126,0,Jazz,4
5,4106,0,Classical,5
6,914,38,Novelty,38
7,217,20,Comedy,20
8,868,0,Old-Time / Historic,8
9,1987,0,Country,9
10,13845,0,Pop,10


In [58]:
genres[['title', '#tracks']].sort_values(by='#tracks', ascending=False)

Unnamed: 0_level_0,title,#tracks
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1
38,Experimental,38154
15,Electronic,34413
12,Rock,32923
1235,Instrumental,14938
10,Pop,13845
17,Folk,12706
25,Punk,9261
1,Avant-Garde,8693
21,Hip-Hop,8389
32,Noise,7268


In [59]:
genres.columns

Index(['#tracks', 'parent', 'title', 'top_level'], dtype='object')

In [60]:
min_occurrences = 5500

genre_ids_with_occ_to_keep = {}

for genre_id, (occurrences, parent_id, title, top_level) in genres.iterrows():
    if occurrences >= min_occurrences:
        genre_ids_with_occ_to_keep[genre_id] = occurrences
        continue
    parent_occurrences = 0
    while parent_id != 0 and parent_occurrences < min_occurrences:
        parent_genre = genres.loc[parent_id]
        parent_occurrences = parent_genre['#tracks']
        parent_occurrences += occurrences
        parent_id = parent_genre['parent']
    if parent_id != 0:
        genre_ids_with_occ_to_keep[parent_id] = genre_ids_with_occ_to_keep.get(parent_id, 0) + occurrences

len(genre_ids_with_occ_to_keep), genre_ids_with_occ_to_keep

(16,
 {1: 8693,
  10: 13845,
  12: 38541,
  15: 34413,
  17: 12706,
  18: 5913,
  21: 8389,
  25: 9261,
  27: 6041,
  32: 7268,
  38: 38154,
  41: 6110,
  42: 5723,
  76: 7144,
  107: 7206,
  1235: 14938})

In [61]:
tracks = utils.load(TRACKS_PATH)
tracks.head(3)

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,album,album,album,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,artist,set,set,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,title,tracks,type,active_year_begin,active_year_end,associated_labels,bio,comments,date_created,favorites,id,latitude,location,longitude,members,name,related_projects,tags,website,wikipedia_page,split,subset,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,genres_all,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2
2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],AWOL - A Way Of Life,7,Album,2006-01-01,NaT,,"<p>A Way Of Life, A Collective of Hip-Hop from NJ...................</p>",0,2008-11-26 01:42:32,9,1,40.058324,New Jersey,-74.405661,"Sajje Morocco,Brownbum,ZawidaGod,Custodian of Records,Zooberelli the Don,F.A.H,MadSicka,Damien Omenicci..and a van load more...",AWOL,The list of past projects is 2 long but every1 and every style from Tabby Bonet 2 M.O.P..Azillion Records Flagship trackmaster DJ BrownBum is a beat Wizard.....A-2-Z..illion....(right now working with JerseyBlock Ent),[awol],http://www.AzillionRecords.blogspot.com,,training,small,256000,0,,2008-11-26 01:48:12,2008-11-26,168,2,Hip-Hop,[21],[21],,4656,en,Attribution-NonCommercial-ShareAlike 3.0 International,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],AWOL - A Way Of Life,7,Album,2006-01-01,NaT,,"<p>A Way Of Life, A Collective of Hip-Hop from NJ...................</p>",0,2008-11-26 01:42:32,9,1,40.058324,New Jersey,-74.405661,"Sajje Morocco,Brownbum,ZawidaGod,Custodian of Records,Zooberelli the Don,F.A.H,MadSicka,Damien Omenicci..and a van load more...",AWOL,The list of past projects is 2 long but every1 and every style from Tabby Bonet 2 M.O.P..Azillion Records Flagship trackmaster DJ BrownBum is a beat Wizard.....A-2-Z..illion....(right now working with JerseyBlock Ent),[awol],http://www.AzillionRecords.blogspot.com,,training,medium,256000,0,,2008-11-26 01:48:14,2008-11-26,237,1,Hip-Hop,[21],[21],,1470,en,Attribution-NonCommercial-ShareAlike 3.0 International,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],AWOL - A Way Of Life,7,Album,2006-01-01,NaT,,"<p>A Way Of Life, A Collective of Hip-Hop from NJ...................</p>",0,2008-11-26 01:42:32,9,1,40.058324,New Jersey,-74.405661,"Sajje Morocco,Brownbum,ZawidaGod,Custodian of Records,Zooberelli the Don,F.A.H,MadSicka,Damien Omenicci..and a van load more...",AWOL,The list of past projects is 2 long but every1 and every style from Tabby Bonet 2 M.O.P..Azillion Records Flagship trackmaster DJ BrownBum is a beat Wizard.....A-2-Z..illion....(right now working with JerseyBlock Ent),[awol],http://www.AzillionRecords.blogspot.com,,training,small,256000,0,,2008-11-26 01:48:20,2008-11-26,206,6,Hip-Hop,[21],[21],,1933,en,Attribution-NonCommercial-ShareAlike 3.0 International,1151,,6,,[],This World


In [62]:
tracks = tracks[['set', 'track']]
tracks.head(3)

Unnamed: 0_level_0,set,set,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,split,subset,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,genres_all,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,training,small,256000,0,,2008-11-26 01:48:12,2008-11-26,168,2,Hip-Hop,[21],[21],,4656,en,Attribution-NonCommercial-ShareAlike 3.0 International,1293,,3,,[],Food
3,training,medium,256000,0,,2008-11-26 01:48:14,2008-11-26,237,1,Hip-Hop,[21],[21],,1470,en,Attribution-NonCommercial-ShareAlike 3.0 International,514,,4,,[],Electric Ave
5,training,small,256000,0,,2008-11-26 01:48:20,2008-11-26,206,6,Hip-Hop,[21],[21],,1933,en,Attribution-NonCommercial-ShareAlike 3.0 International,1151,,6,,[],This World


In [63]:
tracks['track'][['genre_top', 'genres_all']].head(3)

Unnamed: 0_level_0,genre_top,genres_all
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Hip-Hop,[21]
3,Hip-Hop,[21]
5,Hip-Hop,[21]


In [64]:
tracks = pd.merge(
    tracks['set'], tracks['track'][['genre_top', 'genres_all']],
    right_index=True, left_index=True)
tracks.sample(5)

Unnamed: 0_level_0,split,subset,genre_top,genres_all
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
88920,training,medium,Rock,"[25, 12]"
7387,test,large,Experimental,"[38, 22]"
1135,validation,large,Rock,[12]
42970,training,large,Experimental,"[32, 38]"
134990,validation,large,Experimental,"[32, 38, 47]"


In [65]:
tracks['genre_top'] = tracks['genre_top'].astype(str)
tracks['genre_top'].fillna('', inplace=True)
tracks.sample(10)

Unnamed: 0_level_0,split,subset,genre_top,genres_all
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
80244,training,large,,"[58, 3, 12, 567]"
135408,training,large,,"[5, 42, 15, 495, 18, 1235]"
70816,training,large,Rock,[12]
126484,training,medium,Folk,[17]
27789,training,large,Rock,"[25, 26, 12, 109]"
48350,training,large,,"[66, 4, 12, 38]"
36085,training,medium,Rock,"[58, 12]"
41942,training,large,,"[21, 15]"
125465,training,large,,"[32, 25, 12, 38]"
40894,training,large,,"[38, 42, 107, 47, 15, 1235]"


In [66]:
sorted(genre_ids_with_occ_to_keep.items(), key=lambda x: x[1])

[(42, 5723),
 (18, 5913),
 (27, 6041),
 (41, 6110),
 (76, 7144),
 (107, 7206),
 (32, 7268),
 (21, 8389),
 (1, 8693),
 (25, 9261),
 (17, 12706),
 (10, 13845),
 (1235, 14938),
 (15, 34413),
 (38, 38154),
 (12, 38541)]

In [67]:
tracks_to_drop = []

for track_id, (split, subset, genre_top, genres_all) in tracks.iterrows():
    if genre_top != 'nan': continue
    track_genres_with_occ = {}
    for genre_id in genres_all:
        if genre_id not in genre_ids_with_occ_to_keep: continue
        track_genres_with_occ[genre_id] = genre_ids_with_occ_to_keep[genre_id]
    low_occ_genre = sorted(track_genres_with_occ.items(), key=lambda x: x[1])
    if not low_occ_genre:
        tracks_to_drop.append(track_id)
        continue
    tracks.loc[track_id, 'genre_top'] = genres.loc[low_occ_genre[0][0], 'title']

len(tracks_to_drop), tracks_to_drop

(2916,
 [461,
  462,
  463,
  464,
  465,
  613,
  1213,
  1216,
  1217,
  1384,
  2010,
  3276,
  3678,
  3841,
  3843,
  3844,
  3845,
  3846,
  4505,
  4532,
  4533,
  4534,
  5025,
  5081,
  5097,
  5921,
  5922,
  5923,
  5925,
  6327,
  6364,
  6375,
  6388,
  6395,
  6668,
  6794,
  7179,
  7180,
  7181,
  7182,
  7547,
  7737,
  7832,
  8520,
  8596,
  9156,
  9161,
  9181,
  9182,
  9185,
  9209,
  9223,
  9330,
  9331,
  9332,
  9334,
  9335,
  9343,
  9362,
  9379,
  9381,
  9395,
  9396,
  9399,
  9401,
  9414,
  9420,
  9524,
  9525,
  9526,
  9527,
  9529,
  9530,
  9647,
  9648,
  9649,
  9660,
  9661,
  9667,
  9889,
  9892,
  9900,
  9901,
  9903,
  9979,
  9981,
  9983,
  9984,
  10183,
  10185,
  10212,
  10345,
  10355,
  10370,
  10399,
  10403,
  10404,
  10417,
  10419,
  10421,
  10423,
  10424,
  10425,
  10429,
  10431,
  10461,
  10501,
  10503,
  10504,
  10505,
  10506,
  10507,
  10508,
  10509,
  10510,
  10511,
  10512,
  10513,
  10515,
  10516,
  10517

In [68]:
tracks.drop(tracks_to_drop, axis=0, inplace=True)

In [69]:
tracks[tracks['genre_top'] == 'nan']

Unnamed: 0_level_0,split,subset,genre_top,genres_all
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [70]:
filtering_utils.draw_pie(tracks, 'genre_top')

25 values left


In [71]:
genre_names_to_keep = [genres.loc[genre_id, 'title'] for genre_id in genre_ids_with_occ_to_keep]
genre_names_to_keep

['Avant-Garde',
 'Pop',
 'Rock',
 'Electronic',
 'Folk',
 'Soundtrack',
 'Hip-Hop',
 'Punk',
 'Lo-Fi',
 'Noise',
 'Experimental',
 'Electroacoustic',
 'Ambient Electronic',
 'Experimental Pop',
 'Ambient',
 'Instrumental']

In [72]:
tracks_to_drop.clear()

for track_id, genre in tracks['genre_top'].items():
    if genre not in genre_names_to_keep:
        tracks_to_drop.append(track_id)

len(tracks_to_drop), tracks_to_drop

(4670,
 [144,
  145,
  146,
  147,
  237,
  238,
  590,
  591,
  592,
  593,
  640,
  666,
  667,
  704,
  705,
  706,
  707,
  708,
  709,
  734,
  735,
  736,
  737,
  738,
  739,
  740,
  741,
  742,
  743,
  744,
  745,
  747,
  748,
  749,
  750,
  751,
  752,
  753,
  754,
  755,
  756,
  757,
  758,
  759,
  760,
  761,
  762,
  763,
  764,
  765,
  766,
  767,
  768,
  769,
  770,
  771,
  772,
  773,
  774,
  775,
  776,
  831,
  832,
  833,
  834,
  835,
  853,
  904,
  905,
  906,
  907,
  908,
  909,
  910,
  911,
  912,
  913,
  914,
  915,
  916,
  917,
  918,
  919,
  920,
  921,
  922,
  923,
  924,
  925,
  926,
  927,
  928,
  929,
  930,
  931,
  932,
  933,
  934,
  935,
  936,
  937,
  938,
  939,
  940,
  941,
  942,
  943,
  944,
  945,
  946,
  947,
  948,
  949,
  950,
  951,
  952,
  953,
  954,
  955,
  956,
  957,
  958,
  959,
  960,
  961,
  962,
  963,
  964,
  965,
  966,
  967,
  968,
  969,
  970,
  971,
  972,
  973,
  974,
  975,
  976,
  977,
  1020

In [73]:
tracks.drop(tracks_to_drop, axis=0, inplace=True)

In [74]:
tracks.head(10)

Unnamed: 0_level_0,split,subset,genre_top,genres_all
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,training,small,Hip-Hop,[21]
3,training,medium,Hip-Hop,[21]
5,training,small,Hip-Hop,[21]
10,training,small,Pop,[10]
20,training,large,Experimental Pop,"[17, 10, 76, 103]"
26,training,large,Experimental Pop,"[17, 10, 76, 103]"
30,training,large,Experimental Pop,"[17, 10, 76, 103]"
46,training,large,Experimental Pop,"[17, 10, 76, 103]"
48,training,large,Experimental Pop,"[17, 10, 76, 103]"
134,training,medium,Hip-Hop,[21]


In [75]:
tracks.rename({'genre_top': 'genre'}, axis=1, inplace=True)
tracks.drop('genres_all', axis=1, inplace=True)
tracks.sample()

Unnamed: 0_level_0,split,subset,genre
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
63354,training,medium,Electronic


In [76]:
filtering_utils.draw_pie(tracks, 'genre')

16 values left


In [77]:
"""
maybe without experimental:
    'Avant-Garde', 'Novelty', 'Sound Effects', 'Audio Collage',
    'Field Recordings', 'Noise', 'Experimental', 'Electroacoustic',
    'Drone', 'Unclassifiable', 'Sound Poetry', 'Sound Collage',
    'Musique Concrete', 'Improv', 'Kid-Friendly', 'Minimalism',
    'Sound Art', 'Holiday', 'Christmas'
"""
tracks.to_csv(os.path.join(filtering_utils.DS_PATH, 'tracks_filtered.csv'))

In [78]:
genres['#tracks'].sum()
# more tracks for genres than tracks in tracks.csv

349160

In [79]:
genre_dummies = pd.get_dummies(tracks['genre'])
genre_dummies.head()

Unnamed: 0_level_0,Ambient,Ambient Electronic,Avant-Garde,Electroacoustic,Electronic,Experimental,Experimental Pop,Folk,Hip-Hop,Instrumental,Lo-Fi,Noise,Pop,Punk,Rock,Soundtrack
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
20,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [80]:
tracks_genre_dummies = pd.merge(
    tracks.drop('genre', axis=1), genre_dummies,
    right_index=True, left_index=True)
tracks_genre_dummies.sample()

Unnamed: 0_level_0,split,subset,Ambient,Ambient Electronic,Avant-Garde,Electroacoustic,Electronic,Experimental,Experimental Pop,Folk,Hip-Hop,Instrumental,Lo-Fi,Noise,Pop,Punk,Rock,Soundtrack
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
134965,training,large,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [81]:
print(tracks_genre_dummies.shape)
tracks_genre_dummies = tracks_genre_dummies[(tracks_genre_dummies.T != 0).any()]
tracks_genre_dummies.shape

(98988, 18)


(98988, 18)

In [82]:
tracks_genre_dummies.to_csv(os.path.join(filtering_utils.DS_PATH, 'tracks_filtered_dummies.csv'))