In [96]:
import os
import pandas as pd
import torch
import ast

## Load FMA Metadata

In [34]:
# Directory holding the FMA metadata CSVs, relative to this notebook.
metadata_folder = '../data/fma_metadata'

# genres.csv: the first column becomes the row index.
genres_csv = os.path.join(metadata_folder, 'genres.csv')
genres = pd.read_csv(genres_csv, index_col=0)

# tracks.csv: the first two rows are both header rows, so the columns load as a
# two-level MultiIndex and are addressed as tracks[group, field].
tracks_csv = os.path.join(metadata_folder, 'tracks.csv')
tracks = pd.read_csv(tracks_csv, index_col=0, header=[0, 1])

## Genres

In [35]:
# Preview the first five rows of the genre table.
genres.head(n=5)

Unnamed: 0_level_0,#tracks,parent,title,top_level
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,8693,38,Avant-Garde,38
2,5271,0,International,2
3,1752,0,Blues,3
4,4126,0,Jazz,4
5,4106,0,Classical,5


In [36]:
# Distinct genre ids that appear in the 'top_level' column.
top_level_ids = genres['top_level'].unique()
print('{} top-level genres'.format(len(top_level_ids)))

# Look those ids up as rows and list them from most to fewest tracks.
top_level_genres = genres.loc[top_level_ids]
top_level_genres.sort_values('#tracks', ascending=False)

16 top-level genres


Unnamed: 0_level_0,#tracks,parent,title,top_level
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
38,38154,0,Experimental,38
15,34413,0,Electronic,15
12,32923,0,Rock,12
1235,14938,0,Instrumental,1235
10,13845,0,Pop,10
17,12706,0,Folk,17
21,8389,0,Hip-Hop,21
2,5271,0,International,2
4,4126,0,Jazz,4
5,4106,0,Classical,5


In [22]:
# Direct children of genre 38 ("Experimental" in the genre table): rows whose
# 'parent' column equals 38.
is_child_of_experimental = genres['parent'].eq(38)
experimental = genres.loc[is_child_of_experimental]

print('{} subgenres of "experimental"'.format(len(experimental)))
experimental



14 subgenres of "experimental"


Unnamed: 0_level_0,#tracks,parent,title,top_level
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,8693,38,Avant-Garde,38
6,914,38,Novelty,38
22,774,38,Audio Collage,38
30,3237,38,Field Recordings,38
32,7268,38,Noise,38
41,6110,38,Electroacoustic,38
47,2546,38,Drone,38
125,1511,38,Unclassifiable,38
186,682,38,Sound Poetry,38
224,1916,38,Sound Collage,38


## Tracks

In [9]:
# Preview the first five rows of the track table (columns are two-level).
tracks.head(n=5)

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [127]:
# Select the tracks tagged with genre "experimental" (id 38) anywhere in their
# full genre list.
bad_genres = [38]
# 'genres_all' holds each list as text (e.g. "[32, 38]"), so it is parsed with
# ast.literal_eval before the membership test.
has_bad_genre = tracks['track', 'genres_all'].apply(
    lambda raw: any(genre_id in ast.literal_eval(raw) for genre_id in bad_genres)
)
experimental_tracks = tracks.loc[has_bad_genre]

# Keep only the popular ones: more than 1000 listens AND more than 5 favorites.
enough_listens = experimental_tracks['track', 'listens'] > 1000
enough_favorites = experimental_tracks['track', 'favorites'] > 5
experimental_tracks = experimental_tracks.loc[enough_listens & enough_favorites]

print('{} experimental tracks with more than 1000 listens and 5 favorites'.format(len(experimental_tracks)))
experimental_tracks['track'].head()


2237 experimental tracks with more than 1000 listens and 5 favorites


Unnamed: 0_level_0,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,genres_all,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
281,256000,0,Brian Chippendale,2008-11-26 01:51:32,2008-11-26 00:00:00,346,11,Experimental,[32],"[32, 38]",,2202,en,Attribution-Noncommercial-No Derivative Works ...,2114,,4,,[],Juggernaut
418,256000,0,,2008-11-26 01:58:52,2008-10-01 00:00:00,486,10,,"[1, 18]","[1, 18, 1235, 38]",,7458,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,3890,,1,,[],Track1
1066,192000,0,,2008-11-26 02:33:02,2005-04-15 00:00:00,234,6,Experimental,[22],"[38, 22]",,11676,en,Attribution-Noncommercial-Share Alike 3.0 Unit...,10700,,1,,[],Helicopter de Cristal
1069,192000,0,,2008-11-26 02:33:08,2005-04-15 00:00:00,110,6,Experimental,[22],"[38, 22]",,7038,en,Attribution-Noncommercial-Share Alike 3.0 Unit...,5661,,4,,[],Un Lagrima en la Discoteca
1076,192000,0,,2008-11-26 02:33:20,2008-01-01 00:00:00,165,18,,"[15, 22]","[38, 22, 15]",,17520,en,Attribution-Noncommercial-Share Alike 3.0 Unit...,14981,,11,,[],I Keep Waiting For Earthquakes


In [130]:
# Collect the track ids of the least popular experimental tracks. These can be
# passed to the ignore argument of our preprocessor.

experimental_genre = 38
# Parse each stringified genre list and flag rows that contain the experimental id.
in_experimental = tracks['track', 'genres_all'].apply(
    lambda raw: experimental_genre in ast.literal_eval(raw)
)
experimental_tracks = tracks.loc[in_experimental]

# "Worst" = at most 1000 listens OR at most 5 favorites.
too_few_listens = experimental_tracks['track', 'listens'].le(1000)
too_few_favorites = experimental_tracks['track', 'favorites'].le(5)
# NOTE: from here on experimental_tracks is a plain list of track ids, not a DataFrame.
experimental_tracks = experimental_tracks.loc[too_few_listens | too_few_favorites].index.tolist()
print(f'getting indices for: {len(experimental_tracks)} tracks')
print(f'{experimental_tracks[:5]} ...')

getting indices for: 35917 tracks
[137, 138, 148, 149, 150] ...


In [129]:
# new dataset makeup: fraction of the filtered dataset that is still experimental
kept_experimental = 2237      # experimental tracks with >1000 listens and >5 favorites
dataset_size = 100000         # round dataset size assumed for this estimate
removed_experimental = 35917  # experimental tracks whose indices are ignored
kept_experimental / (dataset_size - removed_experimental)

0.034907853877003264

After this removal process there will be about 2.2k experimental tracks left, which is about 3.5% of the new dataset (~64k clips, i.e. 100,000 − 35,917).

Unfiltered, there are about 38k experimental tracks (2,237 kept + 35,917 removed = 38,154), which is about 38% of the dataset.