In [1]:
import sys, os, re, gzip, json, pickle, shutil, random

from pydub.utils import mediainfo

In [2]:
#root folders: project data sits next to the notebook, the mp3.com dump sits on its own drive
data_path = '../data'
mp3_path = 'D:/mp3_dot_com'

#full mp3 collection (alternative location inside the project: '%s/mp3s' % data_path)
mp3s_path = '{}/mp3'.format(mp3_path)

#local folder that receives the per-genre samples
local_mp3s_path = '{}/mp3s'.format(data_path)

#raw song listing (alternative location inside the project:
#'%s/mp3com_html_analysis_output.txt' % data_path)
song_data_file_path = '{}/mp3com_html_analysis_output.txt'.format(mp3_path)

#cleaned metadata file created by this notebook
metadata_path = '{}/metadata.json.gz'.format(data_path)

#per-genre record counts
genre_counts_path = '{}/genre_cts.pkl'.format(data_path)

#maps the 1000+ unnormalized genres to a few genres
genre_map_path = '{}/genre_map.txt'.format(data_path)

#start empty; later cells fill these from disk or by parsing
data = {}
genre_map = {}

In [3]:
#completion settings for this notebook session: turn on greedy completion and turn off jedi
%config IPCompleter.greedy=True
%config Completer.use_jedi = False

In [3]:
#
# Parse song file - extract and clean select fields
#

In [4]:
def parse_metadata_file(song_file_path, metadata_path, genre_counts_path):
    """Parse the tab-separated song listing into cleaned records keyed by mp3 filename.

    The first line of the listing holds the column headers:
    File Name, Artist Name, Song Name, Genre, Comment, CD, Label, Credits, Download Link, Mp3 Filename
    ('CD' is renamed to 'album_name'). Only lines with exactly 10 tab-separated columns and a
    non-blank mp3 filename are used.

    Some mp3 filenames occur on several lines; the files on disk carry " (2)" etc. after the name,
    so those records cannot be assigned here. They are collected separately for later matching
    through the metadata stored inside the files.

    Side effects: writes the genre counts (gzip-compressed pickle) to genre_counts_path, writes the
    unambiguous records (gzip-compressed JSON) to metadata_path and prints progress and a summary.

    Args:
        song_file_path: path of the listing, read as ISO-8859-1.
        metadata_path: output path for the unambiguous records.
        genre_counts_path: output path for the genre counts.

    Returns:
        (data, data_to_disambiguate, genre_cts) where
        data maps mp3 filename -> record (genre, artist_name, album_name, song_name, comment, ID),
        data_to_disambiguate maps a duplicated mp3 filename -> list of all records using it,
        genre_cts maps normalized genre -> number of records.

    Raises:
        ValueError: if the header line lacks one of the expected columns.
    """
    fields = ['mp3_filename', 'genre', 'artist_name', 'album_name', 'song_name', 'comment']
    #these are normalized with format_text, all other fields are only stripped
    normalized_fields = ('genre', 'artist_name')

    data = {}
    genre_cts = {}
    data_to_disambiguate = {} #mp3_filename: [recs with this name]
    mp3_filenames = set()
    field_cts = {f:0 for f in fields}
    dups = 0
    idx = 0

    with open(song_file_path, encoding='ISO-8859-1') as f:
        headers = f.readline().replace('\n','')

        #clean up headers
        headers = headers.replace('CD', 'album_name')
        headers = [h.lower().replace(' ','_') for h in headers.split('\t')]

        #select only the headers we're interested in
        col_idx = [headers.index(field) for field in fields]
        headers = [headers[i] for i in col_idx]
        print(headers, col_idx)

        for i,line in enumerate(f):
            items = line.replace('\n','').split('\t')
            if len(items) != 10:
                continue

            values = [items[j].strip() for j in col_idx]

            #no mp3 name - tested after stripping so a whitespace-only name is not stored under ''
            mp3_filename = values[0]
            if not mp3_filename:
                continue

            rec = {f:'' for f in fields if f != 'mp3_filename'}
            idx += 1
            rec['ID'] = idx
            field_cts['mp3_filename'] += 1

            for field, value in zip(fields[1:], values[1:]):
                if not value:
                    continue
                field_cts[field] += 1
                if field in normalized_fields:
                    value = format_text(value)
                rec[field] = value
                if field == 'genre':
                    genre_cts[value] = genre_cts.get(value, 0) + 1

            if mp3_filename in mp3_filenames:
                #a duplicate filename, store in separate dict for disambiguation
                ambiguous = data_to_disambiguate.setdefault(mp3_filename, [])

                #the first record with this name is still in data: move it over as well
                if mp3_filename in data:
                    ambiguous.append(data.pop(mp3_filename))
                    dups += 1

                ambiguous.append(rec)
                dups += 1
            else:
                data[mp3_filename] = rec
                mp3_filenames.add(mp3_filename)

            if i and i%25000==0:
                print(i, dups, rec)

    print('Storing genre counts')
    with gzip.open(genre_counts_path, 'wb') as oz:
        pickle.dump(genre_cts, oz)

    print('Storing %d unambiguous song records' % len(data))
    with gzip.open(metadata_path, 'wt', encoding='utf-8') as oz:
        json.dump(data, oz)

    #idx is 0 when the listing holds no usable record; report zero coverage instead of dividing by 0
    coverage = [(k, v/idx if idx else 0.0) for k,v in field_cts.items()]
    print('\n\nTOTAL: %d\nDUP FILENAMES: %d\nCOUNTS: %s\nPERCENT FIELD COVERAGE: %s' % (
        idx, dups, field_cts, coverage))

    return data, data_to_disambiguate, genre_cts
            
            
def format_text(text):
    """Normalize a label for matching.

    Trims surrounding whitespace, lower-cases, turns spaces into underscores and
    finally replaces the HTML entity '&amp;' with '&'.
    """
    trimmed = text.strip()
    lowered = trimmed.lower()
    underscored = lowered.replace(' ', '_')
    return underscored.replace('&amp;', '&')


def save_metadata(data, metadata_path):
    """Write data as UTF-8 JSON into the gzip-compressed file metadata_path (overwrites it)."""
    with gzip.open(metadata_path, mode='wt', encoding='utf-8') as gz_out:
        json.dump(data, gz_out)
        
        
def load_metadata(metadata_path):
    """Read the gzip-compressed UTF-8 JSON file metadata_path, report its size and return it."""
    with gzip.open(metadata_path, mode='rt', encoding='utf-8') as gz_in:
        records = json.loads(gz_in.read())
    print('loaded metadata for %d records' % len(records))
    return records

In [5]:
if not os.path.exists(metadata_path):
    #no stored metadata yet: parse the raw listing (this also writes metadata_path and genre_counts_path)
    #parse_metadata_file takes three paths; leaving out genre_counts_path raises a TypeError
    data, data_to_disambiguate, genre_cts = parse_metadata_file(
        song_data_file_path, metadata_path, genre_counts_path)
    print(len(data), len(data_to_disambiguate), sum(len(v) for v in data_to_disambiguate.values()))
    #397801 40334 174445
else:
    #reuse the stored results of an earlier run
    data = load_metadata(metadata_path)
    #NOTE: unpickling can run arbitrary code - only load a genre count file written by this notebook
    with gzip.open(genre_counts_path, 'rb') as fz:
        genre_cts = pickle.load(fz)
    print(len(genre_cts))

loaded metadata for 445385 records


In [6]:
#number of distinct genre keys in genre_cts
len(genre_cts)

366

In [92]:
def disambiguate_dup_filenames(data, data_to_disambiguate, mp3s_path, metadata_path):
    """Assign duplicated filenames to one of their candidate records via the file's artist tag.

    For every filename in data_to_disambiguate whose file exists under mp3s_path, the tags of the
    file are read with mediainfo and the first candidate record whose 'artist_name' equals the
    normalized artist tag is stored in data under that filename.

    A file without an artist tag is left unmatched: comparing an empty tag with an empty
    'artist_name' would accept a candidate without any evidence.

    Side effects: mutates data, prints progress and writes data to metadata_path every 1000
    filenames and once more at the end.

    Args:
        data: mp3 filename -> record, extended in place.
        data_to_disambiguate: mp3 filename -> list of candidate records.
        mp3s_path: folder holding the mp3 files.
        metadata_path: gzip-compressed JSON file that receives data.

    Returns:
        (data, found, missing): data itself, the filenames that were matched and the filenames
        for which no file exists.
    """
    found = []
    missing = []
    no_metadata = 0

    for i, (fn, dup_recs) in enumerate(data_to_disambiguate.items()):
        #reading the tags is slow, so report and store periodically
        if i%1000==0:
            print('%d - found: %d, missing: %d, no meta: %d, recs: %d' % (
                i, len(found), len(missing), no_metadata, len(data)))
            with gzip.open(metadata_path, 'wt', encoding='utf-8') as oz:
                json.dump(data, oz)

        #shouldn't happen
        if fn in data:
            continue

        fp = '%s/%s' % (mp3s_path, fn)
        if not os.path.exists(fp):
            missing.append(fn)
            continue

        #metadata = {'encoder': 'LAME3.92 ', 'title': 'Believe', 'artist': 'DREAMTRONIX',
        #            'comment': 'http://www.mp3.com/DREAMTRONIX','genre': 'Blues'}
        metadata = mediainfo(fp).get('TAG', None)
        if metadata is None:
            no_metadata += 1
            continue

        artist = format_text(metadata['artist']) if 'artist' in metadata else ''
        if not artist:
            #nothing to compare against
            continue

        for dup_rec in dup_recs:
            if dup_rec['artist_name'] == artist:
                data[fn] = dup_rec
                found.append(fn)
                break

    with gzip.open(metadata_path, 'wt', encoding='utf-8') as oz:
        json.dump(data, oz)

    print('Found: %d\tMissing: %d\tNo Metadata: %d\tTotal Recs: %d' % (len(found), len(missing), no_metadata, len(data)))

    return data, found, missing

In [93]:
# Try deduping ambiguous records by extracting metadata from unmatched files
#Runs only while no metadata file exists at metadata_path.
#NOTE(review): parse_metadata_file itself writes metadata_path, so after a fresh parse this guard
#is already False and the step is skipped - confirm which condition is intended.
if not os.path.exists(metadata_path):
    data, found, missing = disambiguate_dup_filenames(data, data_to_disambiguate, mp3s_path, metadata_path)
    #NOTE(review): a bare expression inside an if block is not echoed under IPython's default settings
    len(data) #453830

0 - found: 0, missing: 0, no meta: 0, recs: 440115
1000 - found: 0, missing: 44, no meta: 44, recs: 440115
2000 - found: 0, missing: 99, no meta: 84, recs: 440115
3000 - found: 0, missing: 169, no meta: 120, recs: 440115
4000 - found: 343, missing: 241, no meta: 162, recs: 440458
5000 - found: 880, missing: 312, no meta: 193, recs: 440995
6000 - found: 1380, missing: 403, no meta: 221, recs: 441495
7000 - found: 1890, missing: 480, no meta: 257, recs: 442005
8000 - found: 2381, missing: 590, no meta: 290, recs: 442496
9000 - found: 2849, missing: 724, no meta: 315, recs: 442964
10000 - found: 3329, missing: 842, no meta: 345, recs: 443444
11000 - found: 3792, missing: 965, no meta: 366, recs: 443907
12000 - found: 4193, missing: 1108, no meta: 388, recs: 444308
13000 - found: 4621, missing: 1246, no meta: 407, recs: 444736
14000 - found: 5031, missing: 1394, no meta: 435, recs: 445146
15000 - found: 5426, missing: 1548, no meta: 453, recs: 445541
16000 - found: 5828, missing: 1703, no 

In [213]:
def disambiguate_numbered_files(data, data_to_disambiguate, mp3s_path, metadata_path):
    """Match files named "<name> (<number>).mp3" to the candidate records of "<name>.mp3".

    Every such file under mp3s_path whose un-numbered name is a key of data_to_disambiguate has
    its tags read with mediainfo; the first candidate record whose 'artist_name' equals the
    normalized artist tag is stored in data under the numbered filename.

    A file without an artist tag is left unmatched: comparing an empty tag with an empty
    'artist_name' would accept a candidate without any evidence.

    Side effects: mutates data, prints progress and writes data to metadata_path after every
    1000th match and once more at the end.

    Args:
        data: mp3 filename -> record, extended in place.
        data_to_disambiguate: un-numbered mp3 filename -> list of candidate records.
        mp3s_path: folder holding the mp3 files.
        metadata_path: gzip-compressed JSON file that receives data.

    Returns:
        (data, found): data itself and the numbered filenames that were matched.
    """
    no_metadata = 0
    found = []
    checked = 0

    #raw string: '\(' in a plain literal is an invalid escape sequence; the whole name has to match
    numbered_name = re.compile(r'(.+) \([0-9]+\)\.mp3')

    #find files with (number) at the end, remove the number and try to match in dup data
    for fn in os.listdir(mp3s_path):
        #shouldn't happen
        if fn in data:
            continue

        numbered = numbered_name.fullmatch(fn)
        if numbered is None:
            continue

        file_name = numbered.group(1) + '.mp3'
        if file_name not in data_to_disambiguate:
            continue

        checked += 1
        if checked%1000==0:
            print(len(found), no_metadata)

        fp = '%s/%s' % (mp3s_path, fn)

        metadata = mediainfo(fp).get('TAG', None)
        if metadata is None:
            no_metadata += 1
            continue

        artist = format_text(metadata['artist']) if 'artist' in metadata else ''
        if not artist:
            #nothing to compare against
            continue

        for dup_rec in data_to_disambiguate[file_name]:
            if dup_rec['artist_name'] == artist:
                data[fn] = dup_rec
                found.append(fn)

                #this is slow so store periodically
                if len(found)%1000==0:
                    print('Storing %d records' % len(data))
                    with gzip.open(metadata_path, 'wt', encoding='utf-8') as oz:
                        json.dump(data, oz)
                break

    with gzip.open(metadata_path, 'wt', encoding='utf-8') as oz:
        json.dump(data, oz)

    print('Found: %d\tNo metadata: %d\tTotal Recs: %d' % (len(found), no_metadata, len(data)))

    return data, found

In [214]:
#if there were duplicates of a song name then the mp3 had (number) added to the end
#locate these and update the filename in the data dict
#NOTE(review): parse_metadata_file itself writes metadata_path, so after a fresh parse this guard
#is already False and the step is skipped - confirm which condition is intended.

if not os.path.exists(metadata_path):
    data, found = disambiguate_numbered_files(data, data_to_disambiguate, mp3s_path, metadata_path)
    #NOTE(review): a bare expression inside an if block is not echoed under IPython's default settings
    len(data) #445385

917 13
Storing 398801 records
1827 29
Storing 399801 records
2757 40
Storing 400801 records
3671 53
Storing 401801 records
4584 67
Storing 402801 records
5506 79
Storing 403801 records
6415 93
Storing 404801 records
7302 114
Storing 405801 records
8228 125
Storing 406801 records
9155 137
Storing 407801 records
10086 147
10993 170
Storing 408801 records
11929 177
Storing 409801 records
12848 198
Storing 410801 records
13773 215
Storing 411801 records
14683 233
Storing 412801 records
15599 249
Storing 413801 records
16524 265
Storing 414801 records
17431 283
Storing 415801 records
18333 304
Storing 416801 records
19249 317
Storing 417801 records
20168 330
Storing 418801 records
21089 344
21988 363
Storing 419801 records
22904 381
Storing 420801 records
23829 393
Storing 421801 records
24757 408
Storing 422801 records
25660 421
Storing 423801 records
26584 436
Storing 424801 records
27499 455
Storing 425801 records
28426 469
Storing 426801 records
29352 477
Storing 427801 records
30265 49

In [27]:
def add_size_to_data(data, mp3s_path, metadata_path, report_every=10000):
    """Store the file size in MB under 'size_mb' for every record whose file exists.

    Side effects: mutates the records in data, prints a progress line and writes data to
    metadata_path every report_every records, and writes it once more at the end so that
    sizes added after the last progress report are stored too.

    Args:
        data: mp3 filename -> record.
        mp3s_path: folder holding the mp3 files.
        metadata_path: gzip-compressed JSON file that receives data.
        report_every: number of records between progress reports (positive integer).

    Returns:
        data itself.
    """
    ttl = 0
    not_found = []
    over_5mb = 0

    for i, fn in enumerate(data.keys()):
        if i and i%report_every==0:
            #not_found can still be empty here, so do not index it blindly
            last_not_found = not_found[-1] if not_found else None
            print('Records checked so far: %d\tRecs found: %d\t> 5 Mb: %d\tNot Found: %d, including %s' % (
                i, ttl, over_5mb, len(not_found), last_not_found))
            with gzip.open(metadata_path, 'wt', encoding='utf-8') as oz:
                json.dump(data, oz)

        fp = '%s/%s' % (mp3s_path, fn)
        if not os.path.exists(fp):
            not_found.append(fn)
            continue

        ttl += 1

        size_mb = os.path.getsize(fp)/1024**2
        data[fn]['size_mb'] = size_mb
        if size_mb > 5:
            over_5mb += 1

    with gzip.open(metadata_path, 'wt', encoding='utf-8') as oz:
        json.dump(data, oz)

    return data

In [28]:
#store the size in MB ('size_mb') on every record whose file exists under mp3s_path
data = add_size_to_data(data, mp3s_path, metadata_path)
#224430 or so found

Records checked so far: 10000	Recs found: 4631	> 5 Mb: 743	Not Found: 5369, including Representando.mp3
Records checked so far: 20000	Recs found: 9254	> 5 Mb: 1476	Not Found: 10746, including Van_y_vienen.mp3
Records checked so far: 30000	Recs found: 13862	> 5 Mb: 2293	Not Found: 16138, including Stay_dot_calm.mp3
Records checked so far: 40000	Recs found: 18431	> 5 Mb: 3085	Not Found: 21569, including How_It_Used_To_Be.mp3
Records checked so far: 50000	Recs found: 22951	> 5 Mb: 3820	Not Found: 27049, including Peacefull_Summit.mp3
Records checked so far: 60000	Recs found: 27471	> 5 Mb: 4539	Not Found: 32529, including Resa_Tunnel_Vision_Rem.mp3
Records checked so far: 70000	Recs found: 32024	> 5 Mb: 5247	Not Found: 37976, including Dont_put_a_Spell_on_Me.mp3
Records checked so far: 80000	Recs found: 36557	> 5 Mb: 6037	Not Found: 43443, including Southern_Rag_Medley_II.mp3
Records checked so far: 90000	Recs found: 41062	> 5 Mb: 6750	Not Found: 48938, including El_Negrito_Del_Batey.mp3
R

In [None]:
#
# Clean up some obviously wrong genres
#
# Map all 1000+ genres to a normalized set of genres.
# Extract metadata from files and store this info also in metadata.json. Provides artist name and genre.
# If both genres are the same, then there is more evidence that the assignment is correct.
# Maybe look for other sources of evidence, music lists that include genre.
#

In [19]:
def read_genre_map(genre_map_path):
    """Read the genre map file: one "<genre><TAB><mapped genre>" pair per line.

    Blank lines are skipped.

    Args:
        genre_map_path: path of the tab-separated map file.

    Returns:
        dict mapping genre -> mapped genre.

    Raises:
        ValueError: if a non-blank line does not consist of exactly two tab-separated values.
    """
    genre_map = {}
    with open(genre_map_path, 'r') as f:
        for line in f:
            line = line.replace('\n','')
            #a line read from a file still carries its newline, so test for blank after removing it
            if not line.strip():
                continue
            g, gm = line.split('\t')
            genre_map[g] = gm

    return genre_map


def write_genre_map(genre_map, genre_map_path):
    """Write genre_map as "<genre><TAB><mapped genre>" lines, ordered by mapped genre."""
    with open(genre_map_path, 'w') as out:
        #stable sort on the mapped genre keeps the dict order within each group
        pairs = sorted(genre_map.items(), key=lambda pair: pair[1])
        for source_genre, mapped_genre in pairs:
            out.write('%s\t%s\n' % (source_genre, mapped_genre))

In [10]:
#reuse a stored genre map when the file exists; otherwise genre_map keeps its current value
if os.path.exists(genre_map_path):
    genre_map = read_genre_map(genre_map_path)

In [97]:
#print counts for each genre from metadata file
#(most frequent genre first, and only while no genre map has been loaded or built)

if not genre_map:
    for g in sorted(genre_cts, key=genre_cts.get, reverse=True):
        print('%s\t\t%d' % (g, genre_cts[g]))
    
#which genres are likely to be the most distinct? 
#we can do statistical analysis after feature extraction is done to find the most distinctive classes

rock		44037
alternative_general		31863
pop		17196
punk		16677
indie		14862
electronica		11363
experimental		11299
hip_hop		11049
experimental/post_rock		11035
acoustic		10133
techno		9926
ambient		9293
trance		7785
alternative_metal		7574
heavy_metal		7531
emo		7028
beats		6787
pop_punk		6742
folk		6532
indie_pop/lo_fi		6531
metalcore		6242
industrial_electronic		6241
dance		6206
drum_n'_bass		5973
power_pop		5740
hardcore_punk		5393
aaa/adult_alternative		5020
melodic_trance		4950
breakbeat/breaks		4362
rap		4323
guitar_rock		4264
new_age		4035
instrumental_rock		3889
general_comedy		3796
death_metal		3733
house		3498
alternative_hip_hop		3378
folk_rock		3227
blues_rock		3226
down_tempo		3156
mood_music		3149
film_music		3135
progressive_rock		3092
jazz_fusion		3062
ska		3044
new_country		2933
country_general		2910
piano		2868
progressive_trance		2862
love_songs		2857
general_jazz		2843
world_fusion		2838
noise		2812
alternative_country		2673
psychedelic		2668
groove		2650
industrial	

In [16]:
def map_genres(genre_cts, min_examples=1000):
    """Build a map from raw genre names to a small set of normalized genres.

    A genre is mapped by the first rule that applies and is never remapped afterwards,
    except for four 'rock' substring hits that are released again for a later group:
      1. substring rules: the first entry of substrs contained in the genre name
      2. fallback substring rules: 'core' -> metal, 'grass' -> bluegrass
      3. direct lists: fixed genre names per target (added even if absent from genre_cts)
      4. any remaining genre with more than min_examples records maps to itself

    Every new mapping is printed as "<genre> -> <target>".

    Args:
        genre_cts: dict mapping genre name -> number of records.
        min_examples: a genre not covered by rules 1-3 is kept under its own name only
            if its count is greater than this value.

    Returns:
        dict mapping genre name -> normalized genre.
    """
    #map subtypes to types with substrings; order matters, e.g. 'pop_punk' becomes 'punk'
    substrs = ['blues', 'punk', 'pop', 'rock', 'jazz', 'country', 'metal', 'techno',
               'reggae', 'easy_listening', 'dance', 'house', 'trance', 'industrial', 'rap',
               'hip_hop', 'ambient', 'electronic', 'classical', 'children']

    #various other mappings, applied to whatever the list above left unmapped
    fallback_substrs = [('core', 'metal'), ('grass', 'bluegrass')]

    #direct mappings
    rock_genres = ['indie', 'alternative_general', 'grunge', 'psychedelic', 'acid', 'ska', 'garage', 'glam']

    #the substring rule sends these to 'rock'; release them so a later group can claim them
    not_rock = ['rock-n-roll_oldies', 'soft_rock', 'goth_rock', 'rockabilly']

    direct_groups = [
        ('oldies', ['soft_rock', 'adult_contemporary', 'swing/big_band', 'love_songs', 'lounge',
                    'aaa/adult_alternative', 'oldies', 'standards', 'crooners/vocals', 'romantic',
                    'rockabilly', 'rock-n-roll_oldies']),
        ('gospel', ['gospel', 'spiritual']),
        ('pop', ['contemporary_urban', 'contemporary', 'crossover', 'alternative_cover_songs']),
        ('ambient', ['mood_music', 'new_age']),
        ('classical', ['choral', 'chamber_music', 'baroque', 'medieval', 'renaissance', 'symphonic', 'opera',
                       'vocal', 'ensemble', 'piano', 'guitar', 'solo_instruments', 'woodwinds', 'strings']),
        ('jazz', ['world_fusion', 'soul', 'detroit', 'bebop', 'mod']),
        ('experimental', ['minimal', 'minimalist', 'experimental', 'noise', 'abstract']),
        ('funk', ['funk', 'funky_breaks', 'groove']),
        ('goth', ['goth_rock', 'darkwave', 'emo', 'goth']),
        ('hip_hop', ['trip_hop', 'west_coast', 'east_coast']),
        ('rap', ['new_school', 'old_school']),
        #just lumping these together
        ('world', ['world/folk_cover_songs', 'general_latin', 'asian', 'americana', 'world_traditions',
                   'celtic', 'native_american', 'tribal', 'salsa', 'caribbean', 'african', 'brazilian',
                   'russian', 'indian', 'jewish/israeli', 'french', 'nouveau_flamenco', 'cuban', 'arabic',
                   'merengue', 'tropical', 'scandinavian', 'bossa_nova', 'european', 'south/central_american',
                   'traditional', 'quebecois', 'jungle']),
        #map types of spoken word
        ('spoken_word', ['humor', 'self-help', 'stories_and_myths', 'audio_books', 'parodies', 'general_comedy',
                         'political_humor', 'interviews', 'satire', 'poetry', 'spoken_word', 'politics',
                         'speak_your_mind', 'paranormal', 'aliens', 'bodily_functions', 'shout_outs', 'witchcraft',
                         'horror_stories', 'commercials', 'mental_health', 'spoofs', 'recorded_greetings',
                         'lies', 'movies', 'opinions', 'education', 'emergency!', 'nonfiction',
                         'hypnosis', 'advice', 'radio', 'time_capsule_recordings', 'dumb_stories', 'love']),
    ]

    genre_map = {}

    def assign(genre, target):
        #first rule wins: never overwrite an existing mapping
        if genre not in genre_map:
            print('%s -> %s' % (genre, target))
            genre_map[genre] = target

    for g in genre_cts:
        for substr in substrs:
            if substr in g:
                assign(g, substr)
                break

    for needle, target in fallback_substrs:
        for g in genre_cts:
            if needle in g:
                assign(g, target)

    for g in rock_genres:
        assign(g, 'rock')

    #overwrite some; pop() instead of del because these genres need not occur in genre_cts
    for g in not_rock:
        genre_map.pop(g, None)

    for target, genres in direct_groups:
        for g in genres:
            assign(g, target)

    #add remaining if there are enough instances
    for g, c in genre_cts.items():
        if c > min_examples:
            assign(g, g)

    return genre_map

In [17]:
#build the genre map only when none was loaded, then store it at genre_map_path
if not genre_map:
    genre_map = map_genres(genre_cts, min_examples=1000)
    write_genre_map(genre_map, genre_map_path)

techno -> techno
power_metal -> metal
house -> house
rock -> rock
classical_general -> classical
industrial -> industrial
progressive_trance -> trance
electronica -> electronic
dance -> dance
punk -> punk
pop -> pop
hip_hop -> hip_hop
intelligent_techno -> techno
hard_trance -> trance
heavy_metal -> metal
jump_blues -> blues
progressive_metal -> metal
rhythm_&_blues -> blues
symphonic_electronica -> electronic
goth_rock -> rock
hard_house -> house
trance -> trance
reggae -> reggae
industrial_rock -> rock
death_metal -> metal
power_pop -> pop
general_jazz -> jazz
ambient -> ambient
guitar_rock -> rock
instrumental_rock -> rock
alternative_metal -> metal
blues_vocals -> blues
general_blues -> blues
hardcore_punk -> punk
rap -> rap
folk_punk -> punk
experimental/post_rock -> rock
black_metal -> metal
folk_rock -> rock
acoustic_blues -> blues
rockabilly -> rock
alternative_country -> country
christian_pop -> pop
country_general -> country
thrash/speed_metal -> metal
jazz_vocals -> jazz
ska

In [None]:
# add metadata from files to data dict

In [261]:
def add_extracted_metadata_and_size(data, mp3s_path, genre_map, metadata_path):
    """Add the file size and the tags stored inside each mp3 file to its record.

    For every record whose file exists under mp3s_path this sets
      'mb'                   - file size in MB
      'mp3_metadata_genre'   - normalized genre tag, if the file has one
      'genre_match'          - True when the tag genre and the record genre agree after both
                               are passed through genre_map (unmapped names are compared as is)
      'mp3_metadata_artist'  - normalized artist tag, if present
      'mp3_metadata_encoder' - lower-cased encoder tag, if present

    Progress is printed and data is written to metadata_path every 10000 records (starting
    with the first one) and once more at the end. Returns data, which is modified in place.
    """
    summary = 'Total: %d\tExtracted genre count: %d\tMatching genres: %d\t> 5 Mb: %d'
    files_seen = 0
    genres_read = 0
    genres_agree = 0
    large_files = 0

    def report_and_store():
        #reads the counters of the enclosing function at call time
        print(summary % (files_seen, genres_read, genres_agree, large_files))
        with gzip.open(metadata_path, 'wt', encoding='utf-8') as gz_out:
            json.dump(data, gz_out)

    for position, fn in enumerate(data.keys()):
        if position % 10000 == 0:
            report_and_store()

        fp = '%s/%s' % (mp3s_path, fn)
        if not os.path.exists(fp):
            continue

        files_seen += 1
        rec = data[fn]

        rec['mb'] = os.path.getsize(fp)/1024**2
        if rec['mb'] > 5:
            large_files += 1

        #tags = {'encoder': 'LAME3.92 ', 'title': 'Believe', 'artist': 'DREAMTRONIX',
        #        'comment': 'http://www.mp3.com/DREAMTRONIX','genre': 'Blues'}
        tags = mediainfo(fp).get('TAG', None)
        if tags is None:
            continue

        if 'genre' in tags:
            genres_read += 1

            tag_genre = format_text(tags['genre'])
            rec['mp3_metadata_genre'] = tag_genre

            listed_genre = rec['genre']
            if genre_map.get(tag_genre, tag_genre) == genre_map.get(listed_genre, listed_genre):
                rec['genre_match'] = True
                genres_agree += 1

        if 'artist' in tags:
            rec['mp3_metadata_artist'] = format_text(tags['artist'])

        if 'encoder' in tags:
            rec['mp3_metadata_encoder'] = tags['encoder'].lower()

    report_and_store()

    return data

In [None]:
#add the file size and the tags read from each mp3 file to its record; data is stored as it goes
data = add_extracted_metadata_and_size(data, mp3s_path, genre_map, metadata_path)

Total: 0	Extracted genre count: 0	Matching genres: 0	> 5 Mb: 0
Total: 4631	Extracted genre count: 4510	Matching genres: 100	> 5 Mb: 743
Total: 9254	Extracted genre count: 9015	Matching genres: 219	> 5 Mb: 1476
Total: 13862	Extracted genre count: 13498	Matching genres: 347	> 5 Mb: 2293
Total: 18431	Extracted genre count: 17942	Matching genres: 447	> 5 Mb: 3085
Total: 22951	Extracted genre count: 22378	Matching genres: 559	> 5 Mb: 3820
Total: 27471	Extracted genre count: 26785	Matching genres: 666	> 5 Mb: 4539
Total: 32024	Extracted genre count: 31240	Matching genres: 755	> 5 Mb: 5247


In [None]:
#
# Get some samples from each genre
#

In [None]:
#mapped genres that are never sampled
excluded_genres = {'sound_effects', 'spoken_word'}

In [241]:
def get_examples(data, mp3s_path, local_mp3s_path, genre_map, excluded_genres, num_per_genre, max_mb=5):
    """Copy randomly chosen mp3 files into one folder per mapped genre.

    Files already present under local_mp3s_path/<genre> count towards num_per_genre, so running
    this again only tops the folders up. A record is used when its genre is in genre_map, the
    mapped genre is not excluded, its file exists under mp3s_path and is at most max_mb MB.

    The size is read from the record ('mb', or 'size_mb' as written by add_size_to_data) and
    measured from the file when the record has neither.

    Side effects: creates folders, copies files, prints each newly created genre and the number
    of copied files after every 100th copy.

    Args:
        data: mp3 filename -> record with at least 'genre'.
        mp3s_path: folder holding the mp3 files.
        local_mp3s_path: folder that receives the per-genre samples (created if missing).
        genre_map: raw genre -> mapped genre.
        excluded_genres: mapped genres to leave out.
        num_per_genre: maximum number of files per genre folder.
        max_mb: maximum file size in MB.

    Returns:
        None.
    """
    ttl = 0

    genre_num = {g:0 for g in set(genre_map.values())}

    #count what earlier runs already copied; tolerate stray files and folders of unmapped genres
    os.makedirs(local_mp3s_path, exist_ok=True)
    for genre in os.listdir(local_mp3s_path):
        genre_path = '%s/%s' % (local_mp3s_path, genre)
        if not os.path.isdir(genre_path):
            continue
        genre_num[genre] = genre_num.get(genre, 0) + len(os.listdir(genre_path))

    fns = list(data.keys())
    random.shuffle(fns)

    for fn in fns:
        meta = data[fn]
        raw_genre = meta.get('genre', '')
        if raw_genre not in genre_map:
            continue

        genre = genre_map[raw_genre]
        if genre in excluded_genres:
            continue

        if genre_num[genre] >= num_per_genre:
            continue

        in_path = '%s/%s' % (mp3s_path, fn)
        if not os.path.exists(in_path):
            continue

        #records of files that were never sized carry no size key
        size_mb = meta.get('mb', meta.get('size_mb'))
        if size_mb is None:
            size_mb = os.path.getsize(in_path)/1024**2
        if size_mb > max_mb:
            continue

        genre_path = '%s/%s' % (local_mp3s_path, genre)
        out_path = '%s/%s' % (genre_path, fn)
        #already downloaded
        if os.path.exists(out_path):
            continue

        #create the folder only once there is a file to put into it
        if not os.path.exists(genre_path):
            print(genre)
            os.makedirs(genre_path)

        genre_num[genre] += 1
        shutil.copy(in_path, out_path)
        ttl += 1

        if ttl%100==0:
            print(ttl)

In [242]:
#copy up to num_per_genre randomly chosen files per mapped genre into local_mp3s_path
num_per_genre = 100
get_examples(data, mp3s_path, local_mp3s_path, genre_map, excluded_genres, num_per_genre)

dance
rock
hip_hop
metal
pop
rap
experimental
jazz
punk
extreme_metal
blues
country
industrial
goth
soft_rock
acoustic
ambient
folk
world
childrens
soundtrack
old_rock
reggae
100
classical
instrumental
vocals
funk
200
holiday
300
spiritual
bluegrass
400
500
600
700
800
vocal
900
1000
1100
1200
1300
1400
1500
1600
1700
musicals
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
