In [1]:
import numpy as np
import pandas as pd
import librosa
import os
from tqdm import tqdm_notebook
import sys

In [18]:
# Directory containing raw audio files. Defaults to the original external
# volume; set the THESIS_AUDIO_DIR environment variable to point elsewhere
# without editing the notebook. os.path.join(..., '') guarantees a trailing
# separator so that `AUDIO_DIR + name` concatenation keeps working.
AUDIO_DIR = os.path.join(
    os.environ.get('THESIS_AUDIO_DIR', '/Volumes/thesis/audio/'), '')
# Directories to write features to (relative to the notebook's working dir)
MFCC_WRITE_DIR = 'data/features/mfcc/'
MEL_WRITE_DIR = 'data/features/mel_spec/'

In [2]:
# Read the cleaned AllMusic artist table into a DataFrame
artists_csv_path = 'data/allmusic/artists_cleaned.csv'
artists = pd.read_csv(artists_csv_path)

In [3]:
# Preview only the first rows instead of rendering the whole table
artists.head(10)

Unnamed: 0,name,url,active_period,genres,styles,id,main_genre,indegree,outdegree
0,Björk,https://www.allmusic.com/artist/bj%C3%B6rk-mn0...,1970s - 2010s,Pop/Rock|Electronic,Alternative/Indie Rock|Experimental Rock|Alter...,769444,Pop/Rock,26,84
1,Brigitte Fontaine,https://www.allmusic.com/artist/brigitte-fonta...,1970s - 2000s,Pop/Rock|International,Experimental|French Pop|French|Western Europea...,936658,Pop/Rock,5,5
2,Kate Bush,https://www.allmusic.com/artist/kate-bush-mn00...,1970s - 2010s,Pop/Rock,Art Rock|Alternative/Indie Rock|College Rock|A...,855423,Pop/Rock,11,65
3,Nina Hagen,https://www.allmusic.com/artist/nina-hagen-mn0...,1970s - 2010s,Pop/Rock,Alternative Pop/Rock|Alternative/Indie Rock|Ne...,414016,Pop/Rock,9,12
4,Zeena Parkins,https://www.allmusic.com/artist/zeena-parkins-...,1980s - 2010s,Avant-Garde|Jazz,Free Improvisation|Modern Composition,598083,Avant-Garde,10,8
5,David Bowie,https://www.allmusic.com/artist/david-bowie-mn...,1960s - 2010s,Pop/Rock,Album Rock|Art Rock|Contemporary Pop/Rock|Expe...,531986,Pop/Rock,32,358
6,Diamanda Galás,https://www.allmusic.com/artist/diamanda-gal%C...,1980s - 2010s,Avant-Garde|Classical|Pop/Rock,Avant-Garde Music|Experimental|No Wave|Punk/Ne...,253098,Avant-Garde,18,7
7,Lene Lovich,https://www.allmusic.com/artist/lene-lovich-mn...,1970s - 2010s,Pop/Rock,New Wave|Punk/New Wave|Alternative/Indie Rock|...,816012,Pop/Rock,6,9
8,Meredith Monk,https://www.allmusic.com/artist/meredith-monk-...,1960s - 2010s,Avant-Garde|Classical,Avant-Garde Music|Vocal Music|Opera|Modern Com...,406733,Avant-Garde,5,17
9,Peter Gabriel,https://www.allmusic.com/artist/peter-gabriel-...,1970s - 2010s,Pop/Rock|International,Album Rock|Art Rock|Contemporary Pop/Rock|Prog...,842802,Pop/Rock,24,58


# Create Mel-frequency cepstral coefficient (MFCC) representations of the first track of each artist for which we have audio

In [3]:
# Maps artist id -> MFCC representation of that artist's first track
mfcc_dict = dict()

In [4]:
# Build an MFCC matrix from the first track of every artist directory under
# AUDIO_DIR and store it in mfcc_dict, keyed by the directory name.
for artist in tqdm_notebook(os.listdir(AUDIO_DIR)):
    artist_dir = os.path.join(AUDIO_DIR, artist)

    # Skip stray files (e.g. .DS_Store): calling os.listdir() on a
    # non-directory raises outside the try block below and would abort the
    # whole run.
    if not os.path.isdir(artist_dir):
        continue

    # Find the first track (zero-indexed). os.listdir() returns entries in
    # arbitrary order, so sort to make the choice deterministic when several
    # file names start with '0'.
    first_track = None
    for track in sorted(os.listdir(artist_dir)):
        if track.startswith('0'):
            first_track = track
            break

    # Create MFCC representation of track
    if first_track is not None:
        try:
            y, sr = librosa.load(os.path.join(artist_dir, first_track))
            mfcc_dict[artist] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        except Exception as e:
            # Best effort: report which track failed and carry on with the
            # remaining artists. print(...) with a single argument works on
            # both Python 2 and Python 3.
            print('{}/{}: {}'.format(artist, first_track, e))

A Jupyter Widget











In [96]:
# Compute maximum dimensions for mfcc feature matrix
shapes = [mfcc.shape for mfcc in mfcc_dict.values()]

# Element-wise maximum over all collected shape tuples
max_dim = np.max(shapes, axis=0)

In [107]:
# Replace every MFCC matrix in mfcc_dict with a zero-padded copy of shape
# max_dim so that all matrices are the same size.
for artist_id in list(mfcc_dict):
    mfcc = mfcc_dict[artist_id]
    padded = np.zeros(max_dim)
    # Original values fill the leading columns; the remainder stays zero
    n_frames = mfcc.shape[1]
    padded[:, :n_frames] = mfcc
    mfcc_dict[artist_id] = padded

In [98]:
# Write mfcc arrays to files, one '<artist id>.npy' per dictionary entry.
# np.save() does not create missing directories, so make sure the target
# exists first (isdir check keeps this compatible with Python 2's makedirs).
if not os.path.isdir(MFCC_WRITE_DIR):
    os.makedirs(MFCC_WRITE_DIR)

for i, mfcc in tqdm_notebook(mfcc_dict.items()):
    np.save(os.path.join(MFCC_WRITE_DIR, '{}.npy'.format(i)), mfcc)

A Jupyter Widget




# Create mel spectrogram representations of the first track of each artist for which we have audio

In [21]:
# Compute a mel spectrogram from the first track of every artist directory
# under AUDIO_DIR, zero-pad it to a fixed width and save it as
# '<directory name>.npy' in MEL_WRITE_DIR.

# Number of mel bands (rows) of every saved spectrogram
N_MELS = 128
# Number of frames (columns) every saved spectrogram is padded to.
# NOTE(review): presumably the largest frame count observed in the MFCC
# section -- confirm before reusing on new audio.
MEL_MAX_FRAMES = 1298

# np.save() does not create missing directories
if not os.path.isdir(MEL_WRITE_DIR):
    os.makedirs(MEL_WRITE_DIR)

for artist in tqdm_notebook(os.listdir(AUDIO_DIR)):
    artist_dir = os.path.join(AUDIO_DIR, artist)

    # Skip stray files (e.g. .DS_Store): calling os.listdir() on a
    # non-directory raises outside the try block below and would abort the
    # whole run.
    if not os.path.isdir(artist_dir):
        continue

    # Find the first track (zero-indexed). os.listdir() returns entries in
    # arbitrary order, so sort to make the choice deterministic when several
    # file names start with '0'.
    first_track = None
    for track in sorted(os.listdir(artist_dir)):
        if track.startswith('0'):
            first_track = track
            break

    # Create mel representation of track
    if first_track is not None:
        try:
            y, sr = librosa.load(os.path.join(artist_dir, first_track))
            # Pass the signal by keyword, as the MFCC cell does; newer
            # librosa releases no longer accept it positionally.
            mel_spec = librosa.feature.melspectrogram(
                y=y, sr=sr, n_mels=N_MELS)
            # Add zero padding. A spectrogram wider than MEL_MAX_FRAMES does
            # not fit into the buffer; the assignment then raises and the
            # track is reported and skipped by the handler below.
            padded = np.zeros((N_MELS, MEL_MAX_FRAMES))
            padded[:, :mel_spec.shape[1]] = mel_spec
            np.save(os.path.join(MEL_WRITE_DIR, '{}.npy'.format(artist)),
                    padded)
        except Exception as e:
            # Best effort: report which track failed and carry on with the
            # remaining artists. print(...) with a single argument works on
            # both Python 2 and Python 3.
            print('{}/{}: {}'.format(artist, first_track, e))

A Jupyter Widget








