In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import librosa
import os
from tqdm import tqdm_notebook
import sys
from itertools import islice
from random import shuffle
from sklearn.cluster import MiniBatchKMeans
from time import time
from sklearn.externals import joblib

In [27]:
# Directory containing raw audio files: one sub-directory per artist, each
# holding that artist's tracks. The trailing slash matters because several
# cells build paths by plain string concatenation.
AUDIO_DIR = '/Volumes/thesis/audio/'
# Directories to write features to (relative to the notebook's working
# directory, each with a trailing slash for the same reason).
# Zero-padded MFCC matrix of the first track of each artist
MFCC_WRITE_DIR = 'data/features/mfcc/'
# Zero-padded mel spectrogram of the first track of each artist
MEL_WRITE_DIR = 'data/features/mel_spec/'
# Unpadded MFCC matrix of every track, one sub-directory per artist
MFCC_ALL_WRITE_DIR = 'data/features/mfcc_all_unpadded/'
# Bag-of-words histograms over the k-means codebook, one sub-directory per artist
BOW_WRITE_DIR = 'data/features/bow_500/'

In [19]:
# Load the cleaned AllMusic artist table.
# NOTE(review): `artists` is not referenced by any other cell visible in this
# notebook -- confirm whether this load is still needed.
artists = pd.read_csv('data/allmusic/artists_cleaned.csv')

# Create Mel-frequency cepstral coefficient (MFCC) representations of the first track for each artist we have audio for

In [None]:
# Maps artist id (the artist's directory name under AUDIO_DIR) to the MFCC
# matrix of that artist's first track
mfcc_dict = {}

In [None]:
# Compute a 13-coefficient MFCC matrix for the first track of every artist
# and keep it in `mfcc_dict`, keyed by the artist's directory name.
for artist in tqdm_notebook(os.listdir(AUDIO_DIR)):
    artist_dir = os.path.join(AUDIO_DIR, artist)

    # Stray files at the top level of AUDIO_DIR (e.g. '.DS_Store') are not
    # artists; listing one as a directory would raise and abort the whole loop.
    if not os.path.isdir(artist_dir):
        continue

    # Find the first track (zero-indexed). os.listdir returns entries in
    # arbitrary order, so sort to make the choice deterministic when more
    # than one file name starts with '0'.
    first_track = None
    for track in sorted(os.listdir(artist_dir)):
        if track.startswith('0'):
            first_track = track
            break

    # Artists without a matching track are skipped
    if first_track is None:
        continue

    # Create MFCC representation of track
    try:
        y, sr = librosa.load(os.path.join(artist_dir, first_track))
        mfcc_dict[artist] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    except Exception as e:
        # Best effort: report the error for this track and carry on
        print(e)

In [None]:
# Find the largest shape (rows, frames) over all stored MFCC matrices
shapes = [matrix.shape for matrix in mfcc_dict.values()]
max_dim = np.max(shapes, axis=0)

# Replace every matrix by a copy that is right-padded with zero frames, so
# that all entries of mfcc_dict end up with the same shape
for artist_id in list(mfcc_dict.keys()):
    padded = np.zeros(max_dim)
    unpadded = mfcc_dict[artist_id]
    n_frames = unpadded.shape[1]
    padded[:, :n_frames] = unpadded
    mfcc_dict[artist_id] = padded

In [None]:
# Write mfcc arrays to files, one '<artist id>.npy' per artist.
# np.save does not create missing directories, so make sure the target
# exists first (the per-artist feature cells do the same for their outputs).
if not os.path.isdir(MFCC_WRITE_DIR):
    os.makedirs(MFCC_WRITE_DIR)

for i, mfcc in tqdm_notebook(mfcc_dict.items()):
    np.save(os.path.join(MFCC_WRITE_DIR, '{}.npy'.format(i)), mfcc)

# MFCC Extraction for all tracks for all artists

In [None]:
# Compute a 13-coefficient MFCC matrix for every track of every artist and
# save it, unpadded, as MFCC_ALL_WRITE_DIR/<artist>/<track name>.npy
for artist in tqdm_notebook(os.listdir(AUDIO_DIR)):
    artist_audio_path = os.path.join(AUDIO_DIR, artist)

    # Skip stray files at the top level of AUDIO_DIR (e.g. '.DS_Store'):
    # listing one as a directory would raise and abort the whole run
    if not os.path.isdir(artist_audio_path):
        continue

    # Create directory for each artist if it does not exist yet
    artist_mfcc_path = os.path.join(MFCC_ALL_WRITE_DIR, artist)

    if not os.path.isdir(artist_mfcc_path):
        os.makedirs(artist_mfcc_path)

    for track in os.listdir(artist_audio_path):
        # Create MFCC representation of track
        try:
            y, sr = librosa.load(os.path.join(artist_audio_path, track))
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            # The output name is the track name up to its first '.mp3'
            track_name = track.split('.mp3')[0]
            np.save(os.path.join(artist_mfcc_path, '{}.npy'.format(track_name)), mfcc)
        except Exception as e:
            # Best effort: report which track failed and keep going
            print('{} {}'.format(artist, track))
            print(e)
            

## Calculate mean and standard deviation of each MFCC to normalize

In [None]:
# First pass over every saved MFCC matrix: accumulate the per-coefficient
# sum and the total number of frames, from which the means are derived.
mfcc_sum = np.zeros(13,)
frame_count = 0

for artist in tqdm_notebook(os.listdir(MFCC_ALL_WRITE_DIR)):
    artist_dir = MFCC_ALL_WRITE_DIR + artist
    for song in os.listdir(artist_dir):
        mfcc = np.load('{}/{}'.format(artist_dir, song))
        # Sum each coefficient (row) over the frames (columns) of this track
        per_coefficient_total = mfcc.sum(axis=1)
        mfcc_sum += per_coefficient_total
        frame_count += mfcc.shape[1]

In [None]:
# Sanity check: every saved MFCC file of one sample artist can be loaded.
sample_artist_dir = '/Volumes/thesis/features/mfcc_all_unpadded/0001174080'

for f in os.listdir(sample_artist_dir):
    # Load the individual file; np.load cannot open the directory itself
    np.load(os.path.join(sample_artist_dir, f))

In [None]:
# Per-coefficient mean over all frames of all tracks; relies on `mfcc_sum`
# and `frame_count` accumulated by the pass over the saved MFCC files
mfcc_means = mfcc_sum / frame_count

In [None]:
# Persist the means; the codebook section reloads them from this file.
# np.save does not create directories: 'kmeans_helpers/' must already exist.
np.save('kmeans_helpers/mfcc_means.npy', mfcc_means)

In [None]:
# Second pass over every saved MFCC matrix: accumulate the squared deviation
# of each coefficient from its mean, then derive the per-coefficient
# (population) standard deviation.
sq_dev_sum = np.zeros(13,)

for artist in tqdm_notebook(os.listdir(MFCC_ALL_WRITE_DIR)):
    for song in os.listdir(MFCC_ALL_WRITE_DIR + artist):
        mfcc = np.load(MFCC_ALL_WRITE_DIR + artist + '/' + song)
        # Broadcast the per-coefficient means across the frame axis
        deviations = mfcc - mfcc_means[:, np.newaxis]
        # Sum over frames. The result must stay 1-D so it matches the
        # accumulator: adding a column vector in place to a 1-D array of the
        # same length broadcasts to a square matrix and raises.
        sq_dev_sum += (deviations ** 2).sum(axis=1)

# Divide by the number of frames counted while computing the means
mfcc_stds = np.sqrt(sq_dev_sum / frame_count)

In [None]:
# Persist the standard deviations; the codebook section reloads them from
# this file. np.save does not create directories: 'kmeans_helpers/' must
# already exist.
np.save('kmeans_helpers/mfcc_stds.npy', mfcc_stds)

## Create codebook for normalized MFCCs via streaming kmeans clustering

In [4]:
# Reload the saved normalization statistics so the cells below can run
# without repeating the two passes over the MFCC files
mfcc_means = np.load('kmeans_helpers/mfcc_means.npy')
mfcc_stds = np.load('kmeans_helpers/mfcc_stds.npy')

In [5]:
# Collect the path of every saved MFCC file. os.listdir returns entries in
# arbitrary order, so sort at both levels to get a stable starting order.
paths = []

for artist in sorted(os.listdir(MFCC_ALL_WRITE_DIR)):
    artist_dir = MFCC_ALL_WRITE_DIR + artist
    for song in sorted(os.listdir(artist_dir)):
        paths.append(artist_dir + '/' + song)

# Shuffle path names in place. A fixed seed on top of the sorted listing
# gives the same batch order on every run.
np.random.RandomState(0).shuffle(paths)

In [6]:
def generate_batch(paths, batch_size=1000):
    """Yield consecutive slices of ``paths`` with at most ``batch_size`` items.

    ``paths`` must support ``len()`` and slicing (e.g. a list of paths to
    mfcc files). The final slice is shorter when the number of paths is not
    a multiple of ``batch_size``.
    """
    total = len(paths)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        # Clamp the upper bound to the end of the sequence
        yield paths[start:stop if stop < total else total]

In [7]:
# Fit a 500-cluster codebook on normalized MFCC frames, streaming the files
# in batches so that only one batch of frames is held in memory at a time.
# random_state pins the centroid initialisation so that a re-run with the
# same batches yields the same codebook.
kmeans = MiniBatchKMeans(n_clusters=500, random_state=0)
count = 0

for batch in generate_batch(paths, 1000):
    count += 1
    print('Fitting batch {}'.format(count))

    # Read in mfcc vectors and normalize
    batch_frames = []
    for path in batch:
        # Shape is (13, num_frames)
        mfcc = np.load(path)
        # Normalize by subtracting mean and dividing by std_dev; transposed
        # so that each row is one frame
        batch_frames.append((mfcc.T - mfcc_means) / mfcc_stds)

    # Stack the per-track matrices into one (total_frames, 13) array rather
    # than appending frame vectors to a Python list one at a time
    X = np.concatenate(batch_frames, axis=0)

    # Update kmeans using batch
    kmeans.partial_fit(X)

Fitting batch 1
Fitting batch 2
Fitting batch 3
Fitting batch 4
Fitting batch 5
Fitting batch 6
Fitting batch 7
Fitting batch 8
Fitting batch 9
Fitting batch 10
Fitting batch 11
Fitting batch 12
Fitting batch 13
Fitting batch 14
Fitting batch 15
Fitting batch 16
Fitting batch 17
Fitting batch 18
Fitting batch 19
Fitting batch 20
Fitting batch 21
Fitting batch 22
Fitting batch 23
Fitting batch 24
Fitting batch 25
Fitting batch 26
Fitting batch 27
Fitting batch 28
Fitting batch 29
Fitting batch 30
Fitting batch 31
Fitting batch 32
Fitting batch 33
Fitting batch 34
Fitting batch 35
Fitting batch 36
Fitting batch 37
Fitting batch 38
Fitting batch 39
Fitting batch 40
Fitting batch 41
Fitting batch 42
Fitting batch 43
Fitting batch 44
Fitting batch 45
Fitting batch 46
Fitting batch 47
Fitting batch 48
Fitting batch 49
Fitting batch 50
Fitting batch 51
Fitting batch 52
Fitting batch 53
Fitting batch 54
Fitting batch 55
Fitting batch 56
Fitting batch 57
Fitting batch 58
Fitting batch 59
Fittin

In [8]:
# Persist the fitted codebook to disk. 'kmeans_helpers/' must already exist.
# NOTE(review): this notebook imports joblib from `sklearn.externals`; newer
# scikit-learn releases reportedly no longer ship that module -- confirm
# against the installed version.
joblib.dump(kmeans, 'kmeans_helpers/kmeans_500.pkl')

['kmeans_helpers/kmeans_500.pkl']

# Bag of Words Representation for MFCC features using Kmeans quantization

In [29]:
# Quantize every track against the k-means codebook: each normalized MFCC
# frame is assigned to a cluster and the track is represented by the
# histogram of cluster counts (its "bag of words").
for artist in tqdm_notebook(os.listdir(MFCC_ALL_WRITE_DIR)):
    # Create directory for each artist if it does not exist yet
    artist_bow_path = BOW_WRITE_DIR + artist

    if not os.path.isdir(artist_bow_path):
        os.makedirs(artist_bow_path)

    for song in os.listdir(MFCC_ALL_WRITE_DIR + artist):
        try:
            mfcc = np.load(MFCC_ALL_WRITE_DIR + artist + '/' + song)
            # Normalize by subtracting mean and dividing by std_dev;
            # transposed so that each row is one frame
            mfcc_norm = (mfcc.T - mfcc_means) / mfcc_stds

            # Give cluster assignments for each frame
            cluster_assign = kmeans.predict(mfcc_norm)
            # Count the frames assigned to each cluster; minlength keeps one
            # slot per codebook entry even for clusters this track never hits
            bow = np.bincount(cluster_assign, minlength=kmeans.n_clusters)

            # Save bow feature representation under the song's base name
            song_name = song.split('.npy')[0]
            np.save(artist_bow_path + '/{}.npy'.format(song_name), bow)
        except Exception as e:
            # Catch Exception rather than everything so the loop can still be
            # interrupted, and say which file failed and why
            print('Failed: {} {}'.format(artist, song))
            print(e)

A Jupyter Widget




# Create Mel Spectrogram representations for each first track we have audio for

In [None]:
# Compute a 128-band mel spectrogram for the first track of every artist,
# zero-pad it to a fixed width and save it as MEL_WRITE_DIR/<artist>.npy

# np.save does not create missing directories
if not os.path.isdir(MEL_WRITE_DIR):
    os.makedirs(MEL_WRITE_DIR)

for artist in tqdm_notebook(os.listdir(AUDIO_DIR)):
    artist_dir = os.path.join(AUDIO_DIR, artist)

    # Stray files at the top level of AUDIO_DIR (e.g. '.DS_Store') are not
    # artists; listing one as a directory would raise and abort the whole loop.
    if not os.path.isdir(artist_dir):
        continue

    # Find the first track (zero-indexed). Sorted so the choice does not
    # depend on the arbitrary order returned by os.listdir.
    first_track = None
    for track in sorted(os.listdir(artist_dir)):
        if track.startswith('0'):
            first_track = track
            break

    # Artists without a matching track are skipped
    if first_track is None:
        continue

    # Create mel representation of track
    try:
        y, sr = librosa.load(os.path.join(artist_dir, first_track))
        # Pass the signal by keyword, as the MFCC cells do
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        # Add zero padding up to 1298 frames. A spectrogram with more frames
        # does not fit into the slice and raises; it is reported by the
        # handler below and no file is written for that artist.
        padded = np.zeros((128, 1298))
        padded[:, :mel_spec.shape[1]] = mel_spec
        np.save(os.path.join(MEL_WRITE_DIR, '{}.npy'.format(artist)), padded)
    except Exception as e:
        # Best effort: report the error for this track and carry on
        print(e)