In [None]:
import librosa
import librosa.display
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd

from IPython.display import Audio
%matplotlib inline

# Intro to MIR (Music Information Retrieval)

## Part 1: Discrete Fourier Transform and friends

In [None]:
def create_sine(frequency, a=1, dur=1, sr=22050):
    """
    synthesizes a sine tone as an array of samples

    Args:
        frequency (int): pitch of the tone in cycles per second (Hz)
        a (int): peak amplitude the samples are scaled by
        dur (int): length of the tone in seconds
        sr (int): number of samples generated per second

    Returns:
        np.ndarray: the sampled sine wave, starting at phase zero
    """
    n_samples = dur * sr
    # Time stamp (in seconds) of every sample.
    t = np.arange(n_samples) / sr
    phase = t * 2 * np.pi * frequency
    return a * np.sin(phase)

In [None]:
# Sample rate (Hz) handed to the audio player below; it equals the default
# `sr` of create_sine, so generated signals play back at the intended pitch.
sr = 22050

In [None]:
# Five seconds of a 1 Hz sine. The sample rate is passed explicitly so the
# signal is generated at the same rate it is later played back at.
sig = create_sine(1, dur=5, sr=sr)
plt.plot(sig);

In [None]:
# Audio player for `sig`, interpreted at `sr` samples per second.
Audio(data=sig, rate=sr)

Hmmm, why can't we hear anything? A 1 Hz tone lies far below the range of human hearing (roughly 20 Hz to 20 kHz). Let's try a different frequency...

In [None]:
# Five seconds of a 440 Hz sine, generated at the playback sample rate.
sig = create_sine(440, dur=5, sr=sr)
plt.plot(sig);

In [None]:
# Listen to the 440 Hz tone.
Audio(data=sig, rate=sr)

In [None]:
def plot_dft(sig, win_len=1024, sr=22050):
    """
    plots the magnitude spectrum of the first `win_len` samples of a signal

    The excerpt is tapered with a Hamming window before the FFT, and only the
    first `win_len // 2` bins (the non-negative frequencies) are drawn.

    Args:
        sig (array-like): 1-D signal holding at least `win_len` samples
        win_len (int): number of samples in the analysis window
        sr (int): samplerate of `sig`, used to label the frequency axis in Hz

    Raises:
        ValueError: if `sig` has fewer than `win_len` samples
    """
    sig = np.asarray(sig)
    if len(sig) < win_len:
        raise ValueError(
            "signal has {} samples but win_len is {}".format(len(sig), win_len)
        )

    n_bins = win_len // 2
    window = np.hamming(win_len)
    D = np.abs(np.fft.fft(window * sig[:win_len]))[:n_bins]
    # Bin k sits at k * sr / win_len Hz. Building the axis from the integer
    # n_bins keeps xs and D the same length even when win_len is odd.
    xs = np.arange(n_bins) * (sr / win_len)
    plt.plot(xs, D)
    plt.xlabel('Frequency (Hz)')
    plt.ylabel('Magnitude')

In [None]:
# Spectrum of the first 1024 samples (the default win_len) of the 440 Hz tone.
plot_dft(sig)

Cool, let's build a more complex signal!

In [None]:
# Two sines two octaves apart (1760 = 4 * 440), the upper one at 0.6 amplitude.
# Both are generated at the playback sample rate.
sig = create_sine(440, sr=sr) + create_sine(1760, a=0.6, sr=sr)
Audio(data=sig, rate=sr)

In [None]:
# Spectrum of the two-sine mixture; expect one peak per component.
plot_dft(sig)

In [None]:
# Build a harmonic tone: the fundamental f0 plus selected integer multiples.
f0 = 440
sig = 0  # scalar seed; becomes an array after the first addition
for i in [1, 2, 4, 6, 8, 10]:
    sig += create_sine(f0*i)
# Rescale so the largest absolute sample value is 1.
sig /= np.max(np.abs(sig))
plot_dft(sig)

In [None]:
# Listen to the harmonic tone.
Audio(data=sig, rate=sr)

OK, let's try some music!

## Let's look at some music

In [None]:
# Load librosa's bundled example recording. This rebinds `sr` to the sample
# rate returned by the loader.
# NOTE(review): `librosa.util.example_audio_file` appears to have been removed
# from recent librosa releases in favour of `librosa.example(...)` - confirm
# against the installed version.
filename = librosa.util.example_audio_file()
y, sr = librosa.core.load(filename)

In [None]:
# Listen to the loaded recording.
Audio(data=y, rate=sr)

In [None]:
# Raw waveform of the first second of audio (the first `sr` samples).
plt.plot(y[:sr]);

In [None]:
# Waveform of the whole recording; the sample rate is passed by keyword so
# the call does not depend on the positional order of waveplot's parameters.
librosa.display.waveplot(y, sr=sr);

In [None]:
# Spectrum of a single analysis window starting 5 seconds into the recording.
plot_dft(y[sr*5:])

In [None]:
def plot_stft(sig):
    """
    plots the short-time Fourier transform of a signal

    The STFT magnitudes are converted to decibels relative to their maximum
    and drawn with a linear frequency axis.

    Args:
        sig (np.ndarray): audio signal to analyse
    """
    magnitude = np.abs(librosa.stft(sig))
    magnitude_db = librosa.amplitude_to_db(magnitude, ref=np.max)
    librosa.display.specshow(magnitude_db, y_axis='linear')

In [None]:
# Spectrogram of the first 10 seconds of the recording.
plot_stft(y[:sr*10])

In [None]:
def plot_mel_specgtrogram(y, sample_rate=None, fmax=8000):
    """
    plots a dB-scaled mel spectrogram of a signal

    Args:
        y (np.ndarray): audio signal to analyse
        sample_rate (int): samplerate of `y`; when omitted, the notebook-level
            `sr` is used
        fmax (int): highest frequency (Hz) covered by the mel filterbank
    """
    if sample_rate is None:
        # Fall back to the notebook-level sample rate, looked up at call time.
        sample_rate = sr
    S = librosa.feature.melspectrogram(y=y, sr=sample_rate, n_mels=128, fmax=fmax)
    S_dB = librosa.power_to_db(S, ref=np.max)
    # specshow has to be given the same fmax that built the filterbank;
    # otherwise the mel axis is labelled as if the bands spanned up to sr/2.
    librosa.display.specshow(S_dB, x_axis='time', y_axis='mel',
                             sr=sample_rate, fmax=fmax)



In [None]:
# Mel spectrogram of the first 10 seconds of the recording.
plot_mel_specgtrogram(y[:sr*10])

phew! that's enough plotting for now, let's take a break

## Part 2: Features

Chroma features store pitch class information for each analysis frame. As you can see below, the predominant pitch in the test track is `E`, which along with prominent energy in `A` and `B` indicates this song is likely in the key of `E`.

In [None]:
chroma = librosa.feature.chroma_stft(y=y, sr=sr)
# Pass the sample rate so the time axis matches the audio, and end with `;`
# so only the figure is shown rather than the returned object's repr.
librosa.display.specshow(chroma, y_axis='chroma', x_axis='time', sr=sr);

In [None]:
# Dimensions of the chroma matrix.
chroma.shape

MFCCs (Mel Frequency Cepstral Coefficients) are a standard MIR feature that describes the spectral content (timbre) of audio. They are particularly useful for instrument classification.

In [None]:
# Compute MFCCs with librosa's default settings and display them over time.
mfccs = librosa.feature.mfcc(y=y, sr=sr)
librosa.display.specshow(mfccs, x_axis='time');

## Classification

Now let's use the two features described above (MFCCs and Chroma) to build a simple genre classifier.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Root directory of the GTZAN (Tzanetakis) genre dataset; edit this to point
# at your local copy.
# Download: http://marsyas.info/downloads/datasets.html
GTZAN_ROOT = 'genres/'

In [None]:
def extract_mfccs(y, sr):
    """
    summarises the MFCCs of a signal as one fixed-length vector

    Args:
        y (np.ndarray): audio signal
        sr (int): samplerate of signal

    Returns:
        np.ndarray: the mean of every coefficient along axis 1 (presumably the
            frame axis), followed by the matching standard deviations
    """
    coeffs = librosa.feature.mfcc(y=y, sr=sr)
    coeff_means = coeffs.mean(axis=1)
    coeff_stds = coeffs.std(axis=1)
    return np.concatenate([coeff_means, coeff_stds])

def extract_chroma(y, sr):
    """
    summarises the chroma features of a signal as one fixed-length vector

    Args:
        y (np.ndarray): audio signal
        sr (int): samplerate of signal

    Returns:
        np.ndarray: the mean of every chroma row along axis 1 (presumably the
            frame axis), followed by the matching standard deviations
    """
    chromagram = librosa.feature.chroma_stft(y=y, sr=sr)
    row_means = chromagram.mean(axis=1)
    row_stds = chromagram.std(axis=1)
    return np.concatenate([row_means, row_stds])

def extract_feats(y, sr):
    """
    builds the full feature vector (chroma + MFCC statistics) for a signal

    Args:
        y (np.ndarray): audio signal
        sr (int): samplerate of signal

    Returns:
        tuple: `(feats, labels)` where `feats` is the chroma statistics
            followed by the MFCC statistics, and `labels` is a list with one
            column name per entry of `feats`, in the same order
    """
    chroma_stats = extract_chroma(y, sr)
    mfcc_stats = extract_mfccs(y, sr)

    labels = []
    for prefix, stats in (('chroma', chroma_stats), ('mfcc', mfcc_stats)):
        # Each stats vector holds all the means first, then all the stds,
        # so half its length is the number of underlying coefficients.
        n_coeffs = len(stats) // 2
        for stat_name in ('mean', 'std'):
            for idx in range(n_coeffs):
                labels.append('{}_{}_{}'.format(prefix, idx, stat_name))

    feats = np.concatenate([chroma_stats, mfcc_stats])
    return feats, labels

def feats_for_fname(fname):
    """
    loads an audio file and extracts its feature vector

    Args:
        fname (str): path of the audio file to load

    Returns:
        tuple: the `(feats, labels)` pair produced by `extract_feats`
    """
    samples, sample_rate = librosa.core.load(fname)
    return extract_feats(samples, sample_rate)

def fname_generator(genre, gtzan_root=None):
    """
    yields the paths of the .wav files that belong to one genre

    A file belongs to `genre` when its name starts with `<genre>.` and ends
    with `.wav` (e.g. `pop.00000.wav`). Results come back in sorted order so
    that repeated runs see the files in the same sequence.

    Args:
        genre (str): genre name the file names must start with
        gtzan_root (str): directory that is searched recursively; when
            omitted, the notebook-level GTZAN_ROOT is used (looked up at
            call time)

    Yields:
        str: path of each matching file
    """
    if gtzan_root is None:
        gtzan_root = GTZAN_ROOT
    # Anchor the match at the start of the name: a plain substring test lets
    # 'pop' match 'hiphop.00000.wav' as well.
    prefix = genre + '.'
    for root, dirs, files in os.walk(gtzan_root):
        dirs.sort()  # os.walk order is arbitrary; sorting makes it repeatable
        for file in sorted(files):
            if file.endswith('.wav') and file.startswith(prefix):
                yield os.path.join(root, file)
                
def df_for_genre(genre):
    """
    builds a DataFrame of features for every audio file of one genre

    Args:
        genre (str): genre name passed to `fname_generator`; also written to
            the `label` column of every row

    Returns:
        pd.DataFrame: one row per audio file (indexed by file path), one
            column per feature, plus a `label` column holding `genre`

    Raises:
        FileNotFoundError: if no audio file is found for `genre`
    """
    fnames = list(fname_generator(genre))
    if not fnames:
        # Without this check the column labels would never be assigned and
        # the function would fail with an unhelpful UnboundLocalError.
        raise FileNotFoundError(
            "no .wav files found for genre {!r}; check that GTZAN_ROOT "
            "points at the dataset".format(genre)
        )

    feats = []
    for fname in fnames:
        # Every file yields the same label list, so keeping the last is fine.
        cur_feats, labels = feats_for_fname(fname)
        feats.append(cur_feats)
    df = pd.DataFrame(feats, columns=labels, index=fnames)
    df['label'] = genre
    return df

We start by writing a few helper functions that will allow us to scan a subdirectory of our dataset, load audio files matching a specific genre, extract features from these audio files, and combine the resulting data into a Pandas DataFrame.

Below, we load features for two musical genres, `pop` and `classical`.

In [None]:
# One feature table per genre, stacked row-wise into a single DataFrame.
data = pd.concat([df_for_genre(genre) for genre in ['pop', 'classical']])

In [None]:
# Display the combined feature table.
data

We will now split the data into train and test sets.

In [None]:
# NOTE: `y` held the audio samples loaded earlier in the notebook; from here
# on it holds the genre labels, so the earlier audio cells cannot be re-run
# without reloading the recording first.
y = data['label']
# Everything except the label column is a model input.
X = data.drop('label', axis=1)

In [None]:
# Fix the seed so the split (and therefore the reported accuracy) is the same
# on every run, and stratify so both genres keep the same proportion in the
# train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

And we build a classification pipeline using two components: a `StandardScaler` which will z-normalize our data, and a `RandomForestClassifier` using the default parameterization.

In [None]:
# Seed the forest so that training gives the same model on every run.
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

Finally we can train the model, predict the genres of our test set, and check our accuracy.

In [None]:
# Fit the scaler and the classifier on the training split only.
pipeline.fit(X_train, y_train)

In [None]:
# Predict a genre for every file in the held-out test split.
y_pred = pipeline.predict(X_test)

In [None]:
# Fraction of test files whose predicted genre matches the true label.
accuracy_score(y_test, y_pred)

Not too bad!