<a href="https://colab.research.google.com/github/tuomaseerola/audio/blob/master/audio_corpus_analysis_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Music and Science – Audio Corpus Analysis Tutorial 

[Tuomas Eerola](https://www.durham.ac.uk/staff/tuomas-eerola/), Durham University, Music Department, 2022.

# 1 Set up the libraries
First, load a few useful libraries (numpy, librosa, matplotlib) and then install the `mirdata` library, which will take some time.

In [None]:
#import os
import numpy as np
import librosa
import librosa.display
import IPython.display as ipd
from matplotlib import pyplot as plt 
%matplotlib inline
print(librosa.__version__)

In [None]:
pip install mirdata

# 2 Obtain a dataset

Let's look at a classic genre categorization dataset by Tzanetakis & Cook (2002). The full data contains 1000 audio excerpts (100 from each of 10 genres), but we are going to start with a smaller set of 100 excerpts to keep this light and fast to run.

In [None]:
import mirdata

# Print the names of all datasets mirdata can provide.
# The dataset object itself is created in the next cell, so no throwaway
# initialisation is done here.
print(mirdata.list_datasets())

In [None]:
# Initialise the reduced "mini" version of the GTZAN genre data (100 excerpts).
# Omit the `version` argument to get the full 1000-excerpt dataset instead.
gtzan = mirdata.initialize('gtzan_genre', version='mini')

# Fetch the files, then report how many tracks are available.
gtzan.download()
n_tracks = len(gtzan.track_ids)
n_tracks


Let's look at an example (number 88).

In [None]:
# Load every track object (indexable by track id) and pick one example.
tracks = gtzan.load_tracks()
ex = tracks[gtzan.track_ids[88]]
print(["Genre:", ex.genre, "Name:", ex.track_id, "Tempo:",ex.tempo,])

# Read the (signal, sample rate) pair once and reuse it below instead of
# accessing `ex.audio` repeatedly (presumably each access re-reads the file).
ex_signal, ex_sr = ex.audio

plt.figure(figsize=(12, 3))
# `sr` is passed by keyword: recent librosa releases only accept the
# signal as a positional argument of waveshow.
librosa.display.waveshow(ex_signal, sr=ex_sr)
ipd.display(ipd.Audio(data=ex_signal, rate=ex_sr))

# 3 Extract features

Let's extract some features and use them to predict genre.

In [None]:
import pandas as pd
import os
import csv
import sys
import natsort
import warnings
warnings.filterwarnings('ignore')

# Column layout of the feature table: genre label, tempo, six scalar
# spectral/energy summaries, MFCC coefficients 1-19 (coefficient 0 is
# skipped) and the 12 chroma bins.
mfcc_names = [f'mfcc{i}' for i in range(1, 20)]
chroma_names = [f'chroma{i}' for i in range(1, 13)]
feature_columns = (['genre', 'bpm', 'rmse', 'spec_cent', 'spec_bw', 'rolloff', 'zcr', 'spec_ctr']
                   + mfcc_names + chroma_names)

# Collect one row per track and build the DataFrame once at the end;
# this avoids growing the frame row by row and lets pandas infer
# numeric dtypes for the feature columns.
rows = []
for track in tracks:
    print(track)
    ex = tracks[track]
    y, sr = librosa.load(ex.audio_path)

    # Frame-wise features; each is averaged over time below.
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    spec_ctr = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=512)
    # Chroma is computed once (the earlier duplicate call was never used).
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=512)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    row = [ex.genre, ex.tempo,
           np.mean(rmse), np.mean(spec_cent), np.mean(spec_bw),
           np.mean(rolloff), np.mean(zcr), np.mean(spec_ctr)]
    row += [np.mean(mfcc[i]) for i in range(1, 20)]    # mfcc1 .. mfcc19
    row += [np.mean(chroma[i]) for i in range(12)]     # chroma1 .. chroma12
    rows.append(row)

df = pd.DataFrame(rows, columns=feature_columns)


### 3.1 Explore features
Let's look at the features.

In [None]:
# Draw one box plot per feature column (the genre label is excluded),
# each in its own subplot with independent axes.
feature_table = df.drop(columns='genre')
feature_table.plot(
    kind='box',
    subplots=True,
    layout=(6, 7),
    figsize=(15, 15),
    sharex=False,
    sharey=False,
)
plt.show()

# 4 Build classifiers with the features

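We use a k-nearest neighbours classifier as a first model, trained on a 70:30 train–test split of the normalised features.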

In [None]:
# Show the first three rows of the feature table.
df.head(n=3)


In [None]:
from sklearn import preprocessing
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Replace missing tempo values with 120 BEFORE building the feature matrix,
# so that X never contains NaN. A plain column assignment is used instead of
# chained indexing, which is unreliable in pandas.
df['bpm'] = df['bpm'].fillna(120)
X = df.drop('genre', axis = 1)

# NOTE(review): normalize() rescales each row (sample) to unit norm, not each
# feature column; consider a per-feature scaler if that was the intention.
X = preprocessing.normalize(X)
y = df['genre']

test_size = 0.30 # taking 70:30 training and test set
seed = 7  # Random number seed for repeatability of the code
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

# k-nearest neighbours classifier with scikit-learn's default settings.
NN = KNeighborsClassifier()
NN.fit(X_train,y_train)


# 5 Classify (start here!)

In [None]:
# Show the first three rows of the feature table.
df.head(n=3)

In [None]:
import pandas as pd
import sklearn as sk
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Build the feature matrix from df. Missing tempo values are filled here as
# well, so this cell does not depend on an earlier cell having modified df.
X = df.drop('genre', axis = 1).fillna({'bpm': 120})
Xn = preprocessing.normalize(X)
y = df['genre']

test_size = 0.30 # taking 70:30 training and test set
seed = 9  # Random number seed for repeatability of the code
X_train, X_test, y_train, y_test = train_test_split(Xn, y, test_size=test_size, random_state=seed)

# Fit the forest, report accuracy on the held-out set, then print the
# confusion matrix with rows/columns in the order of RF.classes_.
RF = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0).fit(X_train, y_train)
print(round(RF.score(X_test, y_test), 4))
y_pred_test = RF.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred_test, labels=RF.classes_)
print(conf_mat)

The random forest's correct classification rate is 0.65 (with the full data). Let's look at this model in more detail: which features are doing the heavy lifting here?



In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

N_TOP = 10  # number of features shown in the plot (must match the title)

# Pair each importance score with its feature name. The names are taken by
# excluding the 'genre' label rather than by column position.
importance = RF.feature_importances_
n = df.columns.drop('genre')
im = pd.DataFrame({'data': importance,'names': n})

# Sort from most to least important; the tail therefore lists the five
# weakest features.
im2 = im.sort_values(by='data',ascending=False)
print(im2.tail(5))

# Plot the N_TOP most important features (the original slice 0:9 showed
# only nine although the title promised ten).
top = im2.head(N_TOP)
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(top['names'], top['data'], color='red')
ax.plot(top['names'], top['data'])
ax.set_title(f'{N_TOP} strongest features')
plt.show()

Let's take the five best features and try building a simpler model.

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Reduced feature set. Missing tempo values are filled here too, and the
# labels are re-read from df, so the cell does not rely on earlier cells
# having modified df or defined y.
X2 = df.filter(['rmse','spec_bw', 'mfcc3', 'bpm', 'spec_ctr']).fillna({'bpm': 120})
y = df['genre']

test_size = 0.30 # taking 70:30 training and test set
seed = 9  # Random number seed for repeatability of the code
# stratify=y keeps the genre proportions equal in the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=test_size, random_state=seed,stratify=y)

RF = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0).fit(X_train, y_train)

# Make predictions for the test set (once) and report accuracy.
y_pred_test = RF.predict(X_test)
print(round(RF.score(X_test, y_test), 4))

conf_mat = confusion_matrix(y_test, y_pred_test,labels=RF.classes_)
print(conf_mat)


In [None]:
# List the distinct genre labels that occur in the test set.
labels_in_test = np.unique(y_test)
print(labels_in_test)

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12,12))
# from_estimator() already draws the matrix, so the prepared axes are passed
# to it directly. Calling it without `ax` and then calling .plot(ax=ax)
# produced two figures.
cmp = ConfusionMatrixDisplay.from_estimator(RF, X_test, y_test, normalize='true', ax=ax)
plt.show()


Visualise confusion matrix

In [None]:
import seaborn as sns

# Row-normalised confusion matrix: each row (true label) is divided by its
# total. The label order is fixed to RF.classes_ so that the matrix always
# matches the tick labels below, and sklearn's own normalisation is used so
# that an empty row cannot cause a division by zero.
class_names = RF.classes_
matrix = confusion_matrix(y_test, y_pred_test, labels=class_names, normalize='true')

# Build the plot
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Blues, linewidths=0.2)

# Add labels to the plot; +0.5 centres each tick on its heatmap cell.
tick_positions = np.arange(len(class_names)) + 0.5
plt.xticks(tick_positions, class_names, rotation=90)
plt.yticks(tick_positions, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()


# References
- Tzanetakis, G. & Cook, P. (2002). Musical genre classification of audio signals. _IEEE Transactions on Speech and Audio Processing, 10(5)_, 293-302 [doi:10.1109/TSA.2002.800560](https://ieeexplore.ieee.org/document/1021072).