In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from youtube_audioset import get_data, get_all_sound_names

In [None]:
# Two groups of sound-label names. Later cells use them to select columns of
# `labels_binarized` and to colour label centroids by group membership.
ambient_sounds, impact_sounds = get_all_sound_names()

In [None]:
# `df` holds one row per example, with a `features` column that is reduced per
# example further down. `labels_binarized` has one column per sound label.
# NOTE(review): presumably both share the same row order -- confirm in get_data.
df, labels_binarized = get_data()

In [None]:
# Dataset size. Single-argument print(...) with %-formatting emits the same
# text on Python 2 and Python 3; the bare `print a, b` statement is a
# SyntaxError on Python 3.
print("%d examples" % df.shape[0])

# Share of examples that carry at least one label from each group.
# NOTE: despite the "Percentage" wording these are fractions in [0, 1]
# (the mean of a boolean mask), not values scaled to 100.
print("Percentage Impact Sounds: %s" % (labels_binarized[impact_sounds].sum(axis=1) > 0).mean())
print("Percentage Ambient Sounds: %s" % (labels_binarized[ambient_sounds].sum(axis=1) > 0).mean())

# Per-label column means, left as the last expression so the notebook
# renders the result as the cell output.
labels_binarized.mean()

A majority of the sounds are vehicle sounds.  The distribution of sounds is not well balanced, which can be mitigated by passing sample weights when training a classifier.

The abundance of examples should make it easier to generalize the classifier without excessive tuning.

The ambient and impact values printed above are fractions of all clips (despite the "Percentage" label), and they do not sum to 1.  This is due to clips which contain multiple sounds (e.g. a clip with a vehicle driving in the rain).

In [None]:
# Reduce every example's `features` array to a single vector by taking the
# maximum along axis 0, then stack those vectors into one row per example.
X = np.array(df.features.apply(lambda clip: clip.max(axis=0)).tolist())

# One centroid per label column: the mean feature vector over the examples
# flagged with that label.
centroids = []
for column in labels_binarized.columns:
    if labels_binarized[column].sum() != 0:
        centroids.append(X[labels_binarized[column] == 1, :].mean(axis=0))
    else:
        # No example carries this label, so use a constant placeholder vector.
        # NOTE(review): 128 is presumably the midpoint of an 8-bit feature
        # range -- TODO confirm against the feature extraction.
        centroids.append(np.repeat(128, X.shape[1]))
centroids = np.array(centroids)

In [None]:
# Fit PCA on the per-example feature matrix, then project the per-label
# centroids into the same principal-component space.
pca_ = PCA()
x_pca_ = pca_.fit_transform(X)
centroids_pca_ = pca_.transform(centroids)

plt.figure(figsize=(20,20))

# Colour each centroid by whether its label is an impact sound. The flags are
# built as a list because map(...) returns a lazy iterator on Python 3, which
# matplotlib cannot use as a colour sequence.
plt.scatter(centroids_pca_[:,0], centroids_pca_[:,1],
            c=[col in impact_sounds for col in labels_binarized.columns],
            s=100, edgecolors='none')
for i, txt in enumerate(labels_binarized.columns):
    plt.annotate(txt, (centroids_pca_[i,0],centroids_pca_[i,1]))

# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.title("Label centroids in PCA space (colour: impact vs. non-impact)")

plt.show()

As expected, similar sounds are grouped together.  The motor sounds lie in a separate space.  The climate sounds also lie in their own space.  Explosion sounds are clustered together, but they are somewhat similar to wood sounds.

Even in 2-dimensional space, it looks like impact sounds can be separated from ambient sounds with a linear classifier.  A non-linear classifier should help most with "Howl" and "Thunderstorm".

In [None]:
# Embed the per-label centroids in 2-D with t-SNE. The optimisation is
# stochastic, so the seed is fixed to make the figure reproducible across runs.
# NOTE(review): only one point per label is embedded; the default perplexity
# may be too large for so few points -- consider lowering it.
tsne_ = TSNE(random_state=0)
centroids_tsne_ = tsne_.fit_transform(centroids)

plt.figure(figsize=(20,20))

# Colour each centroid by whether its label is an impact sound. The flags are
# built as a list because map(...) returns a lazy iterator on Python 3, which
# matplotlib cannot use as a colour sequence.
plt.scatter(centroids_tsne_[:,0], centroids_tsne_[:,1],
            c=[col in impact_sounds for col in labels_binarized.columns],
            s=100, edgecolors='none')
for i, txt in enumerate(labels_binarized.columns):
    plt.annotate(txt, (centroids_tsne_[i,0],centroids_tsne_[i,1]))

# t-SNE axes have no intrinsic meaning, so only a title is added.
plt.title("Label centroids in t-SNE space (colour: impact vs. non-impact)")

plt.show()

Since the points are equally spread out, this t-SNE plot doesn't say much.