In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.externals import joblib

from youtube_audioset import get_data, get_recursive_sound_names, get_all_sound_names
from youtube_audioset import explosion_sounds, motor_sounds, wood_sounds, human_sounds, nature_sounds

In [None]:
# Names of the ambient and impact sounds; further down they are used to select
# columns of `labels_binarized`.
ambient_sounds, impact_sounds = get_all_sound_names()

# Swap every imported sound group for its get_recursive_sound_names() result,
# calling the helper once per group in the original order.
# NOTE(review): the imported names are rebound here, so re-running this cell
# passes the previous results back into the helper instead of the imported
# originals -- confirm that is harmless.
(explosion_sounds,
 motor_sounds,
 wood_sounds,
 human_sounds,
 nature_sounds) = map(
    get_recursive_sound_names,
    (explosion_sounds, motor_sounds, wood_sounds, human_sounds, nature_sounds)
)

# Examples plus their per-label indicator table.
df, labels_binarized = get_data()

In [None]:
# Dataset size. A single pre-formatted argument keeps print() valid on both
# Python 2 and Python 3 with the same text as before.
print("%d examples" % df.shape[0])

# An example counts towards a group when its row-wise sum over that group's
# label columns is positive.
has_impact_label = labels_binarized[impact_sounds].sum(axis=1) > 0
has_ambient_label = labels_binarized[ambient_sounds].sum(axis=1) > 0

# The mean of a boolean mask is a fraction in [0, 1]; scale it by 100 so the
# number agrees with the "Percentage" wording.
print("Percentage Impact Sounds: %s" % (100.0 * has_impact_label.mean()))
print("Percentage Ambient Sounds: %s" % (100.0 * has_ambient_label.mean()))

# Cell output: mean of every label column.
labels_binarized.mean()

In [None]:
# Keep only the examples that carry exactly one label AND whose feature array
# has exactly 10 rows.
has_single_label = df['labels'].apply(lambda example_labels: len(example_labels) == 1)
has_ten_rows = df.features.apply(lambda feature_rows: feature_rows.shape[0] == 10)

df_filtered = df.loc[has_single_label & has_ten_rows]
labels_filtered = labels_binarized.loc[df_filtered.index, :]

# One flattened feature vector per kept example, stacked into a 2-D array.
flattened_features = df_filtered.features.apply(lambda feature_rows: feature_rows.flatten())
X_total = np.array(flattened_features.tolist())

# Value of the "Silence" label column for each kept example (compared against 1 below).
all_silence_ind = labels_filtered.Silence.values

In [None]:
# Rows of X_total flagged as Silence, split back into 128-wide vectors so each
# row below is one 128-dimensional embedding. Computed once and shared by PCA
# and k-means (both receive exactly the same input as before).
silence_frames_ = X_total[all_silence_ind == 1, :].reshape((-1, 128))

# 2-D view of the silence embeddings, for plotting only.
pca_ = PCA(random_state=42)
x_pca_ = pca_.fit_transform(silence_frames_)

# Split the silence embeddings into three groups; seeded for reproducibility.
kmeans_ = KMeans(n_clusters=3, random_state=43)
kgroup_ = kmeans_.fit_predict(silence_frames_)

# plt.figure() always opens a fresh figure, so no plt.clf() is needed first;
# calling it with no figure open only produced an extra empty figure.
plt.figure(figsize=(20, 20))

plt.scatter(x_pca_[:, 0], x_pca_[:, 1],
            c=kgroup_,
            s=100, edgecolors='none')
plt.xlabel("PCA component 1")
plt.ylabel("PCA component 2")
plt.title("Silence embeddings colored by k-means group")

plt.show()

Let's take the centroid of the first group (k-means label 0) and use that to identify an embedding for silence.  We don't look at the other groups because they may have sounds that we don't have labels for.

In [None]:
# Silence rows of X_total, viewed as individual 128-wide embeddings.
silence_frames = X_total[all_silence_ind == 1].reshape((-1, 128))

# Members of k-means group 0; their element-wise mean is the reference
# "silence" embedding.
# NOTE(review): k-means label numbers carry no intrinsic meaning -- confirm
# against the scatter plot that group 0 is the intended cluster.
first_group_frames = silence_frames[kgroup_ == 0]
silence_embedding = first_group_frames.mean(axis=0)

# Cell output: the embedding rounded to whole numbers for easier reading.
silence_embedding.round()

We'll use the euclidean distance from this embedding to determine silence.

In [None]:
# Same selection that produced `silence_embedding`: silence rows as 128-wide
# embeddings, restricted to k-means group 0.
silence_frames = X_total[all_silence_ind == 1].reshape((-1, 128))
first_group_frames = silence_frames[kgroup_ == 0]

# Euclidean distance of every selected embedding to the reference embedding:
# square the per-dimension offsets, sum across dimensions, take the root.
squared_offsets = np.square(first_group_frames - silence_embedding)
distance_from_silence = np.sqrt(squared_offsets.sum(axis=1))

distance_from_silence

In [None]:
# Histogram of all Euclidean distances to the silence embedding (default bins).
fig, ax = plt.subplots()
ax.hist(distance_from_silence)
plt.show()

We'll need to zoom in on the histogram to find a good threshold.

In [None]:
# Zoom in: keep only distances below 100 and use 20 bins.
close_distances = distance_from_silence[distance_from_silence < 100]

fig, ax = plt.subplots()
ax.hist(close_distances, bins=20)
plt.show()

A Euclidean distance of 20 seems like a good threshold.

In [None]:
# Euclidean-distance cutoff from `silence_embedding`, picked by eye from the
# zoomed-in histogram of `distance_from_silence`; written to disk in the final cell.
silence_embedding_euclidean_distance_threshold = 20

In [None]:
# Same selection that produced `silence_embedding`: silence rows as 128-wide
# embeddings, restricted to k-means group 0.
silence_frames = X_total[all_silence_ind == 1].reshape((-1, 128))
first_group_frames = silence_frames[kgroup_ == 0]

# cdist expects two 2-D inputs, so the reference embedding becomes a single-row
# matrix; the resulting one-column distance matrix is flattened to 1-D.
reference_row = silence_embedding.reshape(1, -1)
cosine_distance_from_silence = cdist(first_group_frames, reference_row, metric='cosine').ravel()

cosine_distance_from_silence

In [None]:
# Zoom in: keep only cosine distances below 0.02 and use 20 bins.
close_cosine_distances = cosine_distance_from_silence[cosine_distance_from_silence < 0.02]

fig, ax = plt.subplots()
ax.hist(close_cosine_distances, bins=20)
plt.show()

A cosine distance of 0.01 seems like a generous threshold.

In [None]:
# Cosine-distance cutoff from `silence_embedding`, picked by eye from the
# histogram of `cosine_distance_from_silence` values below 0.02; written to
# disk in the final cell.
silence_embedding_cosine_distance_threshold = 0.01

Let's save this embedding value.

In [None]:
# Make sure the output directory is there before anything is written into it.
output_dir_missing = not os.path.exists('parameter')
if output_dir_missing:
    os.makedirs('parameter')

# Destination file for each saved value.
embedding_path = "parameter/silence_embedding.pkl"
cosine_threshold_path = "parameter/silence_embedding_cosine_distance_threshold.pkl"
euclidean_threshold_path = "parameter/silence_embedding_euclidean_distance_threshold.pkl"

# Persist the reference embedding and both distance thresholds with joblib.
# The last call stays a bare expression so its return value is still shown
# as the cell output.
joblib.dump(silence_embedding, embedding_path)
joblib.dump(silence_embedding_cosine_distance_threshold, cosine_threshold_path)
joblib.dump(silence_embedding_euclidean_distance_threshold, euclidean_threshold_path)