# Clustering Segments
---
Use the feature table of segments to derive an alphabet for *Pempheris adspersa* vocalisations.
An unsupervised clustering algorithm is fit to a chosen set of features, and a cluster number is predicted for each segment.

## Imports:

In [313]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import librosa
import datetime

from sklearn import cluster
from sklearn.metrics import silhouette_samples

## Control Parameters:

In [303]:
sr = 16000 # sample rate

# input / output locations
input_data_path = 'data/segmented/segment_features.csv'
segmented_data_dir = 'data/segmented/tank/'
results_data_dir = 'results/'

n_clusters = 5 # number of clusters to fit

# cluster sample image params
sample_size = 30 # number of segments drawn from each cluster
n_cols = 3       # subplot columns per cluster image
# Rows are derived (ceiling division) so the subplot grid always has room for
# `sample_size` plots; with the values above this is 10.
n_rows = (sample_size + n_cols - 1) // n_cols

## Helper Functions:

In [191]:
def plot_signal(signal, sr=sr):
    '''
    Description:
        Plots a signal waveform against time (in seconds) on the current
        matplotlib axes.
    Parameters:
        signal (numpy.ndarray): array of signal amplitudes
        sr (int): sample rate of waveform. The default is the value of the
            notebook-level `sr` at the moment this function is defined, so
            re-run this cell (or pass `sr` explicitly) after changing it.
    Return:
        None
    '''
    no_samples = len(signal)
    # Sample k is taken at k/sr seconds. np.linspace(0, no_samples/sr, no_samples)
    # includes the endpoint and therefore stretches the axis by one sample period.
    time_s = np.arange(no_samples) / sr
    plt.plot(time_s, signal)

## Load Feature Dataset:

In [248]:
# Read the segment feature table from the configured CSV path.
df_main = pd.read_csv(input_data_path)

# Same table without the "file_id" and "file_len_s" columns.
df_features = df_main.drop(["file_id", "file_len_s"], axis="columns")

## Choosing features to fit:

In [285]:
# Boolean mask over the column labels: True where the name contains "mfcc" or "lpcc".
is_coefficient = df_main.columns.str.contains("mfcc|lpcc")
coefficients = df_main.loc[:, is_coefficient]

# Subtract each column's mean (removes the DC component of the mfcc and lpcc coefficients).
# NOTE(review): columns are centred but not rescaled - confirm that is intended
# given the differing magnitudes of the mfcc and lpcc columns.
df_input = coefficients - coefficients.mean()

df_input.head(2)

Unnamed: 0,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,lpcc_0,lpcc_1,lpcc_2,lpcc_3,lpcc_4,lpcc_5,lpcc_6,lpcc_7,lpcc_8,lpcc_9
0,-16.283259,17.546125,10.315575,-0.175181,-6.578718,-6.318516,-3.300116,-0.943687,1.604125,1.668166,-0.547104,-0.213493,0.08077,0.091835,0.058671,0.034455,-0.008611,-0.024951,-0.067845,0.048056
1,30.036041,-3.169516,11.620743,8.690845,4.258244,-0.466739,-4.841083,-6.389014,-7.642615,-4.13423,1.181273,-0.00764,-0.14504,0.017353,0.027965,0.072432,0.044001,0.050283,0.011413,-0.05464


## Fitting Dataset and Predicting Clusters:

In [405]:
# Clustering model; despite the variable name this is agglomerative clustering.
# Alternative model: cluster.KMeans(n_clusters=n_clusters, init='k-means++')
kmc = cluster.AgglomerativeClustering(n_clusters=n_clusters)

# Fit on the selected features and read back the per-row cluster labels.
kmc.fit(df_input)
clusters = kmc.labels_

## Cluster Analysis:

In [406]:
# Store the predicted label of every row next to its features,
# then display how many rows were assigned to each cluster.
df_main["cluster"] = clusters
df_main.cluster.value_counts()

0    19952
2     8786
1     5628
3     5107
4     4731
Name: cluster, dtype: int64

In [423]:
# Silhouette coefficient of every row, kept alongside its cluster label.
sample_silhouette_values = silhouette_samples(df_input, clusters)
df_main["silhouette_score"] = sample_silhouette_values

# Mean silhouette score of each cluster, followed by the mean over all rows.
for label in range(n_clusters):
    in_cluster = df_main["cluster"] == label
    cluster_mean = df_main.loc[in_cluster, "silhouette_score"].mean()
    print(f"Cluster {label} Silhouette Score:", cluster_mean)

overall_mean = df_main["silhouette_score"].mean()
print("Average Silhouette Score:", overall_mean)

Cluster 0 Silhouette Score: 0.29928682445745386
Cluster 1 Silhouette Score: 0.2903306096232635
Cluster 2 Silhouette Score: 0.19111824670870642
Cluster 3 Silhouette Score: 0.5371726797090879
Cluster 4 Silhouette Score: 0.4816056676372954
Average Silhouette Score: 0.32364341681711833


## Saving primary clustered dataset and sample images from each cluster:

In [None]:
os.makedirs(results_data_dir, exist_ok=True)

for cluster_n in range(n_clusters):
    cluster_files = df_main.loc[df_main["cluster"] == cluster_n, "file_id"]
    if cluster_files.empty:
        # Nothing to draw; sampling from an empty selection would raise.
        print(f"Cluster {cluster_n} has no members, skipping image.")
        continue

    # getting a batch of samples from each cluster to inspect cluster quality.
    # Never request more rows than the cluster holds (sample() raises otherwise),
    # and fix the seed so the exported images are reproducible between runs.
    n_samples = min(sample_size, len(cluster_files))
    sampled_files = cluster_files.sample(n_samples, random_state=0)

    cluster_samples = [
        librosa.load(os.path.join(segmented_data_dir, f), sr=sr)[0]
        for f in sampled_files
    ]

    # Grow the grid if the configured n_rows cannot hold every sample
    # (ceiling division); otherwise keep the configured layout.
    grid_rows = max(n_rows, -(-len(cluster_samples) // n_cols))

    fig = plt.figure(figsize=(5*n_cols, 2*grid_rows))
    for i, sample in enumerate(cluster_samples):
        plt.subplot(grid_rows, n_cols, i+1)
        # Pass sr explicitly: plot_signal's default was frozen when it was defined
        # and may differ from the rate the audio was just loaded at.
        plot_signal(sample, sr=sr)

    fig.savefig(os.path.join(results_data_dir, f"cluster_{cluster_n}.png"),
                facecolor='white', bbox_inches='tight')
    plt.show()
    # Release the figure so open figures do not accumulate across iterations.
    plt.close(fig)

In [424]:
# Create the output directory here as well, so this cell does not depend on the
# image-export cell having been run first.
os.makedirs(results_data_dir, exist_ok=True)

# saving final dataset with predicted clusters.
df_main.to_csv(os.path.join(results_data_dir, 'model_out.csv'), index=False)