In [None]:
from python_speech_features import mfcc, logfbank
from scipy.io import wavfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math


# Ascending edges used by get_delta_bin to quantize the change of a coefficient
# between two consecutive frames. 15 edges partition the real line into 16 bins.
delta_bins = [
    -7.449361934437546, -4.959642638189663, -3.5063742894420997,
    -2.4518990367612536, -1.6107117954938024, -0.9037185755899212,
    -0.3158629551912462, -0.007594296904398945, 0.3127680464479745,
    0.9294957576276741, 1.644535296152842, 2.487457184027898,
    3.5417890204116764, 4.989806561955705, 7.457516420676731,
]


# Ascending lower edges used by get_interval_bin to quantize an elapsed step
# count. Values below the first edge fall into bin 0, giving 16 bins in total.
interval_bins = [1, 2, 3, 5, 7, 10, 13, 16, 20, 25, 31, 38, 48, 63, 89]


# Delta bin that a delta of exactly 0 falls into; get_interval measures the
# time elapsed since this bin was last visited.
delta_zero_bin = 8

# One more bin than there are edges in each table (both evaluate to 16).
delta_bin_count = len(delta_bins) + 1
interval_bin_count = len(interval_bins) + 1


def get_delta_bin(value):
    """Quantize a delta value against the ``delta_bins`` edge table.

    Returns the index of the first edge that is strictly greater than
    ``value``; if no edge is greater, returns ``len(delta_bins)``.
    """
    return next(
        (idx for idx, edge in enumerate(delta_bins) if value < edge),
        len(delta_bins),
    )


def get_interval_bin(interval_value):
    """Quantize an interval against the ``interval_bins`` edge table.

    Scans the edges from the largest down and returns ``i + 1`` for the first
    edge ``interval_bins[i]`` that is less than or equal to ``interval_value``.
    Returns 0 when the value is below every edge (this includes the -1
    "never seen" marker produced by get_interval).
    """
    rank = len(interval_bins)
    while rank > 0:
        if interval_bins[rank - 1] <= interval_value:
            return rank
        rank -= 1
    return 0


def get_interval(current_time, history_times):
    """Return the time elapsed since the zero-delta bin was last visited.

    ``history_times`` holds, per delta bin, the time of its last visit, with
    -1 meaning "not visited yet". Only the ``delta_zero_bin`` entry is read.
    Returns -1 if that bin has not been visited, otherwise
    ``current_time`` minus the recorded time.
    """
    last_seen = history_times[delta_zero_bin]
    return -1 if last_seen == -1 else current_time - last_seen
    
    
def mels_from_tuples(tuples, audio_dir='data/FSDKaggle2018.audio_test/'):
    """Load the audio clip of every metadata tuple and compute its MFCCs.

    Parameters
    ----------
    tuples : sequence
        Metadata rows; element 0 of each row is the clip's file name.
    audio_dir : str, optional
        Directory containing the audio files. Defaults to the FSDKaggle2018
        test-audio folder that was previously hard-coded.

    Returns
    -------
    list
        One MFCC matrix per input row, in input order (13 cepstral
        coefficients per frame, as requested via ``numcep=13``).
    """
    import os

    mels = []

    # Encode all features
    for entry in tuples:
        path = os.path.join(audio_dir, entry[0])
        rate, signal = wavfile.read(path)
        mels.append(mfcc(signal, rate, numcep=13, nfilt=26, nfft=1103))

    return mels


def build_transition_histogram(tuples):
    """Count quantized delta transitions over the clips described by ``tuples``.

    For every clip and every MFCC coefficient, consecutive frame-to-frame
    deltas are quantized with get_delta_bin. Each pair of consecutive delta
    bins (previous, next) is counted, conditioned on the binned number of
    steps since the zero-delta bin was last visited.

    Parameters
    ----------
    tuples : sequence
        Metadata rows accepted by mels_from_tuples.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(13, delta_bin_count, interval_bin_count,
        delta_bin_count)`` indexed as
        ``[feature, previous delta bin, interval bin, next delta bin]``.
        The counts of all clips in ``tuples`` are accumulated together.
    """
    # feature, last_state, interval, current_state
    hist = np.zeros((13, delta_bin_count, interval_bin_count, delta_bin_count))

    for mel in mels_from_tuples(tuples):  # for each sample
        n_frames = np.shape(mel)[0]
        n_features = np.shape(mel)[1]

        for k in range(n_features):  # for each feature
            # Per delta bin: step at which it was last the "previous" bin (-1 = never).
            history_times = np.full((delta_bin_count,), -1)
            t = 0
            current_delta_bin = -1        # -1 until the first delta has been seen
            current_v = float('nan')      # NaN until the first value has been seen

            for j in range(n_frames):  # for each value
                next_v = mel[j, k]

                if not math.isnan(current_v):
                    next_delta_bin = get_delta_bin(next_v - current_v)

                    if current_delta_bin > -1:
                        current_interval = get_interval(t, history_times)
                        current_interval_bin = get_interval_bin(current_interval)

                        hist[k, current_delta_bin, current_interval_bin, next_delta_bin] += 1

                        # Record the visit only after the interval was read, so a
                        # bin never measures the distance to itself at this step.
                        history_times[current_delta_bin] = t
                        t += 1

                    current_delta_bin = next_delta_bin

                current_v = next_v

    return hist


def remove_delta_condition(hist):
    """Sum ``hist`` over axis 1 (the previous-delta-bin axis of a transition histogram)."""
    collapsed = np.sum(hist, axis=1)
    return collapsed
                  
    
def remove_interval_condition(hist):
    """Sum ``hist`` over axis 2 (the interval-bin axis of a transition histogram)."""
    collapsed = np.sum(hist, axis=2)
    return collapsed

In [25]:
import hnswlib


def extract_flat_feats_rand(len):
    """Return a random descriptor of ``len`` uniform samples scaled to sum to 1.

    Draws from NumPy's global random state (``np.random.rand``).
    """
    weights = np.random.rand(len)
    return weights / weights.sum()


def extract_flat_feats(hist):
    """Turn a transition histogram into a flat, per-feature-normalised descriptor.

    The histogram is summed over axes 1 and 2 (previous delta bin and interval
    bin), leaving one distribution over the next delta bin per feature. Each
    feature's row is scaled to sum to 1 and the rows are concatenated.

    Parameters
    ----------
    hist : array-like
        4-D histogram indexed ``[feature, previous delta, interval, next delta]``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``n_features * n_next_delta_bins``.

    Notes
    -----
    Rows without any counts are returned as zeros rather than NaN (0/0), the
    number of features is taken from ``hist`` instead of being fixed at 13,
    and integer input is converted to float before normalising.
    """
    reduced_hist = np.sum(hist, axis=(1, 2)).astype(float)
    totals = reduced_hist.sum(axis=1, keepdims=True)
    normalized = np.divide(
        reduced_hist,
        totals,
        out=np.zeros_like(reduced_hist),
        where=totals != 0,  # skip empty rows so they stay all-zero
    )
    return normalized.flatten()


def cluster(tuples):
    """Build per-clip histograms and descriptors and index the descriptors.

    Parameters
    ----------
    tuples : sequence
        Metadata rows accepted by build_transition_histogram.

    Returns
    -------
    tuple
        ``(descriptors, hists, descriptor_index)`` where ``descriptors[i]`` and
        ``hists[i]`` belong to ``tuples[i]`` and ``descriptor_index`` is an
        hnswlib cosine-space index containing all descriptors, added in input
        order.
    """
    descriptor_index = hnswlib.Index(space = 'cosine', dim = 13*delta_bin_count)
    # Keep the historical capacity of 2000 but grow it when more clips are
    # passed in, since the index rejects items beyond max_elements.
    descriptor_index.init_index(max_elements = max(2000, len(tuples)), ef_construction = 200, M = 48)

    descriptors = []
    hists = []

    for entry in tuples:
        hist = build_transition_histogram([entry])
        hists.append(hist)
        descriptors.append(extract_flat_feats(hist))

    # Nothing to add for an empty input; the empty index is still returned.
    if descriptors:
        descriptor_index.add_items(descriptors)

    return (descriptors, hists, descriptor_index)



# Load the clip metadata table for the FSDKaggle2018 test set.
df = pd.read_csv('data/FSDKaggle2018.meta/test_post_competition_scoring_clips.csv', delimiter=',')
# One tuple per CSV row; elsewhere in this notebook element 0 is used as the
# audio file name and element 1 is compared/displayed as the clip's class.
tuples = [tuple(x) for x in df.values]

# Per-clip descriptors and transition histograms plus the nearest-neighbour index over the descriptors.
descriptors, hists, descriptor_index = cluster(tuples)


################

# n = 3

# knn = descriptor_index.knn_query(descriptors, k=1+n)
    
# same_class_count = 0.0
    
# for i in range(len(knn[0])):
#     for k in range(n):
#         j = knn[0][i,k+1]
#         if tuples[i][1] == tuples[j][1]:
#             same_class_count += 1.0
                    
# same_class_count/(n*len(tuples))

################


# For every clip, pool the transition histograms of its nearest neighbours in
# descriptor space and convert the pooled counts to log-probabilities.

# Ask for at most 250 neighbours, but never more than there are indexed clips.
neighbour_count = min(250, len(tuples))

clustered_hists = []

for i in range(len(tuples)):
    knn = descriptor_index.knn_query([descriptors[i]], k=neighbour_count)[0][0]
    clustered_hist = np.sum(np.array([hists[index] for index in knn]), axis=0)

    # set all zero elements to one
    clustered_hist = np.where(clustered_hist == 0, 1, clustered_hist)

    # normalize over the last axis (next delta bin) for every
    # [feature, previous delta bin, interval bin] and take the log probability
    clustered_hist = np.log(clustered_hist / np.sum(clustered_hist, axis=-1, keepdims=True))

    clustered_hists.append(clustered_hist)


For every histogram, find its n nearest neighbors, sum them all together element-wise, and replace all zero entries with a value of one.

For a test input sequence, calculate the sequence likelihood under every summed histogram, then check whether the highest-scoring one has the same category as the original sequence that the summed histogram was built around.

In [26]:
# Log-probabilities from the pooled histogram of clip 8, for feature 8 and
# previous delta bin 8 (rows: interval bin, columns: next delta bin).
clustered_hists[8][8,8]

array([[-3.72970145, -2.95651156, -2.22562405, -2.26336438, -2.95651156,
        -2.57702194, -2.68824757, -3.12356565, -3.44201938, -2.81341072,
        -2.68824757, -2.43041846, -2.7488722 , -2.68824757, -2.95651156,
        -3.21887582],
       [-4.90527478, -3.65251181, -3.40119738, -2.46292774, -2.55389952,
        -2.07206143, -2.65398298, -2.95936463, -3.0334726 , -2.30258509,
        -2.34032542, -2.7080502 , -2.37954613, -2.7080502 , -3.29583687,
        -4.2121276 ],
       [-4.26619482, -3.01343185, -2.76211742, -2.32028467, -2.35665231,
        -2.76211742, -2.56144673, -3.01343185, -3.01343185, -2.43361336,
        -2.65675691, -2.47443535, -2.56144673, -2.81927584, -2.94443898,
        -4.55387689],
       [-4.4288307 , -3.04253634, -2.72408261, -2.26934645, -2.30856716,
        -2.41392768, -2.53171072, -3.58153284, -3.33021841, -2.63707123,
        -2.37044257, -2.48292055, -2.55702852, -3.08509595, -2.9247533 ,
        -4.27468002],
       [-4.4091553 , -3.82136864, -2

0.20875, "reduced_hist = np.sum(hist, (1,2)) cosine similarity"

0.20583333333333334, "reduced_hist = np.sum(hist, (1,2))"

0.17958333333333334, "reduced_hist = np.sum(hist, (0,1))"

0.17354166666666668, "reduced_hist = np.sum(hist, (0,2))"

0.15166666666666667  "reduced_hist[i, math.floor(k/4.0), math.floor(l/4.0)]"

0.14208333333333334, "reduced_hist = np.sum(hist, (1,))"

0.10708333333333334, "reduced_hist = np.sum(hist, (1,3))"

0.033541666666666664 "extract_flat_feats_rand(208)"

In [40]:
def encode_mel(mel):
    """Encode an MFCC matrix as a list of transition-histogram indices.

    Runs the same quantization as build_transition_histogram, but instead of
    counting, it records for every transition the index
    ``(feature, previous delta bin, interval bin, next delta bin)``.

    Parameters
    ----------
    mel : numpy.ndarray
        2-D array indexed ``[frame, feature]``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_features * (n_frames - 2), 4)``, ordered
        feature by feature. The first two frames of each feature are dropped
        because no complete transition exists for them yet. Inputs with fewer
        than two frames yield an empty ``(0, 4)`` array.
    """
    n_frames = np.shape(mel)[0]
    n_features = np.shape(mel)[1]

    # -1 marks slots for which no transition was recorded.
    encoded = np.full((n_features, n_frames, 4), -1)

    for k in range(n_features):  # for each feature
        # Per delta bin: step at which it was last the "previous" bin (-1 = never).
        history_times = np.full((delta_bin_count,), -1)
        t = 0
        current_delta_bin = -1        # -1 until the first delta has been seen
        current_v = float('nan')      # NaN until the first value has been seen

        for j in range(n_frames):  # for each value
            next_v = mel[j, k]

            if not math.isnan(current_v):
                next_delta_bin = get_delta_bin(next_v - current_v)

                if current_delta_bin > -1:
                    current_interval = get_interval(t, history_times)
                    current_interval_bin = get_interval_bin(current_interval)

                    encoded[k, j, 0] = k
                    encoded[k, j, 1] = current_delta_bin
                    encoded[k, j, 2] = current_interval_bin
                    encoded[k, j, 3] = next_delta_bin

                    # Record the visit only after the interval was read.
                    history_times[current_delta_bin] = t
                    t += 1

                current_delta_bin = next_delta_bin

            current_v = next_v

    # reshape(-1, 4) gives the same result as the explicit
    # (n_frames - 2) * n_features row count, without producing a negative
    # dimension when there are fewer than two frames.
    return encoded[:, 2:, :].reshape(-1, 4)


def likelihood_hist(hist, clustered_hist):
    """Return the sum of the element-wise product of ``hist`` and ``clustered_hist``.

    With ``hist`` holding transition counts and ``clustered_hist`` holding
    log-probabilities, this is the log-likelihood of the counted transitions.
    """
    weighted = np.multiply(hist, clustered_hist)
    return weighted.sum()


def likelihood_mel(encoded_mel, clustered_hist):
    """Sum the ``clustered_hist`` entries addressed by each row of ``encoded_mel``.

    Every row of ``encoded_mel`` is used as a full index tuple into
    ``clustered_hist``; the looked-up values are added up in row order,
    starting from 0.0.
    """
    return sum((clustered_hist[tuple(row)] for row in encoded_mel), 0.0)


def likeliest_hist(hist, clustered_hists, n):
    """Rank ``clustered_hists`` by the likelihood they assign to ``hist``.

    Prints the number of candidates, then a progress counter every 200
    candidates. Returns ``(indices, scores)`` for the ``n`` highest-scoring
    candidates, best first.
    """
    total = len(clustered_hists)
    scores = np.zeros(total)

    print(total)

    for idx, candidate in enumerate(clustered_hists):
        if idx % 200 == 0:
            print(idx)
        scores[idx] = likelihood_hist(hist, candidate)

    # Negate so that argsort's ascending order puts the largest score first.
    top = np.argsort(-scores)[:n]
    return (top, scores[top])


def likeliest_mel(mel, clustered_hists, n):
    """Rank ``clustered_hists`` by the likelihood they assign to an MFCC matrix.

    The matrix is encoded once with encode_mel and then scored against every
    candidate. Prints the number of candidates, then a progress counter every
    200 candidates. Returns ``(indices, scores)`` for the ``n`` highest-scoring
    candidates, best first.
    """
    total = len(clustered_hists)
    scores = np.zeros(total)

    print(total)

    transitions = encode_mel(mel)

    for idx, candidate in enumerate(clustered_hists):
        if idx % 200 == 0:
            print(idx)
        scores[idx] = likelihood_mel(transitions, candidate)

    # Negate so that argsort's ascending order puts the largest score first.
    top = np.argsort(-scores)[:n]
    return (top, scores[top])

In [42]:
# Compute the MFCC matrix of clip 89 and rank all pooled histograms by the
# likelihood they assign to its transition sequence (top 10 are returned).
mel = mels_from_tuples([tuples[89]])[0]
# encoded_mel = encode_mel(mel[:100,:])
print(np.shape(mel))
likeliest_mel(mel, clustered_hists, 10)

(419, 13)
1600
0
200
400
600
800
1000
1200
1400


(array([1267, 1580, 1510,   59, 1531,  791,  627,  858, 1528,  228]),
 array([-13815.48370268, -13819.95080261, -13824.71646915, -13829.79670764,
        -13830.2118661 , -13830.53680435, -13831.73751549, -13832.11319527,
        -13834.71914204, -13834.71992995]))

In [47]:
# Rank all pooled histograms by the likelihood they assign to clip 0's own transition histogram (top 10).
likeliest_hist(hists[0], clustered_hists, 10)

1600
0
200
400
600
800
1000
1200
1400


(array([1029, 1246,  863,  959,  212, 1391,  523,  850,  657, 1161]),
 array([-6456.83595584, -6459.00697724, -6459.26115932, -6460.93123733,
        -6461.73046159, -6462.37650453, -6462.71506957, -6463.22854251,
        -6463.27241612, -6463.39803372]))

In [48]:
# Second metadata field of the query clip (index 0) - its class, judging by the displayed value.
tuples[0][1]

'Oboe'

In [49]:
# Same field for clip 1029, the top-ranked index in the likeliest_hist output above.
tuples[1029][1]

'Harmonica'

In [53]:
# Same field for clip 212, the fifth-ranked index in the likeliest_hist output above.
tuples[212][1]

'Oboe'