In [413]:
import numpy as np
import pandas as pd
import os
import math
from math import ceil
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
import json as JSON

In [624]:
def save_np_array(arr, filename):
    assert(type(filename) == str)
    np.save(filename, arr)

def load_np_array(filename):
    return np.load(filename)

In [625]:
EMBEDDINGS = 'output/embeddings/'
LAYER4 = EMBEDDINGS + 'layer4/'
LAYER5 = EMBEDDINGS + 'layer5/'
LAYER6 = EMBEDDINGS + 'layer6/'

In [626]:
currgram = '1_gram/'
TESTEMBEDDINGS = 'new_test_output/' + currgram + 'embeddings/'
TESTLAYER4 = TESTEMBEDDINGS + 'layer4/'
TESTLAYER5 = TESTEMBEDDINGS + 'layer5/'
TESTLAYER6 = TESTEMBEDDINGS + 'layer6/'

In [627]:
currtestlayer = TESTLAYER6

In [628]:
'''
Time in ms
Stride length in ms
Sampling time in ms
All lengths are relative to pool time, since pooling is a preprocessing step
'''
currlayer = LAYER6
stride_time = 100
pool_time = 100
row_time = 500
sampling_time = 20

pool_length = pool_time//sampling_time
feature_concats = row_time//pool_time
stride_length = stride_time//pool_time

In [629]:
def extract_files_dict():
    np_files = os.listdir(currlayer)
    file_dict = {}
    for file in np_files:
        orig = file.split('_')[0]
        if orig in file_dict:
            file_dict[orig].append(file)
        else:
            file_dict[orig] = [file]
    return file_dict

In [630]:
'''
Row - Corresponds roughly to 1 unit time
Bucket - Multiple Rows of features corresponding to 1 file
Number of Rows in a bucket depends on the stride
Columns - Same for all buckets, depends on the time
'''

def pool_one_batch(arr):
    return np.max(arr, axis=0)

def pool_full_matrix(arr):
    rows = np.zeros(arr.shape[1])
    for i in range(ceil(arr.shape[0]/pool_length)):
        start_row = i*stride_length
        end_row = (i+1)*stride_length
        row = pool_one_batch(arr[start_row:end_row,:])
        rows = np.vstack((rows,row))
    return rows[1:,:]

def concatenate_rows(arr, cols):
    row = np.ones(0)
    for i in range(arr.shape[0]):
        row = np.hstack((row, arr[i,:]))
    assert(len(row) <= cols)
    row = np.hstack((row, np.zeros(cols-len(row))))
    return row

def extract_rows(arr):
    cols = feature_concats*arr.shape[1]
    rows = np.zeros((0,cols))
    for i in range(ceil(arr.shape[0]/stride_length)):
        start_row = i*stride_length
        end_row = i*stride_length + feature_concats
        row = concatenate_rows(arr[start_row:end_row,:], cols)
        rows = np.vstack((rows, row))
    return rows
    
def extract_bucket(np_arrs):
    buck = np.zeros(feature_concats*np_arrs[0].shape[1])
    for arr in np_arrs:
        buck = np.vstack((buck, extract_rows(pool_full_matrix(arr))))
    return buck[1:,:]

In [631]:
def create_search_matrix():
    file_dict = extract_files_dict()
    buckets = np.zeros(0)
    '''Start Index, End Index + 1, Filename'''
    indexes = []
    for key in file_dict:
        np_arrs = []
        for file in file_dict[key]:
            np_arrs.append(load_np_array(currlayer + file))
        bucket = extract_bucket(np_arrs)
        if (buckets.shape[0] == 0):
            start_index = 0
            buckets = bucket
        else:
            start_index = buckets.shape[0]
            buckets = np.vstack((buckets, bucket))
        end_index = buckets.shape[0]
        indexes.append((start_index, end_index, key))
    return buckets, indexes

In [632]:
buckets, indexes = create_search_matrix()

In [633]:
print(buckets.shape)
print(len(indexes))

(5051, 145)
101


In [634]:
class KNN:
    def __init__(self, buckets, indexes, n_components=100):
        self.buckets = buckets
        self.indexes = indexes
        self.v2 = self.precompute(self.buckets).reshape((-1,1))
        self.buckmean = np.mean(self.buckets, axis=0)
        self.pca = decomposition.PCA(n_components=n_components)
        self.pcabuckets = self.buckets - self.buckmean
        self.pcabuckets = self.pca.fit_transform(self.pcabuckets)
        self.pcav2 = self.precompute(self.pcabuckets).reshape((-1,1))
        
    def precompute(self, buck):
        arr = []
        for i in range(buck.shape[0]):
            arr.append(buck[i,:].dot(buck[i,:]))
            ###Check this
#             self.buckets[i,:] = self.buckets[i,:]/math.sqrt(arr[-1])
#             arr[-1] = 1.0
        return np.array(arr)
    
    def preprocess_inference(self, phrase_matrix, do_pca=False):
        l2 = self.l2norms(phrase_matrix)
        if do_pca:
            return self.pca.transform(phrase_matrix - self.buckmean)
        else:
#             for i in range(phrase_matrix.shape[0]):
#                 phrase_matrix[i,:] = phrase_matrix[i,:]/math.sqrt(l2[0][i])
            return phrase_matrix
        
    def near_vector_scores(self, vector):
        return (self.v2 + self.buckets.dot(vector.reshape((-1,1))) ) - vector.dot(vector)
    
    def l2norms(self, matrix):
        return np.hstack([(np.ones(self.buckets.shape[0])*matrix[i,:].dot(matrix[i,:].T)).reshape((-1,1)) for i in range(matrix.shape[0])])
    
    def reduce_to_buckets(self, matrix_scores):
        return np.sum(np.vstack([np.min(matrix_scores[ind[0]:ind[1],:], axis=0) for ind in self.indexes]), axis=1)
    
    '''
    Here, phrase is a Matrix
    Each Phrase is arranged in a row
    '''
    def near_bucket_scores(self, phrase_matrix):
        matrix = self.preprocess_inference(phrase_matrix)
        print(matrix.shape)
        matrix_scores = self.v2 - 2*self.buckets.dot(matrix.T) + self.l2norms(matrix)
#         matrix_scores = 2-2*self.buckets.dot(matrix.T)
        print(matrix_scores.shape)
        bucket_scores = self.reduce_to_buckets(matrix_scores)
        return bucket_scores
    
    '''
    Here, phrase is a Matrix
    Each Phrase is arranged in a row
    '''
    def near_bucket_scores_pca(self, phrase_matrix):
        matrix = self.preprocess_inference(phrase_matrix, do_pca=True)
#         print(matrix.shape)
        matrix_scores = self.pcav2 - 2*self.pcabuckets.dot(matrix.T) + self.l2norms(matrix)
#         print(matrix_scores.shape)
        bucket_scores = self.reduce_to_buckets(matrix_scores)
        return bucket_scores
    
    def extract_nearby_buckets(self, phrase_matrix, threshold, do_pca=False):
        arr = []
        buck_scores = np.zeros(0)
        if do_pca:
            buck_scores = self.near_bucket_scores_pca(phrase_matrix)
        else:
            buck_scores = self.near_bucket_scores(phrase_matrix)
        for i, score in enumerate(buck_scores):
            if score < threshold:
#                 print(score)
                arr.append(i)
        return arr
        

In [635]:
knn = KNN(buckets, indexes, 100)

In [636]:
i = 22
print(knn.extract_nearby_buckets(buckets[indexes[i][0]:indexes[i][1]-1,:], 0.00001, do_pca=False))

(51, 145)
(5051, 51)
[22]


In [637]:
def extract_test_files():
#     print(currtestlayer)
    np_files = os.listdir(currtestlayer)
    file_dict = {}
    for file in np_files:
        orig = file.split('.')[0] + '.wav'
#         if orig in file_dict:
#             file_dict[orig].append(file)
#         else:
        file_dict[orig] = file
    return file_dict

with open('new_test_output/test_phrase_mapping_1.json') as f:
    ground_truth_1_gram = JSON.load(f)
with open('new_test_output/test_phrase_mapping_2.json') as f:
    ground_truth_2_gram = JSON.load(f)
with open('new_test_output/test_phrase_mapping_6.json') as f:
    ground_truth_6_gram = JSON.load(f)
    
# print(ground_truth_1_gram)
test_files_dict = extract_test_files()
# print(test_files_dict)

In [638]:
# def test(file_dict):
#     
#
test_buck = extract_bucket([load_np_array(currtestlayer + test_files_dict['TP_OD_1.wav'])])
print(test_buck.shape)
x = knn.extract_nearby_buckets(test_buck, 9000, do_pca=False)
print(ground_truth_2_gram['TP_OD_1.wav'])
print([indexes[y] for y in x])

(8, 145)
(8, 145)
(5051, 8)
['000001426']
[(355, 515, '000001355'), (1598, 1622, '000001381'), (1879, 1899, '000001390'), (2305, 2376, '000001399'), (2450, 2480, '000001401'), (2892, 2917, '000001405'), (3168, 3191, '000001411'), (3548, 3574, '000001419'), (4260, 4282, '000001437'), (4354, 4375, '000001440')]


In [664]:
test_buck = extract_bucket([load_np_array('sanity_test/embeddings/layer6/000001374_0.npy')[0:12]])
print(test_buck.shape)
# test_buck[10,3] = 1.0
# test_buck[10,5] = 3.0
# test_buck[10,7] = 2.0
x = knn.extract_nearby_buckets(test_buck[1:2,:], 1000, do_pca=False)
# print(ground_truth_2_gram['TP_OD_1.wav'])
print([indexes[y] for y in x])

(3, 145)
(1, 145)
(5051, 1)
[(355, 515, '000001355'), (1598, 1622, '000001381'), (1787, 1812, '000001387'), (1879, 1899, '000001390'), (2111, 2134, '000001396'), (2176, 2305, '000001398'), (2892, 2917, '000001405'), (3168, 3191, '000001411'), (3548, 3574, '000001419'), (3778, 3802, '000001426'), (3830, 3849, '000001428'), (4260, 4282, '000001437'), (4531, 4552, '000001445'), (4929, 5051, '000001451')]


In [623]:
test_buck[:1,:] #- buckets[:1,:]

array([[ -6.430233  ,  -5.92282915,  -5.93382025,  -6.78656721,
         -5.51487017,  -5.0272274 , -11.01980782,  -5.19922113,
         -5.34897995,  -4.45374918, -10.27648354, -11.87998295,
        -11.9861784 ,  -8.95866585,  -9.60204315,  -6.76203966,
         -6.08948326, -18.74069214,  -9.28351688,  -5.58074236,
         -3.26582623,  -8.85853004, -13.17168903,  -6.49048662,
        -15.57391453,  -5.33720255, -17.69064713,  -6.66455173,
         14.50683594,  -7.25215387,  -3.90565658,  -3.62913179,
         -7.21197414,  -7.80587053,  -3.86678934,  -9.76060677,
         -7.77058744,  -3.98853803,  -2.56414175,  -9.94606495,
        -10.40016365, -11.06022549,  -9.71681595, -10.44680214,
         -4.95135307,  -7.39020681, -18.38525772,  -9.37936878,
         -6.58867645,  -3.78512263,  -5.93035316, -14.55838871,
         -5.51263952, -18.73148155,  -2.24049592, -22.25370216,
         -6.67302227,  15.77115154, -11.19170189,  -3.8334446 ,
         -4.05012274,  -6.94281006, -10.

In [562]:
# print(indexes[x])
print(indexes[23])
print(indexes[18])
print(indexes[39])
print(indexes[45])
print(indexes[54])
print(indexes[60])
# indexes

(1311, 1342, '000001374')
(1086, 1139, '000001369')
(1879, 1899, '000001390')
(2111, 2134, '000001396')
(2892, 2917, '000001405')
(3168, 3191, '000001411')


In [None]:
for i in range(100):
    print(indexes[i][0])
    print(indexes[i][1])
    '''Sanity Check'''
    scores = knn.near_bucket_scores_pca(buckets[indexes[i][0]:indexes[i][1],:])
    # scores = knn.near_bucket_scores(np.vstack((np.zeros(290), 20*np.ones(290), 10*np.ones(290))))
    
    assert(np.argmin(scores) == i)
    (indexes[np.argmin(scores)], np.min(scores))

In [151]:
n = np.ones((5,3))
n[0][0] = 1
n[0][2] = 3
n[1][2] = 2
n[1][0] = 4
n[2][0] = 5
n[2][2] = 6
n[3][0] = 3
n[4][2] = 8
n = np.vstack((n,n,n))
print(n)
feature_concats = 3
stride_length = 3
print(extract_bucket([n]))

[[1. 1. 3.]
 [4. 1. 2.]
 [5. 1. 6.]
 [3. 1. 1.]
 [1. 1. 8.]
 [1. 1. 3.]
 [4. 1. 2.]
 [5. 1. 6.]
 [3. 1. 1.]
 [1. 1. 8.]
 [1. 1. 3.]
 [4. 1. 2.]
 [5. 1. 6.]
 [3. 1. 1.]
 [1. 1. 8.]]
[[3.33333333 1.         3.66666667]
 [1.66666667 1.         4.        ]
 [4.         1.         3.        ]
 [2.         1.         4.33333333]
 [3.         1.         5.        ]]
[[3.33333333 1.         3.66666667 1.66666667 1.         4.
  4.         1.         3.        ]
 [2.         1.         4.33333333 3.         1.         5.
  0.         0.         0.        ]]
[[3.33333333 1.         3.66666667 1.66666667 1.         4.
  4.         1.         3.        ]
 [2.         1.         4.33333333 3.         1.         5.
  0.         0.         0.        ]]


In [570]:
n[4,:] + n[4,:].dot(n[4,:].T)
n[0,2]

3.0

In [243]:
mat = n[:3,:]
print(mat)
print(mat[:,2].reshape((-1,1)))
print(np.hstack([(np.ones(mat.shape[1])*mat[i,:].dot(mat[i,:].T)).reshape((-1,1)) for i in range(mat.shape[0])]) + mat[:,2].reshape((-1,1))) 

[[1. 1. 3.]
 [4. 1. 2.]
 [5. 1. 6.]]
[[3.]
 [2.]
 [6.]]
[[14. 24. 65.]
 [13. 23. 64.]
 [17. 27. 68.]]


In [None]:
mappings = json.load('')