In [0]:
import tensorflow as tf
import librosa
import pandas as pd
import numpy as np
import math
import pickle
from random import choice, randint

# Loading Audio Data

In [0]:
with open('hw4_trs.pkl', 'rb') as f:
    data = pickle.load(f)

trn_rows = data.shape[0]
col = data.shape[1]

data_inter = []
for i in range(len(data)):
  data_inter.append(np.transpose(np.abs(librosa.stft(data[i], n_fft=1024, hop_length=512))))
data_inter = np.asarray(data_inter)

data_stft=np.pad(data_inter, ((0,0),(0,13), (0,0)), 'constant')
print(data_stft.shape)

(500, 45, 513)


In [0]:
with open('hw4_tes.pkl', 'rb') as f:
    data_test = pickle.load(f)

tes_rows = data_test.shape[0]

data_tes_stft = []
for i in range(len(data_test)):
  data_tes_stft.append(np.transpose(np.abs(librosa.stft(data_test[i], n_fft=1024, hop_length=512))))
data_tes_stft = np.asarray(data_tes_stft)

print(data_tes_stft.shape)

(200, 45, 513)


In [0]:
tes_rows

200

# Creating Train Data - Positive and Negative Pairs

In [0]:
def data_prep(data_stft, rows):
  train_pairs = []
  train_labels = []
  L = 20
  interval = 10
  row_range = rows - interval +1
  for i in range(0, row_range, interval):
    temp = []
    label_temp = []
    for j in range(L):
      temp.append([data_stft[randint(i,i+interval-1)], data_stft[randint(i,i+interval-1)]])
      label_temp.append(1)

    for i in range(L):
      if i==0:
          temp.append([data_stft[randint(i,i+interval-1)],data_stft[randint(i+interval, rows-1)]])
      elif i==(row_range-1):
          temp.append([data_stft[randint(i,i+interval-1)],data_stft[randint(0, i-1)]])
      else:
          temp.append([data_stft[randint(i,i+interval-1)],data_stft[choice([randint(0, i-1),randint(i+interval, rows-1)])] ])
      label_temp.append(0)

    train_pairs.append(temp)
    train_labels.append(label_temp)
    
    trn_pairs_arr = np.asarray(train_pairs)
    trn_label_arr = np.asarray(train_labels)

  print("Length of training pairs and labels for L=", L, "is:", len(train_labels),"and", len(train_pairs))
  return trn_pairs_arr, trn_label_arr

In [0]:
trn_pairs_arr, trn_label_arr = data_prep(data_stft, trn_rows)
tes_pairs_arr, tes_label_arr = data_prep(data_tes_stft, tes_rows)

Length of training pairs and labels for L= 20 is: 50 and 50
Length of training pairs and labels for L= 20 is: 20 and 20


In [0]:
print(data_stft.shape, data_tes_stft.shape)

(500, 45, 513) (200, 45, 513)


In [0]:
print(trn_pairs_arr.shape, trn_label_arr.shape, tes_pairs_arr.shape, tes_label_arr.shape)

(50, 40, 2, 45, 513) (50, 40) (20, 40, 2, 45, 513) (20, 40)


# Tensorflow Model for Siamese Network

In [0]:
pred_left_placeholder = tf.placeholder(tf.float32, [40, 45, 513])
pred_right_placeholder = tf.placeholder(tf.float32,[40, 45, 513])

response_placeholder = tf.placeholder(tf.float32, [40])

In [0]:
def train_network(predictor_placeholder):
  input_layer = tf.reshape(predictor_placeholder,[40, 45, 513, 1])
  conv1 = tf.layers.conv2d(inputs=input_layer, filters=16,kernel_size=[4,4],activation=tf.nn.relu,padding='same')
  pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2,2],strides=[2,2],padding="same")
  conv2 = tf.layers.conv2d(inputs=pool1, filters=32,kernel_size=[2,2],activation=tf.nn.relu,padding='same')]
  pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2,2],strides=[2,2],padding='same')
  flatten_layer = tf.layers.flatten(pool2)
  dense = tf.layers.dense(inputs=flatten_layer, units=513,activation=tf.nn.tanh)
  dt_bound = tf.layers.dense(inputs=dense, units = 513)
  dropout = tf.layers.dropout(inputs = dt_bound, rate = 0.8)
  return dropout

In [0]:
with tf.variable_scope("model"):
  output_left = train_network(pred_left_placeholder)
with tf.variable_scope("model", reuse=True):
  output_right = train_network(pred_right_placeholder)

In [0]:
print(output_left.shape, output_right.shape)

(40, 513) (40, 513)


In [0]:
eucd2 = tf.pow(tf.subtract(output_left, output_right), 2)
eucd2 = tf.reduce_sum(eucd2, 1)
inner_prod = tf.sqrt(eucd2+1e-6, name="eucd")

In [0]:
sigmoid = tf.sigmoid(inner_prod)
sigmoid.shape

TensorShape([Dimension(40)])

In [0]:
def contrastive_loss(y_true, y_pred):
    error=tf.reduce_sum(-(tf.to_float(y_true))*tf.log(y_pred + (10**-6)) - (1 - tf.to_float(y_true)) * tf.log(1 - y_pred + (10**-6)))
    return error

In [0]:
alpha = 0.000001
epoch =500

contrastive_error = contrastive_loss(response_placeholder,sigmoid)

Instructions for updating:
Use tf.cast instead.


In [0]:
tf.trainable_variables()

[<tf.Variable 'model/conv2d/kernel:0' shape=(4, 4, 1, 16) dtype=float32_ref>,
 <tf.Variable 'model/conv2d/bias:0' shape=(16,) dtype=float32_ref>,
 <tf.Variable 'model/conv2d_1/kernel:0' shape=(2, 2, 16, 32) dtype=float32_ref>,
 <tf.Variable 'model/conv2d_1/bias:0' shape=(32,) dtype=float32_ref>,
 <tf.Variable 'model/dense/kernel:0' shape=(49536, 513) dtype=float32_ref>,
 <tf.Variable 'model/dense/bias:0' shape=(513,) dtype=float32_ref>,
 <tf.Variable 'model/dense_1/kernel:0' shape=(513, 513) dtype=float32_ref>,
 <tf.Variable 'model/dense_1/bias:0' shape=(513,) dtype=float32_ref>]

In [0]:
optimiser = tf.train.AdamOptimizer(learning_rate=alpha).minimize(contrastive_error)

Instructions for updating:
Use tf.cast instead.


# Training the model

In [0]:
#initialization
init_op = tf.global_variables_initializer()

In [0]:
sess=tf.Session()
sess.run(init_op)

for ep in range(epoch):
  for i in range(50):
    minibatch = trn_pairs_arr[i]
    minibatch_label = trn_label_arr[i]
    _, c  = sess.run([optimiser, contrastive_error], feed_dict={pred_left_placeholder: minibatch[:,0,:,:], pred_right_placeholder: minibatch[:,1,:,:], response_placeholder: minibatch_label})
  if ep%10 == 0:
    print("Epoch:", (ep + 1), "cost =", c )

Epoch: 1 cost = 130.73016
Epoch: 11 cost = 54.14184
Epoch: 21 cost = 36.4458
Epoch: 31 cost = 29.943949
Epoch: 41 cost = 26.38283
Epoch: 51 cost = 24.171864
Epoch: 61 cost = 22.794044
Epoch: 71 cost = 21.9437
Epoch: 81 cost = 21.421947
Epoch: 91 cost = 21.101414
Epoch: 101 cost = 20.89481
Epoch: 111 cost = 20.747108
Epoch: 121 cost = 20.629436
Epoch: 131 cost = 20.528507
Epoch: 141 cost = 20.439484
Epoch: 151 cost = 20.36015
Epoch: 161 cost = 20.290052
Epoch: 171 cost = 20.228945
Epoch: 181 cost = 20.176321
Epoch: 191 cost = 20.131243
Epoch: 201 cost = 20.092714
Epoch: 211 cost = 20.059872
Epoch: 221 cost = 20.031548
Epoch: 231 cost = 20.006935
Epoch: 241 cost = 19.98536
Epoch: 251 cost = 19.966288
Epoch: 261 cost = 19.949888
Epoch: 271 cost = 19.935284
Epoch: 281 cost = 19.922302
Epoch: 291 cost = 19.910564
Epoch: 301 cost = 19.899853
Epoch: 311 cost = 19.890013
Epoch: 321 cost = 19.880949
Epoch: 331 cost = 19.872519
Epoch: 341 cost = 19.864626
Epoch: 351 cost = 19.857147
Epoch: 361 c

In [0]:
output = []
sig = []
for i in range(20):
  minibatch = tes_pairs_arr[i]
  minibatch_label = tes_label_arr[i]
  _,c, sig = sess.run([optimiser, contrastive_error, sigmoid], feed_dict={pred_left_placeholder: minibatch[:,0,:,:], 
                                                                          pred_right_placeholder: minibatch[:,1,:,:], 
                                                                          response_placeholder: minibatch_label})
  output.append(sig)

In [0]:
#using sigmoid value 0.65 as decision boundary
output = np.asarray(output)
for i in range(20):
  for j in range(40):
    if output[i,j] < 0.65: 
      output[i,j] = int(0)
    else: 
      output[i,j] = int(1)
      
output = np.asarray(output, dtype = int)

In [0]:
count = 0
for i in range(20):
  for j in range(40):
    if output[i,j] == tes_label_arr[i,j]:
      count = count + 1

print("Accuracy:",count/800)

Accuracy: 0.65125
