In [1]:
#!/usr/bin/python

from tensorflow.contrib.learn.python.learn.datasets import base
import tensorflow as tf
import numpy as np
import os,sys
sys.path.insert(0, './scripts')
import py_compile
py_compile.compile('scripts/ivector_tools.py')
py_compile.compile('scripts/siamese_model_phone_ru.py')
import ivector_dataset
import siamese_model_phone_ru as siamese_model
import ivector_tools as it


In [2]:
def get_dataset_size(dict_file,feat_file):
    """Count the lines of the dictionary file and of the feature file.

    Args:
        dict_file: path to the dictionary file; its line count is used as
            the feature dimension.
        feat_file: path to the feature file; its line count is used as the
            number of utterances.

    Returns:
        Tuple ``(dict_dim, feat_len)`` holding the two line counts.
    """
    # `with` guarantees both handles are closed even if reading raises.
    with open(dict_file) as f:
        dict_dim = sum(1 for _ in f)
    with open(feat_file) as f:
        feat_len = sum(1 for _ in f)
    return dict_dim, feat_len

def get_feat_label(dict_file, feat_file):
    """Read sparse feature vectors, labels and utterance names from a file.

    Each line of ``feat_file`` is whitespace separated:
    ``<name> <label> <index>:<value> <index>:<value> ...`` where ``index`` is
    1-based (it is shifted by -1 before being used as a column).

    Args:
        dict_file: path to the dictionary file; its line count fixes the
            number of feature columns.
        feat_file: path to the feature file; one utterance per line.

    Returns:
        Tuple ``(features, labels, names)``:
        ``features`` is a float32 array of shape (n_lines, dict_dim),
        ``labels`` is an int8 array of shape (n_lines,),
        ``names`` is a list with the first field of every line.
    """
    dict_dim, feat_len = get_dataset_size(dict_file,feat_file)
    features = np.zeros((feat_len,dict_dim),dtype='float32')
    labels = np.zeros((feat_len),dtype='int8')
    names = []
    # `with` closes the file even when a malformed line raises while parsing.
    with open(feat_file) as f:
        for count, line in enumerate(f):
            fields = line.split()  # split once instead of once per field
            names.append(fields[0])
            labels[count] = fields[1]
            for item in fields[2:]:
                elements = item.split(':')
                # Indices in the file are 1-based; columns are 0-based.
                # NOTE(review): an index of 0 would wrap to the last column.
                features[count][ int( elements[0] ) -1 ] = elements[1]

    return features, labels, names

In [3]:
# Context width; it selects which phone_ru feature/dictionary files are read.
context = 3
ctx_tag = str(context)

# The TRN dictionary is reused for every split so all feature matrices
# share the same number of columns.
dict_file = 'data/train.vardial2017/dict.phone_ru.c' + ctx_tag

# TRN split
feat_file = 'data/train.vardial2017/phone_ru.c' + ctx_tag
trn_features, trn_labels, trn_names = get_feat_label(dict_file, feat_file)

# DEV split
feat_file = 'data/dev.vardial2017/phone_ru.c' + ctx_tag
dev_features, dev_labels, dev_names = get_feat_label(dict_file, feat_file)

# TST split
feat_file = 'data/test.MGB3/phone_ru.c' + ctx_tag
tst_features, tst_labels, tst_names = get_feat_label(dict_file, feat_file)

# Show the three matrix shapes on one line, separated by single spaces.
print(' '.join(str(feats.shape) for feats in (trn_features, dev_features, tst_features)))

(13825, 51102) (1524, 51102) (1492, 51102)


In [4]:
languages = ['EGY','GLF','LAV','MSA','NOR']

# Load the tst.MGB3 reference: column 0 holds utterance names, column 1 the
# integer labels.
filename = 'data/test.MGB3/reference'
# `str` is used instead of the 'string' alias, which only legacy NumPy
# accepts; atleast_1d keeps a single-row reference file iterable.
tst_ref_names = np.atleast_1d(np.loadtxt(filename,usecols=[0],dtype=str))
tst_ref_labels = np.atleast_1d(np.loadtxt(filename,usecols=[1],dtype='int'))

# Map every reference name to its row once (first occurrence wins) so the
# alignment is a linear-time lookup rather than a nested scan.
ref_row_by_name = {}
for ref_row, name_ref in enumerate(tst_ref_names.tolist()):
    ref_row_by_name.setdefault(name_ref, ref_row)

# A test utterance without a reference entry would silently shift every
# following label against tst_features, so fail loudly instead.
missing_names = [name for name in tst_names if name not in ref_row_by_name]
if missing_names:
    raise ValueError('%d test utterances have no entry in %s (first: %s)'
                     % (len(missing_names), filename, missing_names[0]))

# Row of the reference file for each test utterance, in tst_names order.
tst_labels_index = np.array([ref_row_by_name[name] for name in tst_names], dtype=int)

# Reference labels re-ordered to match tst_features; kept as float64 like
# the array this cell produced before.
tst_labels = tst_ref_labels[tst_labels_index].astype('float64')

In [6]:
# Language modeling: one mean vector per language, pooled over the TRN and
# DEV utterances carrying that language's 1-based label.
lang_mean = []
for lang_id in range(1, len(languages) + 1):
    trn_rows = trn_features[trn_labels == lang_id]
    dev_rows = dev_features[dev_labels == lang_id]
    pooled_rows = np.concatenate((trn_rows, dev_rows), axis=0)
    lang_mean.append(pooled_rows.mean(axis=0))

# Stack the per-language means and length-normalise them.
lang_mean = it.length_norm(np.array(lang_mean))

print( np.shape(trn_features), np.shape(dev_features), np.shape(lang_mean),np.shape(tst_features) )

((13825, 51102), (1524, 51102), (5, 51102), (1492, 51102))


In [7]:
# Baseline performance on TST using CDS:
# score every utterance against every language mean and pick the best row.
tst_scores = np.dot(lang_mean, tst_features.T)
hypo_lang = tst_scores.argmax(axis=0)
# Labels are 1-based, argmax rows are 0-based; a non-zero difference is an error.
temp = (tst_labels - 1) - hypo_lang
acc = 1 - np.count_nonzero(temp) / float(tst_labels.size)
print('Baseline accurary on test dataset : %0.3f' % acc)

# Baseline performance on DEV using CDS
dev_scores = np.dot(lang_mean, dev_features.T)
hypo_lang = dev_scores.argmax(axis=0)
temp = (dev_labels - 1) - hypo_lang
acc = 1 - np.count_nonzero(temp) / float(dev_labels.size)
print('Baseline accurary on dev dataset : %0.3f' % acc)


Baseline accurary on test dataset : 0.368
Baseline accurary on dev dataset : 0.386


In [8]:
# Pair every TRN i-vector with the mean i-vector of every language.
#  example : for total 3 ivectors
#  ivector   ivector_p  label
#     1         1         1
#     1         2         0
#     1         3         0
#     2         1         0
#     2         2         1
#     ...      ...       ...
#     3         3         1
#
# The pairs are built with vectorised NumPy calls rather than Python loops:
# this avoids materialising huge temporary lists and no longer leaves a
# module-level `iter` variable shadowing the builtin.

# preparing pair labels
num_lang = len(languages)
num_utt = len(trn_labels)
# Language id (1-based) of each pair: 1,1,...,1,2,2,...  (language-major order)
pair_a_idx = np.repeat(np.arange(1, num_lang + 1), num_utt)
# Utterance row of each pair: 0,1,...,num_utt-1 repeated once per language
pair_b_idx = np.tile(np.arange(num_utt), num_lang)
# 1 when the utterance label equals the paired language id, else 0
sim = (pair_a_idx == trn_labels[pair_b_idx]).astype(int)
print(np.shape(pair_a_idx),np.shape(pair_b_idx), np.shape(sim))

#shuffling (one permutation applied to all three arrays keeps them aligned)
# NOTE(review): no random seed is set in this cell, so the order differs per run.
shuffleidx = np.arange(0,np.size(pair_a_idx))
np.random.shuffle(shuffleidx)
pair_a_idx = pair_a_idx[shuffleidx]
pair_b_idx = pair_b_idx[shuffleidx]
sim = sim[shuffleidx]

# Gather the actual vectors of each pair in one indexing operation.
data = lang_mean[pair_a_idx - 1]      # language mean (ids are 1-based)
data_p = trn_features[pair_b_idx]     # TRN utterance

# TRN dataset loading for feeding: target (sim==1) and non-target (sim==0)
tar_data = data[sim==1]
tar_data_p = data_p[sim==1]
tar_sim = sim[sim==1]
non_data = data[sim==0]
non_data_p = data_p[sim==0]
non_sim = sim[sim==0]
print(tar_data.shape, tar_data_p.shape,tar_sim.shape,non_data.shape,non_data_p.shape,non_sim.shape)

trn_tar = ivector_dataset.DataSet(tar_data,tar_sim)
trn_tar_p = ivector_dataset.DataSet(tar_data_p,tar_sim)

trn_non = ivector_dataset.DataSet(non_data,non_sim)
trn_non_p = ivector_dataset.DataSet(non_data_p,non_sim)


((69125,), (69125,), (69125,))
((13825, 51102), (13825, 51102), (13825,), (55300, 51102), (55300, 51102), (55300,))


In [9]:
# preparing pair labels of DEV dataset
# Every DEV i-vector is paired with the mean i-vector of every language.
# The pairs are built with vectorised NumPy calls rather than Python loops:
# this avoids materialising large temporary lists and no longer leaves a
# module-level `iter` variable shadowing the builtin.
num_lang = len(languages)
num_utt = len(dev_labels)
# Language id (1-based) of each pair: 1,1,...,1,2,2,...  (language-major order)
pair_a_idx = np.repeat(np.arange(1, num_lang + 1), num_utt)
# Utterance row of each pair: 0,1,...,num_utt-1 repeated once per language
pair_b_idx = np.tile(np.arange(num_utt), num_lang)
# 1 when the utterance label equals the paired language id, else 0
sim = (pair_a_idx == dev_labels[pair_b_idx]).astype(int)
print(np.shape(pair_a_idx),np.shape(pair_b_idx), np.shape(sim))

#shuffling (one permutation applied to all three arrays keeps them aligned)
# NOTE(review): no random seed is set in this cell, so the order differs per run.
shuffleidx = np.arange(0,np.size(pair_a_idx))
np.random.shuffle(shuffleidx)
pair_a_idx = pair_a_idx[shuffleidx]
pair_b_idx = pair_b_idx[shuffleidx]
sim = sim[shuffleidx]

# Gather the actual vectors of each pair in one indexing operation.
data = lang_mean[pair_a_idx - 1]      # language mean (ids are 1-based)
data_p = dev_features[pair_b_idx]     # DEV utterance

# DEV dataset loading for feeding: target (sim==1) and non-target (sim==0)
tar_data = data[sim==1]
tar_data_p = data_p[sim==1]
tar_sim = sim[sim==1]
non_data = data[sim==0]
non_data_p = data_p[sim==0]
non_sim = sim[sim==0]
print(tar_data.shape, tar_data_p.shape,tar_sim.shape,non_data.shape,non_data_p.shape,non_sim.shape)

dev_tar = ivector_dataset.DataSet(tar_data,tar_sim)
dev_tar_p = ivector_dataset.DataSet(tar_data_p,tar_sim)

dev_non = ivector_dataset.DataSet(non_data,non_sim)
dev_non_p = ivector_dataset.DataSet(non_data_p,non_sim)


((7620,), (7620,), (7620,))
((1524, 51102), (1524, 51102), (1524,), (6096, 51102), (6096, 51102), (6096,))


In [10]:
# Create the session, the siamese network and the training op.
sess = tf.InteractiveSession()

# The network input width is the number of feature columns.
feat_dim = trn_features.shape[1]
siamese = siamese_model.siamese(feat_dim)

# Step counter incremented by the optimiser; excluded from training.
global_step = tf.Variable(0, trainable=False)
# Learning rate starts at 0.01 and is multiplied by 0.99 every 5000 steps.
learning_rate = tf.train.exponential_decay(
    learning_rate=0.01,
    global_step=global_step,
    decay_steps=5000,
    decay_rate=0.99,
    staircase=True)

optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_step = optimizer.minimize(siamese.loss, global_step=global_step)

# The saver is created after all variables exist; then initialise them.
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

(?, 92920)
(?, 1500)
(?, 600)
(?, 92920)
(?, 1500)
(?, 600)


In [11]:
# Start training.
# Runs up to 500000 optimiser steps on mini-batches of pairs, evaluates DEV and
# TST accuracy every 100 steps, and saves a checkpoint whenever the TST
# accuracy improves. `max_acc` / `max_step` remember the best TST accuracy and
# the step at which it was reached.
batch_size = 40
max_acc = 0.0
max_step=0
saver_folder='snnmodel_phone_ru'
# Make sure the checkpoint directory exists before the first save.
if not os.path.exists(saver_folder):
    os.mkdir(saver_folder)
for step in range(500000):
  
    # Every 5th step draws its batch from the TRN pair sets; the other four
    # steps draw from the DEV pair sets.
    # NOTE(review): DEV pairs are therefore used as training data on 4 of 5
    # steps -- confirm this weighting is intended.
    # Each batch is `batch_size` target pairs followed by `batch_size`
    # non-target pairs. The x1/x2 datasets are advanced with the same batch
    # size and shuffle=False, presumably so their rows stay aligned -- confirm
    # against ivector_dataset.DataSet.next_batch.
    if step %5 ==0:
        batch_x1_a, batch_y1_a = trn_tar.next_batch(batch_size,shuffle=False)
        batch_x2_a, batch_y2_a = trn_tar_p.next_batch(batch_size,shuffle=False)
        batch_x1_b, batch_y1_b = trn_non.next_batch(batch_size,shuffle=False)
        batch_x2_b, batch_y2_b = trn_non_p.next_batch(batch_size,shuffle=False)
        batch_x1 = np.append(batch_x1_a,batch_x1_b,axis=0)
        batch_y1 = np.append(batch_y1_a,batch_y1_b,axis=0)
        batch_x2 = np.append(batch_x2_a,batch_x2_b,axis=0)
        batch_y2 = np.append(batch_y2_a,batch_y2_b,axis=0)
    else:
        batch_x1_a, batch_y1_a = dev_tar.next_batch(batch_size,shuffle=False)
        batch_x2_a, batch_y2_a = dev_tar_p.next_batch(batch_size,shuffle=False)
        batch_x1_b, batch_y1_b = dev_non.next_batch(batch_size,shuffle=False)
        batch_x2_b, batch_y2_b = dev_non_p.next_batch(batch_size,shuffle=False)
        batch_x1 = np.append(batch_x1_a,batch_x1_b,axis=0)
        batch_y1 = np.append(batch_y1_a,batch_y1_b,axis=0)
        batch_x2 = np.append(batch_x2_a,batch_x2_b,axis=0)
        batch_y2 = np.append(batch_y2_a,batch_y2_b,axis=0)
        
    # Map the 0/1 similarity labels to -1/+1 for the loss.
    # (batch_y2 is assembled above but not used in this cell.)
    batch_y = batch_y1*2-1
    
    # One optimiser step; also fetch the loss of this mini-batch.
    _, loss_v = sess.run([train_step, siamese.loss], feed_dict={
        siamese.x1: batch_x1,
        siamese.x2: batch_x2,
        siamese.y_: batch_y
    })
    
    # NOTE(review): quit() ends the interpreter session instead of only
    # leaving the loop -- confirm this is wanted when run in a notebook kernel.
    if np.isnan(loss_v):
        print ('Model diverged with loss = NAN')
        quit()
        
    # Periodic evaluation: embed DEV, TST and the language means with the
    # current network (output o1), then score by dot product and argmax.
    if step % 100 ==0:
        dev_features_siam = siamese.o1.eval({siamese.x1:dev_features})
        lang_mean_siam = siamese.o1.eval({siamese.x1:lang_mean})
        tst_features_siam = siamese.o1.eval({siamese.x1:tst_features})
        
        dev_scores = lang_mean_siam.dot(dev_features_siam.transpose() )
        hypo_lang = np.argmax(dev_scores,axis = 0)
        # Labels are 1-based, argmax rows 0-based; non-zero entries are errors.
        temp = ((dev_labels-1) - hypo_lang)
        acc =1- np.size(np.nonzero(temp)) / float( np.size(dev_labels) )
        
        tst_scores = lang_mean_siam.dot(tst_features_siam.transpose() )
        hypo_lang = np.argmax(tst_scores,axis = 0)
        temp = ((tst_labels-1) - hypo_lang)
        acc_tst =1- np.size(np.nonzero(temp)) / float(np.size(tst_labels))

        # Progress is printed and a checkpoint written only on a new best
        # TST accuracy.
        # NOTE(review): the checkpoint is selected on test-set accuracy, so the
        # accuracy later reported for that checkpoint is optimistically biased.
        if max_acc < acc_tst:
            max_acc = acc_tst
            max_step=step
            print ('Step %d: loss %f, Acc.: (DEV)%.3f (TST)%.3f, lr : %.5f' % (step,loss_v,acc,acc_tst,sess.run(learning_rate)))
            saver.save(sess, saver_folder+'/model'+str(step)+'.ckpt')
        # Early stop; only checked on evaluation steps (every 100 steps) and
        # based on the loss of the latest mini-batch alone.
        if loss_v<0.6:
            break


Step 0: loss 1.000433, Acc.: (DEV)0.287 (TST)0.299, lr : 0.01000
Step 3300: loss 1.000023, Acc.: (DEV)0.312 (TST)0.303, lr : 0.01000
Step 3400: loss 0.999996, Acc.: (DEV)0.320 (TST)0.305, lr : 0.01000
Step 3500: loss 1.000026, Acc.: (DEV)0.320 (TST)0.306, lr : 0.01000
Step 3700: loss 0.999985, Acc.: (DEV)0.316 (TST)0.307, lr : 0.01000
Step 3800: loss 1.000103, Acc.: (DEV)0.318 (TST)0.308, lr : 0.01000
Step 5100: loss 0.999928, Acc.: (DEV)0.316 (TST)0.312, lr : 0.00990
Step 5300: loss 0.999905, Acc.: (DEV)0.320 (TST)0.314, lr : 0.00990
Step 5400: loss 1.000028, Acc.: (DEV)0.332 (TST)0.316, lr : 0.00990
Step 8800: loss 0.999512, Acc.: (DEV)0.329 (TST)0.317, lr : 0.00990
Step 9600: loss 0.999934, Acc.: (DEV)0.329 (TST)0.318, lr : 0.00990
Step 11100: loss 0.999912, Acc.: (DEV)0.329 (TST)0.320, lr : 0.00980
Step 22700: loss 1.000817, Acc.: (DEV)0.333 (TST)0.325, lr : 0.00961
Step 23000: loss 0.985440, Acc.: (DEV)0.333 (TST)0.326, lr : 0.00961
Step 23500: loss 0.984918, Acc.: (DEV)0.364 (TST

In [12]:
# Reload the checkpoint written at the best-scoring training step.
print(max_step)
RESTORE_STEP = max_step
ckpt_path = saver_folder + '/model' + str(RESTORE_STEP) + '.ckpt'
saver.restore(sess, ckpt_path)

# Embed DEV, TST and the language means with the restored network.
dev_features_siam = siamese.o1.eval(feed_dict={siamese.x1: dev_features})
tst_features_siam = siamese.o1.eval(feed_dict={siamese.x1: tst_features})
lang_mean_siam = siamese.o1.eval(feed_dict={siamese.x1: lang_mean})

# Score each test utterance against each language mean; best row wins.
tst_scores = np.dot(lang_mean_siam, tst_features_siam.T)
hypo_lang = tst_scores.argmax(axis=0)
# Labels are 1-based, argmax rows are 0-based; non-zero entries are errors.
temp = (tst_labels - 1) - hypo_lang
acc = 1 - np.count_nonzero(temp) / float(np.size(tst_labels))
print('Final accurary on test dataset : %0.3f' % acc)

47300
INFO:tensorflow:Restoring parameters from snnmodel_phone_ru/model47300.ckpt
Final accurary on test dataset : 0.442


In [13]:
# Confusion matrix on the test set: rows are hypothesised languages (0-based
# argmax index), columns are the true languages.
num_lang = len(languages)
confusionmat = np.zeros((num_lang, num_lang))
for i,lang in enumerate(languages):
    # Hypotheses for the utterances whose true (1-based) label is this language.
    hypo_bylang = hypo_lang[ tst_labels == i+1]
    # Count per class id. np.histogram(x, 5) spreads 5 equal-width bins over
    # the data's own min..max, so counts land in the wrong rows whenever some
    # class is never hypothesised for this language; bincount indexes by the
    # class id itself.
    confusionmat[:,i] = np.bincount(hypo_bylang, minlength=num_lang)

# Row sums = utterances hypothesised as each language; column sums =
# utterances truly of each language.
# NOTE(review): a language that is never hypothesised gives a 0/0 precision (nan).
precision = np.diag(confusionmat) / np.sum(confusionmat,axis=1) #precision
recall = np.diag(confusionmat) / np.sum(confusionmat,axis=0) # recall

print('Confusion matrix')
print(confusionmat)
print('Precision')
print(precision)
print('Recall')
print(recall)

print('\n\n<Performance evaluation on Test dataset>')
print('Accurary  : %0.3f' %(acc))
print('Precision : %0.3f' %(np.mean(precision)))
print('Recall    : %0.3f' %(np.mean(recall)))

Confusion matrix
[[   1.    9.    2.    6.    1.]
 [ 180.  161.  225.   37.    0.]
 [   0.    0.    0.    0.   81.]
 [  30.   26.   26.  179.   15.]
 [  91.   54.   81.   40.  247.]]
Precision
[ 0.05263158  0.26699834  0.          0.64855072  0.48148148]
Recall
[ 0.00331126  0.644       0.          0.68320611  0.71802326]


<Performance evaluation on Test dataset>
Accurary  : 0.442
Precision : 0.290
Recall    : 0.410
