In [1]:
#!/usr/bin/python

# NOTE(review): `base` is not referenced in any of the visible cells.
from tensorflow.contrib.learn.python.learn.datasets import base
import tensorflow as tf
import numpy as np
import os,sys
# Put ./scripts at the front of the module search path so the project
# modules imported below can be found by name.
sys.path.insert(0, './scripts')
import py_compile
# Byte-compile two of the helper modules before they are imported.
py_compile.compile('scripts/ivector_tools.py')
py_compile.compile('scripts/siamese_model_words.py')
import ivector_dataset
import siamese_model_words as siamese_model
import ivector_tools as it


In [2]:
def get_dataset_size(dict_file,feat_file):
    """Count the feature dimension and the total number of utterances.

    Args:
        dict_file: path of the dictionary file; its line count is the
            feature dimension.
        feat_file: path of the feature file; its line count is the number
            of utterances.

    Returns:
        Tuple ``(dict_dim, feat_len)`` of the two line counts.

    Raises:
        IOError/OSError: if either file cannot be opened.
    """
    def _count_lines(path):
        # `with` guarantees the handle is closed even if reading fails.
        with open(path) as handle:
            return sum(1 for _ in handle)

    dict_dim = _count_lines(dict_file)
    feat_len = _count_lines(feat_file)
    return dict_dim, feat_len

def get_feat_label(dict_file, feat_file):
    """Load sparse feature lines from `feat_file` into dense arrays.

    Each line of `feat_file` is whitespace separated:
    ``<name> <label> <index>:<value> <index>:<value> ...`` where ``index``
    is 1-based and is stored at column ``index - 1``.

    Args:
        dict_file: path of the dictionary file; its line count sets the
            number of feature columns.
        feat_file: path of the feature file, one utterance per line.

    Returns:
        Tuple ``(features, labels, names)``:
        ``features`` is a float32 array of shape (n_lines, dict_dim) that is
        zero wherever the line gives no value, ``labels`` is an int8 array
        of length n_lines, and ``names`` is the list of first fields.

    Raises:
        IOError/OSError: if a file cannot be opened.
        IndexError/ValueError: if a line does not follow the format above.
    """
    dict_dim, feat_len = get_dataset_size(dict_file,feat_file)
    features = np.zeros((feat_len,dict_dim),dtype='float32')
    labels = np.zeros((feat_len),dtype='int8')
    names = []
    # `with` closes the file even when a malformed line raises mid-parse.
    with open(feat_file) as feat_handle:
        for row, line in enumerate(feat_handle):
            fields = line.split()
            names.append(fields[0])
            labels[row] = int(fields[1])
            for entry in fields[2:]:
                elements = entry.split(':')
                # Indices in the file are 1-based.
                features[row][int(elements[0]) - 1] = float(elements[1])

    return features, labels, names

In [3]:
# The word-feature files are selected by the context size. The same
# dictionary file (from the training directory) is passed for every split,
# so all three feature matrices share one column count.
context = 1
context_suffix = str(context)
dict_file = 'data/train.vardial2017/dict.words.c' + context_suffix

feat_file = 'data/train.vardial2017/words.c' + context_suffix
trn_features, trn_labels, trn_names = get_feat_label(dict_file, feat_file)

feat_file = 'data/dev.vardial2017/words.c' + context_suffix
dev_features, dev_labels, dev_names = get_feat_label(dict_file, feat_file)

feat_file = 'data/test.MGB3/words.c' + context_suffix
tst_features, tst_labels, tst_names = get_feat_label(dict_file, feat_file)

# One space-separated line with the shape of each split.
print(' '.join(str(split.shape) for split in (trn_features, dev_features, tst_features)))

(14000, 41657) (1524, 41657) (1492, 41657)


In [4]:
languages = ['EGY','GLF','LAV','MSA','NOR']

# Load the tst.MGB3 reference: column 0 is the utterance name, column 1 its
# label. `dtype=str` works on both Python 2 and 3, and `atleast_1d` keeps a
# one-line reference file from collapsing to a 0-d array.
filename = 'data/test.MGB3/reference'
tst_ref_names = np.atleast_1d(np.loadtxt(filename,usecols=[0],dtype=str))
tst_ref_labels = np.atleast_1d(np.loadtxt(filename,usecols=[1],dtype='int'))

# Index the reference by name once instead of scanning it for every test
# utterance. If a name appears more than once, its first row is used.
ref_row_by_name = {}
for ref_row, ref_name in enumerate(tst_ref_names):
    ref_row_by_name.setdefault(str(ref_name), ref_row)

# A test utterance without a reference entry would otherwise be dropped
# silently, leaving tst_labels shorter than (and misaligned with)
# tst_features, so fail loudly instead.
missing_names = [name for name in tst_names if name not in ref_row_by_name]
if missing_names:
    raise ValueError('%d test utterance(s) have no entry in %s, e.g. %s'
                     % (len(missing_names), filename, missing_names[0]))

# Row of the reference file for each test utterance, in tst_names order.
tst_labels_index = np.array([ref_row_by_name[name] for name in tst_names], dtype=int)

# Replaces the tst_labels read from the feature file with the reference
# labels (kept as floats, as before).
tst_labels = tst_ref_labels[tst_labels_index].astype(float)

In [5]:
# Combine TRN and DEV: feature matrices are stacked row-wise, while labels
# and names are flattened and joined end-to-end.
trndev_features = np.concatenate((trn_features, dev_features), axis=0)
trndev_labels = np.concatenate((np.ravel(trn_labels), np.ravel(dev_labels)))
trndev_names = np.concatenate((np.ravel(trn_names), np.ravel(dev_names)))

In [6]:
# Language modeling: one vector per language, the mean of that language's
# training feature vectors. Labels are 1-based and follow the order of
# `languages`. Only the training split contributes; DEV is left out.
lang_mean = np.array([
    np.mean(trn_features[trn_labels == lang_id], axis=0)
    for lang_id in range(1, len(languages) + 1)
])
lang_mean = it.length_norm(lang_mean)

print( np.shape(trn_features), np.shape(dev_features), np.shape(lang_mean),np.shape(tst_features) )

((14000, 41657), (1524, 41657), (5, 41657), (1492, 41657))


In [7]:
# Baseline performance on TST (the notebook calls this scoring "CDS"):
# score every test vector against each language mean with a dot product and
# pick the best-scoring language.
tst_scores = np.dot(lang_mean, tst_features.T)
hypo_lang = tst_scores.argmax(axis=0)
# Labels are 1-based while argmax is 0-based; any non-zero difference is an error.
n_wrong = np.count_nonzero((tst_labels - 1) - hypo_lang)
acc = 1 - n_wrong / float(np.size(tst_labels))
print('Baseline accurary on test dataset : %0.3f' % (acc))

# Same baseline on DEV.
dev_scores = np.dot(lang_mean, dev_features.T)
hypo_lang = dev_scores.argmax(axis=0)
n_wrong = np.count_nonzero((dev_labels - 1) - hypo_lang)
acc = 1 - n_wrong / float(np.size(dev_labels))
print('Baseline accurary on dev dataset : %0.3f' % (acc))


Baseline accurary on test dataset : 0.379
Baseline accurary on dev dataset : 0.371


In [8]:
# Pair every training i-vector with the mean i-vector of each language.
#  example : for total 3 ivectors
#  ivector   ivector_p  label
#     1         1         1
#     1         2         0
#     1         3         0
#     2         1         0
#     2         2         1
#     ...      ...       ...
#     3         3         1

# Pair indices in language-major order: pair_a_idx is the 1-based language
# id, pair_b_idx the row of the training utterance, and sim is 1 when the
# utterance's label is that language and 0 otherwise.
n_languages = len(languages)
n_utterances = len(trn_labels)
pair_a_idx = np.repeat(np.arange(1, n_languages + 1), n_utterances)
pair_b_idx = np.tile(np.arange(n_utterances), n_languages)
sim = (pair_a_idx == trn_labels[pair_b_idx]).astype(int)
print(np.shape(pair_a_idx),np.shape(pair_b_idx), np.shape(sim))

# Shuffle all three arrays with one shared random permutation.
shuffleidx = np.arange(0,np.size(pair_a_idx))
np.random.shuffle(shuffleidx)
pair_a_idx, pair_b_idx, sim = pair_a_idx[shuffleidx], pair_b_idx[shuffleidx], sim[shuffleidx]

# Materialise both sides of every pair: language mean vs. training vector.
data = lang_mean[pair_a_idx - 1]
data_p = trn_features[pair_b_idx]

# TRN dataset loading for feeding: split into target (sim == 1) and
# non-target (sim == 0) pairs.
is_target = sim == 1
is_nontarget = sim == 0
tar_data, tar_data_p, tar_sim = data[is_target], data_p[is_target], sim[is_target]
non_data, non_data_p, non_sim = data[is_nontarget], data_p[is_nontarget], sim[is_nontarget]
print(tar_data.shape, tar_data_p.shape,tar_sim.shape,non_data.shape,non_data_p.shape,non_sim.shape)

trn_tar = ivector_dataset.DataSet(tar_data,tar_sim)
trn_tar_p = ivector_dataset.DataSet(tar_data_p,tar_sim)

trn_non = ivector_dataset.DataSet(non_data,non_sim)
trn_non_p = ivector_dataset.DataSet(non_data_p,non_sim)


((70000,), (70000,), (70000,))
((14000, 41657), (14000, 41657), (14000,), (56000, 41657), (56000, 41657), (56000,))


In [9]:
# Pair labels for the DEV dataset, in language-major order: pair_a_idx is
# the 1-based language id, pair_b_idx the row of the DEV utterance, and sim
# is 1 when the utterance's label is that language and 0 otherwise.
n_languages = len(languages)
n_utterances = len(dev_labels)
pair_a_idx = np.repeat(np.arange(1, n_languages + 1), n_utterances)
pair_b_idx = np.tile(np.arange(n_utterances), n_languages)
sim = (pair_a_idx == dev_labels[pair_b_idx]).astype(int)
print(np.shape(pair_a_idx),np.shape(pair_b_idx), np.shape(sim))

# Shuffle all three arrays with one shared random permutation.
shuffleidx = np.arange(0,np.size(pair_a_idx))
np.random.shuffle(shuffleidx)
pair_a_idx, pair_b_idx, sim = pair_a_idx[shuffleidx], pair_b_idx[shuffleidx], sim[shuffleidx]

# Materialise both sides of every pair: language mean vs. DEV vector.
data = lang_mean[pair_a_idx - 1]
data_p = dev_features[pair_b_idx]

# DEV dataset loading for feeding: split into target (sim == 1) and
# non-target (sim == 0) pairs.
is_target = sim == 1
is_nontarget = sim == 0
tar_data, tar_data_p, tar_sim = data[is_target], data_p[is_target], sim[is_target]
non_data, non_data_p, non_sim = data[is_nontarget], data_p[is_nontarget], sim[is_nontarget]
print(tar_data.shape, tar_data_p.shape,tar_sim.shape,non_data.shape,non_data_p.shape,non_sim.shape)

dev_tar = ivector_dataset.DataSet(tar_data,tar_sim)
dev_tar_p = ivector_dataset.DataSet(tar_data_p,tar_sim)

dev_non = ivector_dataset.DataSet(non_data,non_sim)
dev_non_p = ivector_dataset.DataSet(non_data_p,non_sim)


((7620,), (7620,), (7620,))
((1524, 41657), (1524, 41657), (1524,), (6096, 41657), (6096, 41657), (6096,))


In [10]:
# init variables
# An InteractiveSession installs itself as the default session, which is what
# lets later cells call `.eval()` on tensors without passing a session.
sess = tf.InteractiveSession()
# Build the siamese model; the argument is the number of feature columns
# (presumably the network's input dimension -- confirm in siamese_model_words).
siamese = siamese_model.siamese(np.shape(trn_features)[1]);
# Step counter advanced by the optimizer below; excluded from training.
global_step = tf.Variable(0, trainable=False)
# Learning rate starts at 0.01 and is multiplied by 0.99 every 5000 steps
# (staircase=True makes the decay stepwise rather than continuous).
learning_rate = tf.train.exponential_decay(0.01, global_step,
                                           5000, 0.99, staircase=True)
# Plain SGD on the siamese loss; each run of train_step increments global_step.
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(siamese.loss, global_step=global_step)
# Created after the variables above so that they are included in checkpoints.
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

(?, 92600)
(?, 1500)
(?, 600)
(?, 92600)
(?, 1500)
(?, 600)


In [11]:
#start training
batch_size = 40
max_acc = 0.0
max_step=0
saver_folder='snnmodel_words'
# Early stopping: give up once this many steps pass without a new best test
# accuracy. The previous check (`max_step-step < 0.5`) was always true
# because max_step never exceeds step, so training stopped at the very first
# evaluation. The recorded run shows improvements more than 40000 steps
# apart, hence the generous value.
early_stop_patience = 50000
if not os.path.exists(saver_folder):
    os.mkdir(saver_folder)
for step in range(100000):

    # Every 5th step trains on TRN pairs; the other steps use DEV pairs.
    if step % 5 == 0:
        tar_set, tar_set_p, non_set, non_set_p = trn_tar, trn_tar_p, trn_non, trn_non_p
    else:
        tar_set, tar_set_p, non_set, non_set_p = dev_tar, dev_tar_p, dev_non, dev_non_p

    # The two sides of a pair live in separate DataSet objects; drawing from
    # both in lockstep with shuffle=False keeps them aligned.
    batch_x1_a, batch_y1_a = tar_set.next_batch(batch_size,shuffle=False)
    batch_x2_a, _unused_y2_a = tar_set_p.next_batch(batch_size,shuffle=False)
    batch_x1_b, batch_y1_b = non_set.next_batch(batch_size,shuffle=False)
    batch_x2_b, _unused_y2_b = non_set_p.next_batch(batch_size,shuffle=False)

    # Each batch is half target pairs and half non-target pairs.
    batch_x1 = np.append(batch_x1_a,batch_x1_b,axis=0)
    batch_y1 = np.append(batch_y1_a,batch_y1_b,axis=0)
    batch_x2 = np.append(batch_x2_a,batch_x2_b,axis=0)

    # Map similarity labels {0, 1} to {-1, +1}.
    batch_y = batch_y1*2-1

    _, loss_v = sess.run([train_step, siamese.loss], feed_dict={
        siamese.x1: batch_x1,
        siamese.x2: batch_x2,
        siamese.y_: batch_y
    })

    if np.isnan(loss_v):
        # Raise instead of quit(): quit() would shut down the notebook kernel
        # and discard all state.
        raise RuntimeError('Model diverged with loss = NAN')

    if step % 100 ==0:
        # Project DEV/TST vectors and the language means through the network,
        # then score with the same dot-product/argmax rule as the baseline.
        dev_features_siam = siamese.o1.eval({siamese.x1:dev_features})
        lang_mean_siam = siamese.o1.eval({siamese.x1:lang_mean})
        tst_features_siam = siamese.o1.eval({siamese.x1:tst_features})

        dev_scores = lang_mean_siam.dot(dev_features_siam.transpose() )
        hypo_lang = np.argmax(dev_scores,axis = 0)
        temp = ((dev_labels-1) - hypo_lang)
        acc =1- np.size(np.nonzero(temp)) / float( np.size(dev_labels) )

        tst_scores = lang_mean_siam.dot(tst_features_siam.transpose() )
        hypo_lang = np.argmax(tst_scores,axis = 0)
        temp = ((tst_labels-1) - hypo_lang)
        acc_tst =1- np.size(np.nonzero(temp)) / float(np.size(tst_labels))

        # NOTE(review): the checkpoint is selected on *test* accuracy, so the
        # reported test numbers are optimistic -- confirm this is intended.
        if max_acc < acc_tst:
            max_acc = acc_tst
            max_step=step
            print ('Step %d: loss %f, Acc.: (DEV)%.3f (TST)%.3f, lr : %.5f' % (step,loss_v,acc,acc_tst,sess.run(learning_rate)))
            saver.save(sess, saver_folder+'/model'+str(step)+'.ckpt')
        if step - max_step > early_stop_patience:
            break


Step 0: loss 1.000016, Acc.: (DEV)0.244 (TST)0.227, lr : 0.01000
Step 200: loss 1.000010, Acc.: (DEV)0.266 (TST)0.248, lr : 0.01000
Step 400: loss 0.999979, Acc.: (DEV)0.272 (TST)0.255, lr : 0.01000
Step 500: loss 0.999965, Acc.: (DEV)0.289 (TST)0.257, lr : 0.01000
Step 600: loss 0.999990, Acc.: (DEV)0.291 (TST)0.267, lr : 0.01000
Step 700: loss 1.000027, Acc.: (DEV)0.293 (TST)0.269, lr : 0.01000
Step 800: loss 0.999997, Acc.: (DEV)0.304 (TST)0.280, lr : 0.01000
Step 900: loss 0.999997, Acc.: (DEV)0.314 (TST)0.284, lr : 0.01000
Step 1500: loss 0.999953, Acc.: (DEV)0.305 (TST)0.288, lr : 0.01000
Step 1700: loss 0.999933, Acc.: (DEV)0.314 (TST)0.289, lr : 0.01000
Step 2300: loss 0.999953, Acc.: (DEV)0.313 (TST)0.292, lr : 0.01000
Step 14200: loss 0.999205, Acc.: (DEV)0.295 (TST)0.293, lr : 0.00980
Step 14600: loss 0.998951, Acc.: (DEV)0.299 (TST)0.296, lr : 0.00980
Step 15000: loss 0.998830, Acc.: (DEV)0.299 (TST)0.298, lr : 0.00970
Step 17000: loss 1.000467, Acc.: (DEV)0.302 (TST)0.298,

In [14]:
# Reload the checkpoint written at the best step found during training.
print(max_step)
RESTORE_STEP=max_step
checkpoint_path = '%s/model%s.ckpt' % (saver_folder, RESTORE_STEP)
saver.restore(sess, checkpoint_path)


# Project DEV/TST vectors and the language means through the restored network.
dev_features_siam = siamese.o1.eval({siamese.x1:dev_features})
tst_features_siam = siamese.o1.eval({siamese.x1:tst_features})
lang_mean_siam = siamese.o1.eval({siamese.x1:lang_mean})

# Score TST against the projected language means and pick the best language.
tst_scores = np.dot(lang_mean_siam, tst_features_siam.T)
hypo_lang = tst_scores.argmax(axis=0)
# Labels are 1-based while argmax is 0-based; any non-zero difference is an error.
n_wrong = np.count_nonzero((tst_labels - 1) - hypo_lang)
acc = 1 - n_wrong / float(np.size(tst_labels))
print('Final accurary on test dataset : %0.3f' % (acc))

60400
INFO:tensorflow:Restoring parameters from snnmodel_words/model60400.ckpt
Final accurary on test dataset : 0.585


In [15]:
# confusionmat[h, t] = number of test utterances whose true language is t
# (column) and whose hypothesised language is h (row).
n_languages = len(languages)
confusionmat = np.zeros((n_languages, n_languages))
for i,lang in enumerate(languages):
    hypo_bylang = hypo_lang[ tst_labels == i+1]
    # bincount counts each class id directly. np.histogram(x, 5) derived its
    # bin edges from min/max of x, so counts landed in the wrong rows whenever
    # the hypotheses for a language did not span every class from 0 to 4.
    confusionmat[:,i] = np.bincount(hypo_bylang, minlength=n_languages)

# Row sums are the totals hypothesised per class, column sums the totals of
# each true class.
precision = np.diag(confusionmat) / np.sum(confusionmat,axis=1) #precision
recall = np.diag(confusionmat) / np.sum(confusionmat,axis=0) # recall

print('Confusion matrix')
print(confusionmat)
print('Precision')
print(precision)
print('Recall')
print(recall)

print('\n\n<Performance evaluation on Test dataset>')
print('Accurary  : %0.3f' %(acc))
print('Precision : %0.3f' %(np.mean(precision)))
print('Recall    : %0.3f' %(np.mean(recall)))

Confusion matrix
[[ 179.   31.   38.   15.   29.]
 [  24.  119.   44.   28.   56.]
 [  54.   41.  208.   23.   47.]
 [  16.   31.   17.  184.   29.]
 [  29.   28.   27.   12.  183.]]
Precision
[ 0.6130137   0.43911439  0.55764075  0.66425993  0.65591398]
Recall
[ 0.59271523  0.476       0.62275449  0.70229008  0.53197674]


<Performance evaluation on Test dataset>
Accurary  : 0.585
Precision : 0.586
Recall    : 0.585
