In [1]:
from tensorflow.contrib.learn.python.learn.datasets import base

import tensorflow as tf
import numpy as np
import os,sys
# Put ./scripts first on the module search path so the project-local helper
# modules imported at the end of this cell can be found.
sys.path.insert(0, './scripts')
# Root directory of the i-vector data files read by the loading cell.
dataDir ='./data'


import py_compile
# Byte-compile the helper scripts before importing them.
py_compile.compile('scripts/ivector_dataset.py')
py_compile.compile('scripts/ivector_tools.py')
py_compile.compile('scripts/siamese_model.py')
# Project-local modules (resolved through the sys.path entry added above).
import ivector_dataset
import siamese_model
import ivector_tools as it

In [2]:
# Load the VarDial 2017 train and dev i-vectors for every language.
#
# Each `<LANG>.ivec` file is read with np.loadtxt: column 0 is the entry name
# and columns 1..400 are the 400 components of its i-vector.  Labels are the
# 1-based position of the language in `languages`.

languages = ['EGY','GLF','LAV','MSA','NOR']


def load_ivec_file(path, dim=400):
    """Read one i-vector text file.

    Parameters
    ----------
    path : str
        File to read.
    dim : int
        Number of i-vector components stored after the name column.

    Returns
    -------
    names : 1-D string array, one entry per row of the file.
    ivectors : 2-D float32 array of shape (rows, dim).
    """
    # atleast_1d / ndmin=2 keep the shapes stable for a file with a single row,
    # where np.loadtxt would otherwise squeeze away the row axis.
    names = np.atleast_1d(np.loadtxt(path, usecols=[0], dtype='string'))
    ivectors = np.loadtxt(path, usecols=range(1, dim + 1), dtype='float32', ndmin=2)
    return names, ivectors


# Collect the per-language pieces in lists and concatenate once at the end;
# growing an array with np.append inside the loop re-copies it every time.
trn_name_parts, trn_ivec_parts, trn_label_parts = [], [], []
dev_name_parts, dev_ivec_parts, dev_label_parts = [], [], []

for i, lang in enumerate(languages):
    label = i + 1

    # load train.vardial2017
    name, ivector = load_ivec_file(dataDir + '/train.vardial2017/%s.ivec' % lang)
    trn_name_parts.append(name)
    trn_ivec_parts.append(ivector)
    trn_label_parts.append(np.ones(np.size(name)) * label)

    # load dev.vardial2017
    name, ivector = load_ivec_file(dataDir + '/dev.vardial2017/%s.ivec' % lang)
    dev_name_parts.append(name)
    dev_ivec_parts.append(ivector)
    dev_label_parts.append(np.ones(np.size(name)) * label)

# The i-vectors are promoted to float64, the dtype the previous
# np.append-based accumulation produced, so downstream numerics are unchanged.
trn_names = np.concatenate(trn_name_parts)
trn_ivectors = np.concatenate(trn_ivec_parts, axis=0).astype(np.float64)
trn_labels = np.concatenate(trn_label_parts)

dev_names = np.concatenate(dev_name_parts)
dev_ivectors = np.concatenate(dev_ivec_parts, axis=0).astype(np.float64)
dev_labels = np.concatenate(dev_label_parts)
    
# Test set (MGB-3): same column layout as the train/dev files, i.e. the entry
# name in column 0 followed by the 400 i-vector components.
filename = '%s/test.MGB3/ivec_features' % dataDir
tst_names = np.loadtxt(filename, dtype='string', usecols=[0])
tst_ivectors = np.loadtxt(filename, dtype='float32', usecols=range(1,401))

# Pool train and dev into one set (train rows first, then dev rows).
trndev_ivectors = np.concatenate((trn_ivectors, dev_ivectors), axis=0)
trndev_labels = np.concatenate((np.ravel(trn_labels), np.ravel(dev_labels)))
trndev_name = np.concatenate((np.ravel(trn_names), np.ravel(dev_names)))


# Load the MGB-3 test reference (column 0: entry name, column 1: integer
# label) and reorder the test i-vectors so that row k of `tst_ivectors`
# belongs to reference entry k.
filename = dataDir + '/test.MGB3/reference'
tst_ref_names = np.loadtxt(filename,usecols=[0],dtype='string')
tst_ref_labels = np.loadtxt(filename,usecols=[1],dtype='int')

# Map each feature-file name to its row once, instead of scanning the whole
# name list for every reference entry.  If a name occurs more than once the
# first row is used.
row_of_name = {}
for row, name in enumerate(tst_names):
    row_of_name.setdefault(name, row)

# A reference entry without an i-vector would leave labels and i-vectors with
# different lengths and shift every later pairing, so fail loudly instead.
missing_names = [name_ref for name_ref in tst_ref_names if name_ref not in row_of_name]
if missing_names:
    raise ValueError('%d reference entries have no i-vector in the test feature file (first: %s)'
                     % (len(missing_names), missing_names[0]))

# Integer row indices, usable directly for indexing.
tst_ref_labels_index = np.array([row_of_name[name_ref] for name_ref in tst_ref_names], dtype=int)

tst_labels = tst_ref_labels
tst_ivectors = tst_ivectors[tst_ref_labels_index, :]

In [3]:
# Centering, whitening and length normalisation.
#
# The mean `m` and the whitening matrix `W` are estimated from the training
# i-vectors only and then applied to every set.
# NOTE: this cell overwrites its own inputs, so it must be run exactly once
# after the loading cell; running it again would transform the data twice.
m = trn_ivectors.mean(axis=0)
A = np.cov(trn_ivectors.T)
a, D, V = np.linalg.svd(A)
V = V.T
# Scale each direction by 1/sqrt(singular value); the tiny offset guards
# against division by zero.
W = V.dot(np.diag(1.0 / (np.sqrt(D) + 1e-10)))

# Step 1: subtract the training mean and apply W.
trn_ivectors = (trn_ivectors - m).dot(W)
trndev_ivectors = (trndev_ivectors - m).dot(W)
dev_ivectors = (dev_ivectors - m).dot(W)
tst_ivectors = (tst_ivectors - m).dot(W)

# Step 2: length normalisation via the project helper `it.length_norm`
# (presumably scales every row to unit norm -- confirm in ivector_tools).
trn_ivectors = it.length_norm(trn_ivectors)
trndev_ivectors = it.length_norm(trndev_ivectors)
dev_ivectors = it.length_norm(dev_ivectors)
tst_ivectors = it.length_norm(tst_ivectors)

# Language modelling: one model vector per language, the mean of all train
# and dev i-vectors carrying that language's 1-based label.
# (To build the models from the training set alone, average only the
# training rows.)
lang_mean = []
for i, lang in enumerate(languages):
    trn_rows = trn_ivectors[trn_labels == i + 1]
    dev_rows = dev_ivectors[dev_labels == i + 1]
    pooled_rows = np.concatenate((trn_rows, dev_rows), axis=0)
    lang_mean.append(pooled_rows.mean(axis=0))

# Stack into a (languages x dims) array and length-normalise the models.
lang_mean = it.length_norm(np.array(lang_mean))

print( np.shape(trn_ivectors), np.shape(dev_ivectors), np.shape(lang_mean),np.shape(tst_ivectors) )


# Score every test i-vector against every language model with a dot product
# (rows: languages, columns: test entries) and pick the best-scoring language.
tst_scores = lang_mean.dot(tst_ivectors.transpose() )
hypo_lang = np.argmax(tst_scores,axis = 0)

# Reference labels are 1-based, hypotheses are 0-based model indices.
acc = 1 - np.count_nonzero((tst_labels-1) - hypo_lang) / float(np.size(tst_labels))
print('Final accurary on test dataset : %0.3f' %(acc))

# Confusion matrix: rows are hypothesised languages, columns are reference
# languages.  np.bincount with minlength counts each class id 0..n_lang-1
# directly.  np.histogram(x, 5) would instead spread its 5 bins over
# [min(x), max(x)], so the counts would end up in the wrong rows whenever a
# class id is absent from the hypotheses of one reference language.
n_lang = len(languages)
confusionmat = np.zeros((n_lang, n_lang))
for i,lang in enumerate(languages):
    hypo_bylang = hypo_lang[ tst_labels == i+1]
    confusionmat[:,i] = np.bincount(hypo_bylang, minlength=n_lang)

# Row sums = number of times a language was hypothesised  -> precision.
# Column sums = number of reference entries of a language -> recall.
# (A language that is never hypothesised has a zero row sum and yields nan.)
precision = np.diag(confusionmat) / np.sum(confusionmat,axis=1) #precision
recall = np.diag(confusionmat) / np.sum(confusionmat,axis=0) # recall

print('Confusion matrix')
print(confusionmat)
print('Precision')
print(precision)
print('Recall')
print(recall)

print('\n\n<Performance evaluation on Test dataset : CDS (baseline) >')
print('Accurary  : %0.3f' %(acc))
print('Precision : %0.3f' %(np.mean(precision)))
print('Recall    : %0.3f' %(np.mean(recall)))

((13825, 400), (1524, 400), (5, 400), (1492, 400))
Final accurary on test dataset : 0.603
Confusion matrix
[[ 192.   14.   40.   10.   46.]
 [  15.  118.   34.    8.   20.]
 [  65.   83.  221.   16.  102.]
 [  23.   28.   24.  225.   32.]
 [   7.    7.   15.    3.  144.]]
Precision
[ 0.63576159  0.60512821  0.45379877  0.67771084  0.81818182]
Recall
[ 0.63576159  0.472       0.66167665  0.85877863  0.41860465]


<Performance evaluation on Test dataset : CDS (baseline) >
Accurary  : 0.603
Precision : 0.638
Recall    : 0.609


In [4]:
# LDA: estimate a projection from the pooled train+dev i-vectors and apply it
# to every set, then length-normalise again.
# NOTE: this cell overwrites its own inputs, so run it exactly once.

# `train_languages_num` holds, for every train+dev row, the 0-based index of
# its label among the sorted unique labels.  The unique labels go into their
# own name so that `languages` keeps holding the language names.
lang_ids, train_languages_num = np.unique(trndev_labels,return_inverse=True)

# Keep one direction fewer than there are classes (4 for the 5 languages).
n_lda_dims = len(lang_ids) - 1

# `it.lda2` is a project helper; presumably it returns the projection
# directions as columns, most discriminative first -- confirm in
# ivector_tools.  Only the real part of the kept columns is used.
V = it.lda2(trndev_ivectors,train_languages_num)
V = np.real(V[:,0:n_lda_dims])
trn_ivectors = np.matmul(trn_ivectors,V)
dev_ivectors = np.matmul(dev_ivectors,V)
tst_ivectors = np.matmul(tst_ivectors,V)
trndev_ivectors = np.matmul(trndev_ivectors,V)



trn_ivectors = it.length_norm(trn_ivectors)
trndev_ivectors = it.length_norm(trndev_ivectors)
dev_ivectors = it.length_norm(dev_ivectors)
tst_ivectors = it.length_norm(tst_ivectors)


# Language modelling in the projected space: one model vector per language,
# the mean of all train and dev i-vectors carrying that language's 1-based
# label.
# (To build the models from the training set alone, average only the
# training rows.)
lang_mean = []
for i, lang in enumerate(languages):
    trn_rows = trn_ivectors[trn_labels == i + 1]
    dev_rows = dev_ivectors[dev_labels == i + 1]
    pooled_rows = np.concatenate((trn_rows, dev_rows), axis=0)
    lang_mean.append(pooled_rows.mean(axis=0))

# Stack into a (languages x dims) array and length-normalise the models.
lang_mean = it.length_norm(np.array(lang_mean))

print( np.shape(trn_ivectors), np.shape(dev_ivectors), np.shape(lang_mean),np.shape(tst_ivectors) )


# Score every test i-vector against every language model with a dot product
# (rows: languages, columns: test entries) and pick the best-scoring language.
tst_scores = lang_mean.dot(tst_ivectors.transpose() )
hypo_lang = np.argmax(tst_scores,axis = 0)

# Reference labels are 1-based, hypotheses are 0-based model indices.
acc = 1 - np.count_nonzero((tst_labels-1) - hypo_lang) / float(np.size(tst_labels))
print('Final accurary on test dataset : %0.3f' %(acc))

# Confusion matrix: rows are hypothesised languages, columns are reference
# languages.  np.bincount with minlength counts each class id 0..n_lang-1
# directly.  np.histogram(x, 5) would instead spread its 5 bins over
# [min(x), max(x)], so the counts would end up in the wrong rows whenever a
# class id is absent from the hypotheses of one reference language.
n_lang = len(languages)
confusionmat = np.zeros((n_lang, n_lang))
for i,lang in enumerate(languages):
    hypo_bylang = hypo_lang[ tst_labels == i+1]
    confusionmat[:,i] = np.bincount(hypo_bylang, minlength=n_lang)

# Row sums = number of times a language was hypothesised  -> precision.
# Column sums = number of reference entries of a language -> recall.
# (A language that is never hypothesised has a zero row sum and yields nan.)
precision = np.diag(confusionmat) / np.sum(confusionmat,axis=1) #precision
recall = np.diag(confusionmat) / np.sum(confusionmat,axis=0) # recall

print('Confusion matrix')
print(confusionmat)
print('Precision')
print(precision)
print('Recall')
print(recall)

print('\n\n<Performance evaluation on Test dataset : LDA+CDS>')
print('Accurary  : %0.3f' %(acc))
print('Precision : %0.3f' %(np.mean(precision)))
print('Recall    : %0.3f' %(np.mean(recall)))

((13825, 4), (1524, 4), (5, 4), (1492, 4))
Final accurary on test dataset : 0.628
Confusion matrix
[[ 200.   22.   46.   13.   40.]
 [  17.  145.   62.   10.   27.]
 [  47.   49.  172.    9.   54.]
 [  22.   23.   26.  224.   27.]
 [  16.   11.   28.    6.  196.]]
Precision
[ 0.62305296  0.55555556  0.51963746  0.69565217  0.76264591]
Recall
[ 0.66225166  0.58        0.51497006  0.85496183  0.56976744]


<Performance evaluation on Test dataset : LDA+CDS>
Accurary  : 0.628
Precision : 0.631
Recall    : 0.636
