In [None]:
import numpy as np
import pandas as pd
from copy import deepcopy
from collections import defaultdict
import random
from sklearn.utils import shuffle, class_weight
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.metrics import balanced_accuracy_score
import itertools
from keras.models import model_from_json
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, LSTM, GRU, RNN, CuDNNGRU, CuDNNLSTM, Bidirectional
from keras import backend as K
from keras import regularizers
import gensim.downloader

**Loading the word vectors** 

In [None]:
# Load the pretrained 'glove-twitter-25' word embedding through gensim's downloader.
# Later cells index this object by word to obtain that word's embedding vector.
glove_vectors = gensim.downloader.load('glove-twitter-25')



In [None]:
# Read the fixation-level eye-tracking table (one row per fixation).
# NOTE(review): hard-coded absolute path -- consider a configurable data directory.
fix_data = pd.read_csv('/content/drive/MyDrive/18sat_fixfinal.csv')

In [None]:
# Display the raw fixation table as loaded from the CSV.
fix_data

Unnamed: 0,RECORDING_SESSION_LABEL,TRIAL_INDEX,CURRENT_FIX_X,CURRENT_FIX_Y,CURRENT_FIX_PUPIL,CURRENT_FIX_DURATION,CURRENT_FIX_INTEREST_AREA_ID,CURRENT_FIX_INTEREST_AREA_LABEL,CURRENT_FIX_INTEREST_AREA_PIXEL_AREA,CURRENT_FIX_INTEREST_AREA_RUN_ID,...,Trial_Recycled_,total_page,type,book_name,book,page,RT,answer,correct_answer,page_name
0,msd001,1,59.8,125.4,1430.0,22,,,,,...,True,5,reading,dickens,1,1,25094.538413,1,-99,reading-dickens-1
1,msd001,1,348.7,182.0,1375.0,26,24.0,long,3520.0,1.0,...,True,5,reading,dickens,1,1,25094.538413,1,-99,reading-dickens-1
2,msd001,1,630.5,400.3,1365.0,216,72.0,safe,3136.0,1.0,...,True,5,reading,dickens,1,1,25094.538413,1,-99,reading-dickens-1
3,msd001,1,492.0,400.2,1440.0,125,69.0,boundless,7488.0,1.0,...,True,5,reading,dickens,1,1,25094.538413,1,-99,reading-dickens-1
4,msd001,1,526.6,390.5,1265.0,486,70.0,"world,",4992.0,1.0,...,True,5,reading,dickens,1,1,25094.538413,1,-99,reading-dickens-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463559,msd107,74,530.1,375.0,1337.0,82,,,,,...,False,13,question,flytrap,2,13,795.171013,3,-99,question-flytrap-13
463560,msd107,74,582.4,361.0,1349.0,161,,,,,...,False,13,question,flytrap,2,13,795.171013,3,-99,question-flytrap-13
463561,msd107,74,601.9,380.3,1320.0,236,,,,,...,False,13,question,flytrap,2,13,795.171013,3,-99,question-flytrap-13
463562,msd107,74,579.6,394.6,1304.0,46,,,,,...,False,13,question,flytrap,2,13,795.171013,3,-99,question-flytrap-13


In [None]:
# Columns copied into every fixation window. The order matters: group_windows treats
# positions 0-3 as numeric features and position 4 as the fixated word's label.
datacols = ['CURRENT_FIX_X', 'CURRENT_FIX_Y', 'CURRENT_FIX_PUPIL', 'CURRENT_FIX_DURATION','CURRENT_FIX_INTEREST_AREA_LABEL']

In [None]:
# Read the label table (one row per subject/book pair, see the display below).
# NOTE(review): hard-coded absolute path -- consider a configurable data directory.
fix_labels = pd.read_csv('/content/drive/MyDrive/18sat_labels.csv')

In [None]:
# Display the raw label table as loaded from the CSV.
fix_labels

Unnamed: 0,subj,book,acc,confidence,difficulty,familiarity,recognition,interest,pressured,sleepiness,sleephours,acc_level,subj_acc,language,sex,native,subj_acc_level
0,msd001,dickens,0.6,2,0,1,0,2,0,1,2,2,0.75,English,F,1,3
1,msd001,flytrap,0.8,2,1,2,0,2,1,1,2,3,0.75,English,F,1,3
2,msd001,genome,0.8,1,0,1,0,2,1,2,2,3,0.75,English,F,1,3
3,msd001,northpole,0.8,1,1,1,0,1,1,2,2,3,0.75,English,F,1,3
4,msd002,dickens,0.6,2,1,1,0,2,1,1,2,2,0.45,English,M,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,msd106,northpole,0.6,0,2,0,0,0,1,1,2,2,0.55,English,M,1,1
376,msd107,dickens,0.6,3,0,2,0,2,0,0,1,2,0.65,English,M,1,2
377,msd107,flytrap,0.8,1,1,1,0,2,1,1,1,3,0.65,English,M,1,2
378,msd107,genome,0.4,1,2,0,0,2,1,1,1,1,0.65,English,M,1,2


**PREPROCESSING THE LABELS**

In [None]:
# Preprocessing the labels
# Keep only the identifier and label columns, encode `sex` as 1 (F) / 0 (M), and collapse
# every 0-3 rated column to binary (0/1 -> 0, 2/3 -> 1).
# NOTE: this cell rebinds `fix_labels` from itself, so it is NOT idempotent -- running it a
# second time would binarise the already-binarised values again. Re-load the CSV first.
labelcols = ['subj', 'book',
            'acc_level', 'subj_acc_level', 
            'confidence', 'difficulty', 'familiarity', 'recognition', 
            'interest', 'pressured', 'sleepiness', 'sleephours',
            'sex', 'native']

# Explicit copy: assigning into a column-selected slice is what triggered pandas'
# SettingWithCopyWarning; an independent frame makes the writes below unambiguous.
fix_labels = fix_labels[labelcols].copy()

fix_labels['sex'] = fix_labels['sex'].replace(['F', 'M'], [1,0])

# Columns that are already binary and must not be re-mapped.
binarycols = ('recognition', 'sex', 'native')

# Everything else gets the 0/1/2/3 -> 0/0/1/1 mapping. 'subj' and 'book' are included in
# this list but hold strings, so the numeric replacement leaves them unchanged.
subsetcols = [c for c in labelcols if c not in binarycols]

fix_labels[subsetcols] = fix_labels[subsetcols].replace([0,1,2,3], [0,0,1,1])

## frequency table per column
for column in fix_labels:
    print(fix_labels[column].value_counts(sort=False, dropna=False), '\n')

msd001    4
msd002    4
msd003    4
msd004    4
msd005    4
         ..
msd102    4
msd103    4
msd105    4
msd106    4
msd107    4
Name: subj, Length: 95, dtype: int64 

dickens      95
flytrap      95
genome       95
northpole    95
Name: book, dtype: int64 

1    231
0    149
Name: acc_level, dtype: int64 

1    180
0    200
Name: subj_acc_level, dtype: int64 

1    161
0    219
Name: confidence, dtype: int64 

0    252
1    128
Name: difficulty, dtype: int64 

0    291
1     89
Name: familiarity, dtype: int64 

0    373
1      7
Name: recognition, dtype: int64 

1    230
0    150
Name: interest, dtype: int64 

0    250
1    130
Name: pressured, dtype: int64 

0    233
1    147
Name: sleepiness, dtype: int64 

1    200
0    180
Name: sleephours, dtype: int64 

1    252
0    128
Name: sex, dtype: int64 

1    264
0    116
Name: native, dtype: int64 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [None]:
def preprocess_word(word):
  """Return `word` lower-cased with every non-alphabetic character removed."""
  return "".join(ch for ch in word.lower() if ch.isalpha())

**Converting the eye tracking data into fixation windows**

In [None]:
## preprocessing to window
delta = 10           # number of fixations taken on each side of a window centre
step = 2*delta+1     # stride between centres == window length, so windows do not overlap
def group_windows(fixationRows):
    """Cut a run of fixations into consecutive, non-overlapping windows.

    Parameters
    ----------
    fixationRows : pandas.DataFrame
        Fixation rows (in the notebook: one subject on one page) containing every column
        listed in `datacols`. The frame is not modified.

    Returns
    -------
    list
        One entry per window. Each window is a list of ``2*delta+1`` fixations and each
        fixation is the four numeric features followed by the word-embedding vector of
        the fixated word. Trailing fixations that do not fill a whole window are dropped.
    """
    # Re-index a copy from 0 so the label-based `.loc` slices below select by position.
    # (The in-place reset used previously mutated the caller's slice of a larger frame.)
    rows = fixationRows.reset_index(drop=True)
    embedding_dim = glove_vectors.vector_size
    windows = []
    for n in range(delta, len(rows)-delta, step):
        # `.loc` slicing is inclusive on both ends -> exactly 2*delta+1 rows.
        window = rows.loc[n-delta:n+delta,datacols].values.tolist()
        for i, fixation in enumerate(window):
            label = fixation[4]  # position of the word label within `datacols`
            # A fixation outside any interest area has a missing label. It must map to the
            # zero vector; stringifying it would look up the literal token "nan" instead.
            word = "" if pd.isna(label) else preprocess_word(str(label))
            # Membership test on the KeyedVectors object itself works across gensim
            # versions (the `.vocab` attribute was removed in gensim 4).
            if word in glove_vectors:
                word_vec = glove_vectors[word]
            else:
                word_vec = [0] * embedding_dim
            window[i] = list(fixation[0:4])+list(word_vec)
        windows.append(window)
    return windows

In [None]:
# Build the fixation windows for every (subject, page) combination of the SAT dataset
def generate_windata(fixation):
    """Group fixations by subject and page and window each group.

    Returns a nested dict ``{subject: {page_name: windows}}`` where ``windows`` is the
    list produced by `group_windows`. Every page name found anywhere in `fixation` gets
    an entry for every subject, even when that subject has no rows for it.
    """
    subjects = pd.unique(fixation['RECORDING_SESSION_LABEL'])
    pages = pd.unique(fixation['page_name'])
    windowData = {}
    for subject in subjects:
        rows_of_subject = fixation.loc[fixation['RECORDING_SESSION_LABEL'] == subject]
        windowData[subject] = {
            page: group_windows(rows_of_subject.loc[rows_of_subject['page_name'] == page])
            for page in pages
        }
    print ("window data ready")
    return windowData

In [None]:
## This function is used to create the dataset
def create_dataset(windowData, sc):
    """Flatten nested window data into a feature array plus one label row per window.

    Parameters
    ----------
    windowData : dict
        ``{subject: {page_name: [window, ...]}}``; page names look like
        ``'reading-dickens-1'`` (the book is the second '-'-separated field).
    sc : pandas.DataFrame or None
        Label table with at least the columns 'subj' and 'book'. When ``None`` the
        module-level ``fix_labels`` table is used.

    Returns
    -------
    (numpy.ndarray, pandas.DataFrame)
        The stacked windows and, in the same order, the label row of each window's
        (subject, book) pair. Label rows keep their index from the label table.

    Raises
    ------
    ValueError
        If a (subject, book) pair that owns at least one window does not match exactly
        one label row -- otherwise features and labels would silently go out of step.
    """
    label_table = fix_labels if sc is None else sc

    dataset = []
    label_positions = []  # positional row number in `label_table`, one per window
    for subject, pages in windowData.items():
        for article, windows in pages.items():
            if not windows:
                continue
            book = article.split('-')[1] # article = 'reading-dickens-1'
            mask = (label_table['subj'] == subject) & (label_table['book'] == book)
            matches = np.flatnonzero(mask.to_numpy())
            if len(matches) != 1:
                raise ValueError(
                    "expected exactly one label row for subject %r and book %r, found %d"
                    % (subject, book, len(matches)))
            dataset.extend(windows)
            label_positions.extend([matches[0]] * len(windows))

    # A single positional take replaces growing a DataFrame with pd.concat once per
    # window, which re-copied the whole frame on every iteration.
    labeldf = label_table.iloc[label_positions]
    return np.array(dataset), labeldf

In [None]:
# Keep only the fixations recorded while participants were reading the text,
# i.e. the rows whose `type` column equals 'reading'.
fix_data1 = fix_data[fix_data['type'] == 'reading']
fix_data1.head()

Unnamed: 0,RECORDING_SESSION_LABEL,TRIAL_INDEX,CURRENT_FIX_X,CURRENT_FIX_Y,CURRENT_FIX_PUPIL,CURRENT_FIX_DURATION,CURRENT_FIX_INTEREST_AREA_ID,CURRENT_FIX_INTEREST_AREA_LABEL,CURRENT_FIX_INTEREST_AREA_PIXEL_AREA,CURRENT_FIX_INTEREST_AREA_RUN_ID,...,Trial_Recycled_,total_page,type,book_name,book,page,RT,answer,correct_answer,page_name
0,msd001,1,59.8,125.4,1430.0,22,,,,,...,True,5,reading,dickens,1,1,25094.538413,1,-99,reading-dickens-1
1,msd001,1,348.7,182.0,1375.0,26,24.0,long,3520.0,1.0,...,True,5,reading,dickens,1,1,25094.538413,1,-99,reading-dickens-1
2,msd001,1,630.5,400.3,1365.0,216,72.0,safe,3136.0,1.0,...,True,5,reading,dickens,1,1,25094.538413,1,-99,reading-dickens-1
3,msd001,1,492.0,400.2,1440.0,125,69.0,boundless,7488.0,1.0,...,True,5,reading,dickens,1,1,25094.538413,1,-99,reading-dickens-1
4,msd001,1,526.6,390.5,1265.0,486,70.0,"world,",4992.0,1.0,...,True,5,reading,dickens,1,1,25094.538413,1,-99,reading-dickens-1


**Normalising the data before training**

In [None]:
# Optional min-max scaling of the numeric fixation features to the [0, 1] range.
# The minimum and maximum are taken over all reading fixations, per column.
datanorm = True
numcols = ['CURRENT_FIX_X','CURRENT_FIX_Y','CURRENT_FIX_PUPIL','CURRENT_FIX_DURATION']
# Either way the result is an independent deep copy of the reading fixations.
fixData = fix_data1.copy(deep=True)
if datanorm:
    fixData[numcols] = (
        fix_data1[numcols]
        .sub(fix_data1[numcols].min())
        .div(fix_data1[numcols].max() - fix_data1[numcols].min())
    )

**Generating the window data**

In [None]:
# Generating the window data using the above functions.
# The result is a nested dict: windowData[subject][page_name] -> list of windows.
windowData = generate_windata(fixData)

window data ready


**Splitting the data into train, validation, and test datasets**

In [None]:
## data split
# Three strategies are supported, selected by `datasplit`:
#   'subject' - whole readers are held out (60:20:20 of the subjects)
#   'record'  - individual windows are split at random (60:20:20)
#   'book'    - for every reader one book goes to validation, one to test, the rest to train
datasplit = 'book'
# One seed for every strategy, so that re-running the notebook reproduces the same
# partition (the book-wise split previously drew from the unseeded global RNG).
SPLIT_SEED = 23

if datasplit == 'subject':
    # subject-wise dataset split  (subject wise)
    # Here subject means reader, i.e. predicting a new reader's comprehension after
    # training with the fixation windows of other readers
    # current plan is to use 60:20:20 
    subjkeys = list(windowData.keys())
    random.Random(SPLIT_SEED).shuffle(subjkeys) #random shuffling
    N_totalsub = len(subjkeys)
    N_trainsub = round(0.6*N_totalsub)
    N_validsub = round(0.2*N_totalsub)
    N_testsub = N_totalsub - N_trainsub - N_validsub

    # Start with everything in train and move the held-out subjects out of it.
    windowData_train = deepcopy(windowData)
    windowData_valid = {}
    windowData_test = {}

    for i, subj in enumerate(subjkeys):
        if i < N_validsub:
            #print(subj, 'to valid')
            windowData_valid[subj] = windowData_train.pop(subj)
        elif i < N_validsub + N_testsub:
            #print(subj, 'to test')
            windowData_test[subj] = windowData_train.pop(subj)

    print("train subj #", len(list(windowData_train.keys())))
    print("valid subj #", len(list(windowData_valid.keys())))
    print("test subj #", len(list(windowData_test.keys())))

    ## create dataset
    X_train, labels_train = create_dataset(windowData_train, fix_labels)
    X_valid, labels_valid = create_dataset(windowData_valid, fix_labels)
    X_test, labels_test = create_dataset(windowData_test, fix_labels)
        
elif datasplit == 'record':
    # window-wise split: 60% train, then the remaining 40% halved into valid and test
    X, labels = create_dataset(windowData, fix_labels)
    X_train, X_test, labels_train, labels_test = train_test_split(X, labels, test_size=0.4, random_state=SPLIT_SEED)
    X_valid, X_test, labels_valid, labels_test = train_test_split(X_test, labels_test, test_size=0.5, random_state=SPLIT_SEED)

elif datasplit == 'book':
    # book-wise dataset split  
    ## current plan is to use 50:25:25 (2,1,1)
    ##  predicting a reader's comprehension for one unseen passage after training with that person's reading behavior from the two other passages

    subjkeys = list(windowData.keys())
    pagekeys = list(windowData[subjkeys[0]].keys())
    bookkeys = list(np.unique(fix_labels['book'])) # ['dickens' 'flytrap' 'genome' 'northpole']
    print('list of books:', bookkeys)

    # Start with everything in train and move the held-out pages out of it.
    windowData_train = deepcopy(windowData)
    windowData_valid = defaultdict(dict)
    windowData_test = defaultdict(dict)

    book_rng = random.Random(SPLIT_SEED)
    for subj in subjkeys:
        # Two distinct books per reader: the first is held out for validation, the second for test.
        valid_book, test_book = book_rng.sample(bookkeys,2)
        for page in pagekeys:
            book = page.split('-')[1]  # page = 'reading-dickens-1'
            if book == valid_book:
                windowData_valid[subj][page] = windowData_train[subj].pop(page)
            elif book == test_book:
                windowData_test[subj][page] = windowData_train[subj].pop(page)

    ## create dataset
    X_train, labels_train = create_dataset(windowData_train, fix_labels)
    X_valid, labels_valid = create_dataset(windowData_valid, fix_labels)
    X_test, labels_test = create_dataset(windowData_test, fix_labels)

    # Show the assignment for the first subject as a sanity check (no hard-coded subject id).
    print("train book #", list(windowData_train[subjkeys[0]].keys()))
    print("valid book #", list(windowData_valid[subjkeys[0]].keys()))
    print("test book #", list(windowData_test[subjkeys[0]].keys()))

list of books: ['dickens', 'flytrap', 'genome', 'northpole']
train book # ['reading-flytrap-1', 'reading-flytrap-2', 'reading-flytrap-3', 'reading-flytrap-4', 'reading-flytrap-5', 'reading-flytrap-6', 'reading-genome-1', 'reading-genome-2', 'reading-genome-3', 'reading-genome-4', 'reading-genome-5', 'reading-genome-6']
valid book # ['reading-northpole-1', 'reading-northpole-2', 'reading-northpole-3', 'reading-northpole-4', 'reading-northpole-5']
test book # ['reading-dickens-1', 'reading-dickens-2', 'reading-dickens-3', 'reading-dickens-4', 'reading-dickens-5']


In [None]:
# Inspect the first training window: one row per fixation, four numeric features
# followed by the embedding vector of the fixated word.
X_train[0]

array([[ 0.32670467,  0.38769279,  0.32254949,  0.07983303,  0.77278   ,
         0.46184   ,  0.96917999,  0.32804   ,  0.23409   ,  0.16616   ,
         0.48489001, -1.61450005, -0.45752001,  0.39432999,  0.63108999,
         0.091311  , -3.28209996,  1.23119998,  0.27107999, -0.35925999,
         0.64537001, -0.11048   ,  0.27353999, -0.588     , -0.38878   ,
        -0.87965   , -0.36660999, -1.19799995, -0.97115999],
       [ 0.32970715,  0.31021633,  0.32303235,  0.02426298,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.25812628,  0.2884277 ,  0.33220666,  0.03156796, -0.24081001,
        -0.25931001, -0.95393997, -0.26898   , -1.36319995, -0.073599  ,
  

In [None]:
# Name of the label column the model is trained to predict.
pred_variable = 'difficulty'

if pred_variable == 'subj':
    ## labels as categorical
    # Subject ids are strings and must become integer class codes. The category list is
    # built once from all three splits; encoding each split on its own would give the
    # same subject different codes whenever the splits contain different subjects.
    subj_dtype = pd.CategoricalDtype(sorted(
        pd.concat([labels_train[pred_variable],
                   labels_valid[pred_variable],
                   labels_test[pred_variable]]).dropna().unique()))
    y_train = labels_train[pred_variable].astype(subj_dtype).cat.codes
    y_valid = labels_valid[pred_variable].astype(subj_dtype).cat.codes
    y_test = labels_test[pred_variable].astype(subj_dtype).cat.codes

else:
    ## every other label column is already numeric and is used as it is
    y_train = labels_train[pred_variable]
    y_valid = labels_valid[pred_variable]
    y_test = labels_test[pred_variable]

In [None]:
# Same inspection of the first training window as in the earlier cell.
X_train[0]

array([[ 0.32670467,  0.38769279,  0.32254949,  0.07983303,  0.77278   ,
         0.46184   ,  0.96917999,  0.32804   ,  0.23409   ,  0.16616   ,
         0.48489001, -1.61450005, -0.45752001,  0.39432999,  0.63108999,
         0.091311  , -3.28209996,  1.23119998,  0.27107999, -0.35925999,
         0.64537001, -0.11048   ,  0.27353999, -0.588     , -0.38878   ,
        -0.87965   , -0.36660999, -1.19799995, -0.97115999],
       [ 0.32970715,  0.31021633,  0.32303235,  0.02426298,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.25812628,  0.2884277 ,  0.33220666,  0.03156796, -0.24081001,
        -0.25931001, -0.95393997, -0.26898   , -1.36319995, -0.073599  ,
  

**DESCRIPTION OF THE DATA**

In [None]:
# Summary statistics of the prepared splits; the names defined here are reused later.
num_classes = len(y_train.unique()) # labels_train[pred_variable].shape (TTTT,)
input_shape = X_train.shape[1:]  # shape of one window, without the sample axis
N_samples_train, N_samples_valid, N_samples_test = len(X_train), len(X_valid), len(X_test)
N_total = N_samples_train + N_samples_valid + N_samples_test

print("##### data description #####")
print("# of classes:\t",num_classes)
print("input shape is:\t",input_shape)
print("# of samples for training is:\t", N_samples_train)
print("# of samples for validation is:\t", N_samples_valid)
print("# of samples for prediction is:\t", N_samples_test)
print("# of total samples:\t", N_total)

# Relative frequency of each class in the training labels.
print("\n##### data imbalances #####")
print(y_train.value_counts(normalize=True).sort_index())

# Null accuracy: the share of the most frequent class in the test labels, i.e. the
# accuracy of always predicting that class.
print("\n##### null acc for test dataset #####")
print(y_test.value_counts(normalize=True).max())


##### data description #####
# of classes:	 2
input shape is:	 (21, 29)
# of samples for training is:	 5843
# of samples for validation is:	 2831
# of samples for prediction is:	 2874
# of total samples:	 11548

##### data imbalances #####
0    0.643334
1    0.356666
Name: difficulty, dtype: float64

##### null acc for test dataset #####
0.6941544885177453


**MODEL TRAINING**

In [None]:
## model specification
# Two stacked bidirectional LSTMs over the fixation sequence, followed by three
# ReLU dense layers (each with dropout) and a softmax classification layer.
modeltype = 'rnn'
model = Sequential()

# First recurrent layer returns the full sequence so the second one can consume it.
model.add(Bidirectional(LSTM(25, return_sequences = True),input_shape=input_shape)) 
model.add(Bidirectional(LSTM(25)))
model.add(Dropout(0.3))

model.add(Dense(50)) 
model.add(Activation('relu'))
#model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(30))
model.add(Activation('relu'))
model.add(Dropout(0.3))

model.add(Dense(10))
model.add(Activation('relu'))
model.add(Dropout(0.3))

# One output unit per class; `num_classes` comes from the data-description cell, so the
# network follows the chosen target instead of assuming a binary label.
model.add(Dense(num_classes, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 21, 50)           11000     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 50)               15200     
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 50)                2550      
                                                                 
 activation (Activation)     (None, 50)                0         
                                                                 
 dropout_1 (Dropout)         (None, 50)                0

In [None]:
# Training configuration
BATCH_SIZE = 100
EPOCHS = 100

# Integer class labels with a softmax output -> sparse categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the training split and report loss/accuracy on the validation split each epoch.
hist = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    verbose=2,
)

Epoch 1/100
59/59 - 12s - loss: 0.6693 - accuracy: 0.6168 - val_loss: 0.6386 - val_accuracy: 0.6595 - 12s/epoch - 204ms/step
Epoch 2/100
59/59 - 3s - loss: 0.6432 - accuracy: 0.6437 - val_loss: 0.6256 - val_accuracy: 0.6595 - 3s/epoch - 48ms/step
Epoch 3/100
59/59 - 3s - loss: 0.6281 - accuracy: 0.6469 - val_loss: 0.6197 - val_accuracy: 0.6595 - 3s/epoch - 44ms/step
Epoch 4/100
59/59 - 3s - loss: 0.6219 - accuracy: 0.6536 - val_loss: 0.6115 - val_accuracy: 0.6503 - 3s/epoch - 45ms/step
Epoch 5/100
59/59 - 3s - loss: 0.6114 - accuracy: 0.6676 - val_loss: 0.6213 - val_accuracy: 0.6227 - 3s/epoch - 47ms/step
Epoch 6/100
59/59 - 3s - loss: 0.6024 - accuracy: 0.6803 - val_loss: 0.6093 - val_accuracy: 0.6535 - 3s/epoch - 45ms/step
Epoch 7/100
59/59 - 3s - loss: 0.6031 - accuracy: 0.6791 - val_loss: 0.6115 - val_accuracy: 0.6394 - 3s/epoch - 45ms/step
Epoch 8/100
59/59 - 3s - loss: 0.5972 - accuracy: 0.6878 - val_loss: 0.6123 - val_accuracy: 0.6362 - 3s/epoch - 47ms/step
Epoch 9/100
59/59 - 3

**TESTING THE MODEL ON TEST DATA**

In [None]:
# Model outputs for the held-out test windows: one row of softmax scores per window.
pred = model.predict(X_test)

In [None]:
# Turn each pair of class scores into a hard label: class 0 only when its score is
# strictly larger than the score of class 1, otherwise class 1.
predictions = [0 if p[0] > p[1] else 1 for p in pred]

In [None]:
# Display the true test labels; the index is inherited from the label table, so it repeats.
y_test

0      0
0      0
0      0
0      0
0      0
      ..
376    0
376    0
376    0
376    0
376    0
Name: difficulty, Length: 2874, dtype: int64

In [None]:
# Share of test windows whose predicted label equals the true label.
print(accuracy_score(y_test,predictions))

0.6544885177453027


In [None]:
# Full evaluation on the test split. The header is built from `pred_variable` so the
# report always names the target that was actually trained on.
print('predicted variable : {}\n'.format(pred_variable))
print('Test Accuracy :',accuracy_score(y_test,predictions),'\n')
print(classification_report(y_test,predictions))
# Rows are true classes, columns are predicted classes.
print('Confusion Matrix:')
print(confusion_matrix(y_test,predictions))

predicted variable : difficulty

Test Accuracy : 0.6544885177453027 

              precision    recall  f1-score   support

           0       0.77      0.72      0.74      1995
           1       0.44      0.51      0.47       879

    accuracy                           0.65      2874
   macro avg       0.61      0.61      0.61      2874
weighted avg       0.67      0.65      0.66      2874

Confusion Matrix:
[[1437  558]
 [ 435  444]]


In [None]:
# NOTE(review): `sns` is not imported in any cell visible in this notebook (it is the
# conventional alias for seaborn). On a fresh kernel this line raises NameError unless
# `import seaborn as sns` is added to the import cell -- confirm and add it there.
sns.set_style('white')


In [None]:
# Confusion matrix of the test predictions (rows: true class, columns: predicted class).
confusion_matrix(y_test,predictions)

array([[1437,  558],
       [ 435,  444]])

In [None]:
import gensim.downloader

In [None]:
# NOTE(review): this repeats the embedding load done near the top of the notebook and
# rebinds the same name; it looks like leftover exploration and can probably be removed.
glove_vectors = gensim.downloader.load('glove-twitter-25')



In [None]:
# Sanity check: length of the embedding vector for a known word.
len(glove_vectors['most'])

25

In [None]:
# Same check for a second word; every vector has the same length.
len(glove_vectors['simple'])

25

In [None]:
# Nearest neighbours of the token 'k' in the embedding space, returned as (word, similarity) pairs.
glove_vectors.most_similar('k')

[('tb', 0.9281424283981323),
 ('p', 0.9069650173187256),
 ('c', 0.868862509727478),
 ('cm', 0.8653689622879028),
 ('sl', 0.8594229221343994),
 ('kk', 0.8575681447982788),
 ('s', 0.8554211854934692),
 ('e', 0.8499281406402588),
 ('ta', 0.8475498557090759),
 ('vo', 0.8453908562660217)]