In [1]:
import numpy as np 
from validation import compute_f1
from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate
from prepro import readfile,createBatches,createMatrices,iterate_minibatches,addCharInformatioin,padding
from keras.utils import plot_model,Progbar
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import sklearn.metrics

epochs = 50

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def tag_dataset(dataset):
    correctLabels = []
    predLabels = []
    b = Progbar(len(dataset))
    for i,data in enumerate(dataset):    
        tokens, casing, labels = data
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
#         char = np.asarray([char])
        try:
            pred = model.predict([tokens, casing], verbose=False)[0]   
            pred = pred.argmax(axis=-1) #Predict the classes            
            correctLabels.append(labels)
            predLabels.append(pred)
        except Exception as e:
            continue
        b.update(i)
    return predLabels, correctLabels

def createMatrices(sentences, word2Idx, label2Idx, case2Idx):
    #{'numeric': 0, 'allLower': 1, 'contains_digit': 6, 'PADDING_TOKEN': 7, 'other': 4, 'allUpper': 2, 'mainly_numeric': 5, 'initialUpper': 3}
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
        
    dataset = []
    
    wordCount = 0
    unknownWordCount = 0
    
    for sentence in sentences:
        wordIndices = []    
        caseIndices = []
#         charIndices = []
        labelIndices = []
        
        for word,label in sentence:  
            wordCount += 1
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]                 
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1
#             charIdx = []
#             for x in char:
#                 charIdx.append(char2Idx[x])
            #Get the label and map to int            
            wordIndices.append(wordIdx)
            caseIndices.append(getCasing(word, case2Idx))
#             charIndices.append(charIdx)
            labelIndices.append(label2Idx[label])
           
        dataset.append([wordIndices, caseIndices, labelIndices]) 
        
    return dataset

def getCasing(word, caseLookup):   
    casing = 'other'
    
    numDigits = 0
    for char in word:
        if char.isdigit():
            numDigits += 1
            
    digitFraction = numDigits / float(len(word))
    
    if word.isdigit(): #Is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower(): #All lower case
        casing = 'allLower'
    elif word.isupper(): #All upper case
        casing = 'allUpper'
    elif word[0].isupper(): #is a title, initial char upper, then all lower
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'
    
   
    return caseLookup[casing]

def iterate_minibatches(dataset,batch_len): 
    start = 0
    for i in batch_len:
        tokens = []
        caseing = []
#         char = []
        labels = []
        data = dataset[start:i]
        start = i
        for dt in data:
            t,c,l = dt
            l = np.expand_dims(l,-1)
            tokens.append(t)
            caseing.append(c)
#             char.append(ch)
            labels.append(l)
        yield np.asarray(labels),np.asarray(tokens),np.asarray(caseing)

In [3]:
trainSentences = readfile("data/train.txt")
devSentences = readfile("data/valid.txt")
testSentences = readfile("data/test.txt")
print(trainSentences[0])

[['EU', 'B-ORG\n'], ['rejects', 'O\n'], ['German', 'B-MISC\n'], ['call', 'O\n'], ['to', 'O\n'], ['boycott', 'O\n'], ['British', 'B-MISC\n'], ['lamb', 'O\n'], ['.', 'O\n']]


In [4]:
# trainSentences = addCharInformatioin(trainSentences)
# devSentences = addCharInformatioin(devSentences)
# testSentences = addCharInformatioin(testSentences)

In [5]:
print(trainSentences[0])

[['EU', 'B-ORG\n'], ['rejects', 'O\n'], ['German', 'B-MISC\n'], ['call', 'O\n'], ['to', 'O\n'], ['boycott', 'O\n'], ['British', 'B-MISC\n'], ['lamb', 'O\n'], ['.', 'O\n']]


In [6]:
labelSet = set()
words = {}

for dataset in [trainSentences, devSentences, testSentences]:
    for sentence in dataset:
        for token,label in sentence:
            labelSet.add(label)
            words[token.lower()] = True

In [7]:
print(labelSet)

{'B-ORG\n', 'I-PER\n', 'B-MISC\n', 'I-LOC\n', 'O\n', 'B-LOC\n', 'B-PER\n', 'I-ORG\n', 'I-MISC\n'}


In [8]:
print(words)



In [9]:
# :: Create a mapping for the labels ::
label2Idx = {}
for label in labelSet:
    label2Idx[label] = len(label2Idx)

In [10]:
print(label2Idx)

{'B-ORG\n': 0, 'I-PER\n': 1, 'O\n': 4, 'B-PER\n': 6, 'B-LOC\n': 5, 'B-MISC\n': 2, 'I-MISC\n': 8, 'I-ORG\n': 7, 'I-LOC\n': 3}


In [11]:
# :: Hard coded case lookup ::
case2Idx = {'numeric': 0, 'allLower':1, 'allUpper':2, 'initialUpper':3, 'other':4, 'mainly_numeric':5, 'contains_digit': 6, 'PADDING_TOKEN':7}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

In [12]:
print(caseEmbeddings)
print(case2Idx)

[[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]
{'contains_digit': 6, 'PADDING_TOKEN': 7, 'allLower': 1, 'numeric': 0, 'allUpper': 2, 'mainly_numeric': 5, 'other': 4, 'initialUpper': 3}


In [13]:
# :: Read in word embeddings ::
word2Idx = {}
wordEmbeddings = []

fEmbeddings = open("embeddings/glove.6B.100d.txt", encoding="utf-8")
# 6 billion words. Pretrained. Each with 100 dimensional word embedding. eg
'''
    the 0.04656 0.21318 -0.0074364 -0.45854 -0.035639 0.23643 -0.28836 0.21521 -0.13486 -1.6413 -0.26091 0.032434 0.056621 -0.043296 -0.021672 0.22476 -0.075129 -0.067018 -0.14247 0.038825 -0.18951 0.29977 0.39305 0.17887 -0.17343 -0.21178 0.23617 -0.063681 -0.42318 -0.11661 0.093754 0.17296 -0.33073 0.49112 -0.68995 -0.092462 0.24742 -0.17991 0.097908 0.083118 0.15299 -0.27276 -0.038934 0.54453 0.53737 0.29105 -0.0073514 0.04788 -0.4076 -0.026759 0.17919 0.010977 -0.10963 -0.26395 0.07399 0.26236 -0.1508 0.34623 0.25758 0.11971 -0.037135 -0.071593 0.43898 -0.040764 0.016425 -0.4464 0.17197 0.046246 0.058639 0.041499 0.53948 0.52495 0.11361 -0.048315 -0.36385 0.18704 0.092761 -0.11129 -0.42085 0.13992 -0.39338 -0.067945 0.12188 0.16707 0.075169 -0.015529 -0.19499 0.19638 0.053194 0.2517 -0.34845 -0.10638 -0.34692 -0.19024 -0.2004 0.12154 -0.29208 0.023353 -0.11618 -0.35768 0.062304 0.35884 0.02906 0.0073005 0.0049482 -0.15048 -0.12313 0.19337 0.12173 0.44503 0.25147 0.10781 -0.17716 0.038691 0.08153 0.14667 0.063666 0.061332 -0.075569 -0.37724 0.01585 -0.30342 0.28374 -0.042013 -0.040715 -0.15269 0.07498 0.15577 0.10433 0.31393 0.19309 0.19429 0.15185 -0.10192 -0.018785 0.20791 0.13366 0.19038 -0.25558 0.304 -0.01896 0.20147 -0.4211 -0.0075156 -0.27977 -0.19314 0.046204 0.19971 -0.30207 0.25735 0.68107 -0.19409 0.23984 0.22493 0.65224 -0.13561 -0.17383 -0.048209 -0.1186 0.0021588 -0.019525 0.11948 0.19346 -0.4082 -0.082966 0.16626 -0.10601 0.35861 0.16922 0.07259 -0.24803 -0.10024 -0.52491 -0.17745 -0.36647 0.2618 -0.012077 0.08319 -0.21528 0.41045 0.29136 0.30869 0.078864 0.32207 -0.041023 -0.1097 -0.092041 -0.12339 -0.16416 0.35382 -0.082774 0.33171 -0.24738 -0.048928 0.15746 0.18988 -0.026642 0.063315 -0.010673 0.34089 1.4106 0.13417 0.28191 -0.2594 0.055267 -0.052425 -0.25789 0.019127 -0.022084 0.32113 0.068818 0.51207 0.16478 -0.20194 0.29232 0.098575 0.013145 -0.10652 0.1351 -0.045332 0.20697 -0.48425 -0.44706 0.0033305 0.0029264 -0.10975 -0.23325 0.22442 -0.10503 0.12339 0.10978 0.048994 -0.25157 0.40319 0.35318 0.18651 -0.023622 -0.12734 0.11475 0.27359 -0.21866 0.015794 0.81754 -0.023792 -0.85469 -0.16203 0.18076 0.028014 -0.1434 0.0013139 -0.091735 -0.089704 0.11105 -0.16703 0.068377 -0.087388 -0.039789 0.014184 0.21187 0.28579 -0.28797 -0.058996 -0.032436 -0.0047009 -0.17052 -0.034741 -0.11489 0.075093 0.099526 0.048183 -0.073775 -0.41817 0.0041268 0.44414 -0.16062 0.14294 -2.2628 -0.027347 0.81311 0.77417 -0.25639 -0.11576 -0.11982 -0.21363 0.028429 0.27261 0.031026 0.096782 0.0067769 0.14082 -0.013064 -0.29686 -0.079913 0.195 0.031549 0.28506 -0.087461 0.0090611 -0.20989 0.053913
    of -0.076947 -0.021211 0.21271 -0.72232 -0.13988 -0.12234 -0.17521 0.12137 -0.070866 -1.5721 -0.22464 0.04269 -0.4018 0.21006 0.014288 0.41628 0.017165 0.071732 0.0069246 0.18107 -0.15412 0.14933 -0.030493 0.29918 0.029479 -0.036147 -0.061125 0.083918 -0.12398 -0.10077 -0.0054142 0.3371 -0.25612 0.44388 -0.68922 0.1802 0.34898 -0.052284 -0.26226 -0.47109 0.21647 -0.4002 -0.049986 0.011376 0.54994 -0.22791 0.095873 0.47693 -0.056727 -0.17895 0.11756 0.14662 0.048948 0.13587 -0.093821 0.45968 -0.32062 0.29911 0.20656 -0.18503 -0.2769 -0.022545 0.70698 -0.23815 0.16437 -0.55044 -0.0010615 0.12266 0.11898 0.23985 0.29815 0.013207 0.16316 -0.61334 -0.37051 0.19444 -0.13621 -0.30426 -0.37715 0.065299 -0.15995 -0.56516 0.074696 0.40184 0.19328 0.041802 0.20572 0.28971 0.34783 0.33873 -0.10052 -0.16397 -0.15236 -0.086815 0.36522 0.14969 -0.40859 0.23106 0.17162 -0.60545 0.086019 0.37043 0.17937 -0.40282 -0.62471 -0.055919 0.15092 0.12554 -0.45344 0.34417 0.40042 -0.049512 -0.29969 -0.31761 0.30023 0.090029 0.3106 -0.033077 -0.21995 -0.40396 -0.34443 -0.21248 -0.37636 0.21835 -0.1785 -0.17261 0.16391 0.22753 0.2686 0.57541 -0.14912 0.20413 0.22187 -0.27014 0.068253 0.29115 -0.067943 0.10623 -0.16281 0.19939 -0.48613 0.035688 -0.12373 0.13707 0.33359 -0.12713 -0.31711 -0.13962 -0.04288 -0.0014614 0.76883 -0.41705 -0.092911 0.16315 0.29202 0.12119 -0.076683 0.14131 -0.093406 -0.042796 0.13738 0.014278 0.11918 -0.34215 -0.19076 -0.12499 0.24648 0.42259 0.091966 0.45351 0.14437 0.1878 -0.85876 0.059621 -0.32242 0.28627 0.12427 0.0090984 -0.1891 0.16638 0.099881 -0.048553 -0.026257 0.099904 0.12406 -0.015416 -0.29707 -0.4044 -0.17258 0.36468 -0.014118 -0.11889 -0.11686 -0.14124 0.28012 0.067644 0.1485 -0.35702 0.29626 0.36004 1.019 -0.067307 -0.11588 -0.2178 0.070191 0.23154 -0.13849 0.26441 0.28742 0.1941 -0.0060504 0.44105 0.12416 -0.27745 -0.25729 0.10992 0.18362 -0.34522 -0.21861 -0.18825 -0.037454 -0.20862 -0.25216 0.060842 0.068595 0.10275 0.10745 -0.061288 0.19725 -0.27739 -0.022559 0.052794 -0.24083 0.09199 0.30959 0.054999 0.063676 -0.087357 -0.34495 0.22793 -0.42405 0.24536 0.55708 0.19126 -0.797 -0.2048 0.32545 0.09235 0.084791 -0.16433 -0.066568 -0.099249 0.31526 -0.44465 0.087281 0.3288 -0.017809 -0.23855 -0.12848 0.041509 0.46728 0.48214 0.10548 0.065805 0.067221 0.13321 -0.27856 0.015532 0.30026 0.38748 -0.14401 -0.16131 0.17678 0.16448 -0.3244 0.007937 -2.2836 0.096945 0.66131 0.16857 -0.028877 -0.10791 -0.027445 -0.25695 0.046686 0.23087 -0.076458 0.27127 0.25185 0.054947 -0.36673 -0.38603 0.3029 0.015747 0.34036 0.47841 0.068617 0.18351 -0.29183 -0.046533

'''

for line in fEmbeddings:
    split = line.strip().split(" ")
    word = split[0]
    
    if len(word2Idx) == 0: #Add padding+unknown
        word2Idx["PADDING_TOKEN"] = len(word2Idx)
        vector = np.zeros(len(split)-1) #Zero vector vor 'PADDING' word
        wordEmbeddings.append(vector)

        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split)-1)
        wordEmbeddings.append(vector)

    if split[0].lower() in words:
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[split[0]] = len(word2Idx)
        
wordEmbeddings = np.array(wordEmbeddings)

In [14]:
print(word2Idx)




In [15]:
print(wordEmbeddings[2]) # gives the word embedding(100 dimensional corresponding to every word. The index in wordEmbeddings 
                         # corresponds to the index of the word in word to id.
print(len(wordEmbeddings[0]))

[-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.073438 -0.28312   0.37104  -0.25217   0.016215
 -0.017099 -0.38984   0.87424  -0.72569  -0.51058  -0.52028  -0.1459
  0.8278    0.27062 ]

In [16]:
# char2Idx = {"PADDING":0, "UNKNOWN":1}
# for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
#     char2Idx[c] = len(char2Idx)

In [17]:
# print(char2Idx)

In [18]:
print(trainSentences[0])

[['EU', 'B-ORG\n'], ['rejects', 'O\n'], ['German', 'B-MISC\n'], ['call', 'O\n'], ['to', 'O\n'], ['boycott', 'O\n'], ['British', 'B-MISC\n'], ['lamb', 'O\n'], ['.', 'O\n']]


In [19]:
# createMatrices: for every sentence, changes its words, cases,characters, labels to its corresponding id in their embeddings
# padding is used to pad the character indices to a fixed size=52
train_set = createMatrices(trainSentences,word2Idx,  label2Idx, case2Idx)
dev_set = createMatrices(devSentences,word2Idx, label2Idx, case2Idx)
test_set = createMatrices(testSentences, word2Idx, label2Idx, case2Idx)

In [20]:
# train-set[0]: corresponds to the ids of the words in the sentence
# train_set[1]: corresponds to the ids of the cases of the words
# train_set[2]: contains numpy arrays, one corresponding to every word, each containing the indices of the characters of that word
# the numpy arrays have a fixed size (padding or truncation) to 52
# train_set[3]: corresponds to the ids of the labels of every word

print(train_set[0])
print(len(train_set[0][0]))
print(len(train_set[0][2]))

[[641, 6732, 512, 578, 6, 4940, 295, 8353, 4], [2, 1, 3, 1, 1, 1, 3, 1, 4], [0, 4, 2, 4, 4, 4, 2, 4, 4]]
9
9


In [21]:
idx2Label = {v: k for k, v in label2Idx.items()}

train_batch,train_batch_len = createBatches(train_set)
dev_batch,dev_batch_len = createBatches(dev_set)
test_batch,test_batch_len = createBatches(test_set)

In [22]:
words_input = Input(shape=(None,),dtype='int32',name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],  weights=[wordEmbeddings], trainable=False)(words_input)
casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input)
# character_input=Input(shape=(None,52,),name='char_input')
# embed_char_out=TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
# dropout= Dropout(0.5)(embed_char_out)
# conv1d_out= TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout)
# maxpool_out=TimeDistributed(MaxPooling1D(52))(conv1d_out)
# char = TimeDistributed(Flatten())(maxpool_out)
# char = Dropout(0.5)(char)
output = concatenate([words, casing,])
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)
model = Model(inputs=[words_input, casing_input,], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()
plot_model(model, to_file='model.png')


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words_input (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
casing_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    2294900     words_input[0][0]                
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 8)      64          casing_input[0][0]               
__________________________________________________________________________________________________
concatenat

In [23]:
for epoch in range(epochs):    
    print("Epoch %d/%d"%(epoch,epochs))
    a = Progbar(len(train_batch_len))
    for i,batch in enumerate(iterate_minibatches(train_batch,train_batch_len)):
        labels, tokens, casing = batch       
        model.train_on_batch([tokens, casing], labels)
        a.update(i)
        print(' ')

Epoch 0/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 1:55 
 
 3/64 [>.............................] - ETA: 39s  
 4/64 [>.............................] - ETA: 30s 
 5/64 [=>............................] - ETA: 24s 
 6/64 [=>............................] - ETA: 21s 
 7/64 [==>...........................] - ETA: 19s 
 8/64 [==>...........................] - ETA: 17s 
 9/64 [===>..........................] - ETA: 15s 
10/64 [===>..........................] - ETA: 14s 
11/64 [====>.........................] - ETA: 12s 
12/64 [====>.........................] - ETA: 12s 
13/64 [=====>........................] - ETA: 11s 
14/64 [=====>........................] - ETA: 10s 
Epoch 1/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 7s 
 2/64 [..............................] - ETA: 5s 
 3/64 [>.............................] - ETA: 5s 
 4/64 [>.............................] - ETA: 5s 
 5/64 [=>..

Epoch 3/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 4/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>...............

10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 3s 
Epoch 6/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 3s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 3s 
Epoch 7/50
 0/64 [...................

Epoch 8/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 3s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 3s 
Epoch 9/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>...............

Epoch 11/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 3s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 12/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

Epoch 13/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 14/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

Epoch 16/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 3s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 17/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 19/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 3s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>........................

Epoch 21/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 3s 
Epoch 22/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

Epoch 24/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 3s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 3s 
Epoch 25/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

Epoch 26/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 3s 
Epoch 27/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

Epoch 29/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 30/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 5s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 32/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>........................

Epoch 34/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 3s 
Epoch 35/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 3s 
Epoch 37/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 38/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>..............

Epoch 39/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 3s 
Epoch 40/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

Epoch 42/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 43/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

Epoch 44/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 45/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

Epoch 47/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 5s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>............................] - ETA: 4s 
 7/64 [==>...........................] - ETA: 4s 
 8/64 [==>...........................] - ETA: 4s 
 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 
Epoch 48/50
 0/64 [..............................] - ETA: 0s 
 1/64 [..............................] - ETA: 4s 
 
 3/64 [>.............................] - ETA: 3s 
 4/64 [>.............................] - ETA: 3s 
 5/64 [=>............................] - ETA: 3s 
 6/64 [=>.............

 9/64 [===>..........................] - ETA: 4s 
10/64 [===>..........................] - ETA: 4s 
11/64 [====>.........................] - ETA: 4s 
12/64 [====>.........................] - ETA: 4s 
13/64 [=====>........................] - ETA: 4s 
14/64 [=====>........................] - ETA: 4s 


In [24]:
import collections
# a = [1,1,1,1,2,2,2,2,3,3,4,5,5]
predLabels, correctLabels = tag_dataset(dev_batch)   
correctLabels = np.concatenate(correctLabels).ravel()
counter=collections.Counter(correctLabels)




In [25]:
print(counter)

Counter({4: 42759, 6: 1842, 5: 1837, 0: 1341, 1: 1307, 2: 922, 7: 751, 8: 346, 3: 257})


In [26]:
predLabels, correctLabels = tag_dataset(dev_batch)        
predLabels = np.concatenate(predLabels).ravel()
correctLabels = np.concatenate(correctLabels).ravel()
print(idx2Label)



In [27]:
print(sklearn.metrics.f1_score(correctLabels,predLabels,average='macro' ))
print(sklearn.metrics.f1_score(correctLabels, predLabels, average='micro'))
print(sklearn.metrics.f1_score(correctLabels, predLabels, average='weighted'))
print(sklearn.metrics.f1_score(correctLabels, predLabels, average=None))

0.9262071673213702
0.9871110938047584
0.9870535396563499
[0.92330715 0.9741051  0.89962426 0.90944882 0.99669188 0.96302338
 0.97039828 0.89014883 0.80911681]


In [28]:
#   Performance on test dataset       
predLabels, correctLabels = tag_dataset(test_batch)        
predLabels = np.concatenate(predLabels).ravel()
correctLabels = np.concatenate(correctLabels).ravel()

print(sklearn.metrics.f1_score(correctLabels,predLabels,average='macro' ))
print(sklearn.metrics.f1_score(correctLabels, predLabels, average='micro'))
print(sklearn.metrics.f1_score(correctLabels, predLabels, average='weighted'))
print(sklearn.metrics.f1_score(correctLabels, predLabels, average=None))

0.9782276300204588
0.9784922822610496
[0.88983051 0.97903295 0.81143635 0.83544304 0.99377989 0.9263658
 0.95465839 0.86068111 0.69456067]


In [29]:
model.save('conll_without_char.h5')