In [1]:
import tensorflow as tf
import keras

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
import os, csv, math, codecs

import spacy

Using TensorFlow backend.


#### Data loading

Read training, dev and validation data. Dataset are in below format

document id | sentence number | word | NER tag


In [2]:
train = pd.read_csv("../data/subtask1_train.csv")
val = pd.read_csv("../data/subtask1_dev.csv")
test = pd.read_csv('../data/subtask1_test.csv')

In [3]:
train['doc_sent'] = train.apply(lambda x: "{}_{}".format(str(x['document']),str(x['sentence'])), axis=1)
val['doc_sent'] = val.apply(lambda x: "{}_{}".format(str(x['document']),str(x['sentence'])), axis=1)
test['doc_sent'] = test.apply(lambda x: "{}_{}".format(str(x['document']),str(x['sentence'])), axis=1)

In [4]:
train = train[train.tag != 'NULL']
val = val[val.tag != 'NULL']
train = train[pd.notnull(train.tag)]
val = val[pd.notnull(val.tag)]

In [5]:
train.tag.value_counts()

OTHER               93482
NORMALIZABLES        2174
PROTEINAS            1501
UNCLEAR                68
NO_NORMALIZABLES       19
Name: tag, dtype: int64

In [6]:
val.tag.value_counts()

OTHER               47127
NORMALIZABLES        1048
PROTEINAS             836
UNCLEAR                27
NO_NORMALIZABLES       13
Name: tag, dtype: int64

In [7]:
import re
print (train.shape, val.shape)
#train['word'] = train['word'].apply(lambda x: re.sub(r'[^\w]','',x))
#train['wordlen'] = train.word.apply(lambda x: len(x))
#train = train[train.wordlen >= 2]
#print (train.shape)
#val['word'] = val['word'].apply(lambda x: re.sub(r'[^\w]','',x))
#val['wordlen'] = val.word.apply(lambda x: len(x))
#val = val[val.wordlen >= 2]
print (val.shape)
test.replace(np.nan,'',inplace=True)
#test['word'] = test['word'].apply(lambda x: x.lower())
#test['word'] = test['word'].apply(lambda x: re.sub(r'[^\w]','',str(x)))
#test['wordlen'] = test.word.apply(lambda x: len(x))
#test = test[test.wordlen >= 2]
print (test.shape)

(97244, 6) (49051, 8)
(49051, 8)
(872149, 7)


In [8]:
train.head(3)

Unnamed: 0,document,sentence,word,tag,pos,doc_sent
0,S0004-06142008000100008-1,0,presentamos,OTHER,VERB,S0004-06142008000100008-1_0
1,S0004-06142008000100008-1,0,caso,OTHER,NOUN,S0004-06142008000100008-1_0
2,S0004-06142008000100008-1,0,mujer,OTHER,NOUN,S0004-06142008000100008-1_0


In [9]:
test.head(5)

Unnamed: 0,document,sentence,n1,n2,word,pos,doc_sent
0,S0004-06142008000100008-1,0,0,11,presentamos,VERB,S0004-06142008000100008-1_0
1,S0004-06142008000100008-1,0,15,19,caso,NOUN,S0004-06142008000100008-1_0
2,S0004-06142008000100008-1,0,27,32,mujer,NOUN,S0004-06142008000100008-1_0
3,S0004-06142008000100008-1,0,36,38,30,NUM,S0004-06142008000100008-1_0
4,S0004-06142008000100008-1,0,39,43,años,NOUN,S0004-06142008000100008-1_0


In [10]:
test = pd.merge(test,pd.concat([train,val],axis=0),how='left')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [11]:
test.tag.value_counts()

OTHER               47127
NORMALIZABLES        1048
PROTEINAS             836
UNCLEAR                27
NO_NORMALIZABLES       13
Name: tag, dtype: int64

In [12]:
print (len(test[pd.notnull(test.tag)]), len(test))

49051 872149


In [13]:
test.word.nunique()

80159

In [14]:
all_chars = list(set(list(" ".join(test.word))))
print (len(all_chars))

143


In [15]:
BATCH_SIZE = 32
EPOCHS = 20
MAX_LEN = 300
EMBEDDING = 300
MAX_NB_WORDS = 80000
n_tags = train.tag.nunique()
words = list(set(test.word))
n_words = len(words)
n_chars = len(all_chars)
print (n_words, n_tags, n_chars)

80159 5 143


In [16]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, BatchNormalization, GRU, CuDNNLSTM, CuDNNGRU
from keras.layers import Conv1D, Conv2D, Concatenate, Dropout, MaxPooling1D, MaxPooling2D, Flatten, GlobalAveragePooling1D
from keras_contrib.layers import CRF
from keras.callbacks import EarlyStopping, ModelCheckpoint

#### Data preprocessing for LSTM model

Convert the input sentences into sequence of words with maximum length as 300. For outputs we one hot encode. Additionally, we add 'PAD' to shorter input texts, as well as in the outputs for TAGs.

In [17]:
#words = list(set(train.word))
tags = list(set(train.tag))

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding

# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}

# Vocabulary Key:Label/Tag -> Value:tag_index
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}

In [18]:
# Convert each sentence from list of Token to list of word_index
trainX_word = [[word2idx[w] for w in list(train[train.doc_sent == s].word)] for s in train.doc_sent.unique()]
# Padding each sentence to have the same lenght
trainX_word = pad_sequences(maxlen=MAX_LEN, sequences=trainX_word, padding="post", value=word2idx["PAD"])

valX_word = [[word2idx.get(w,1) for w in list(val[val.doc_sent == s].word)] for s in val.doc_sent.unique()]
# Padding each sentence to have the same lenght
valX_word = pad_sequences(maxlen=MAX_LEN, sequences=valX_word, padding="post", value=word2idx["PAD"])

trainy = [[tag2idx[w] for w in list(train[train.doc_sent == s].tag)] for s in train.doc_sent.unique()]
# Padding each sentence to have the same lenght
trainy = pad_sequences(maxlen=MAX_LEN, sequences=trainy, padding="post", value=word2idx["PAD"])

valy = [[tag2idx[w] for w in list(val[val.doc_sent == s].tag)] for s in val.doc_sent.unique()]
# Padding each sentence to have the same lenght
valy = pad_sequences(maxlen=MAX_LEN, sequences=valy, padding="post", value=word2idx["PAD"])

# One-Hot encode
trainy = [to_categorical(i, num_classes=n_tags+1) for i in trainy]  # n_tags+1(PAD)
valy = [to_categorical(i, num_classes=n_tags+1) for i in valy]  # n_tags+1(PAD)

print (np.array(trainX_word).shape, np.array(valX_word).shape, np.array(trainy).shape, np.array(valy).shape)

(2315, 300) (1086, 300) (2315, 300, 6) (1086, 300, 6)


In [19]:
from keras.engine.topology import Layer
import keras.backend as K

class Position_Embedding(Layer):
    
    def __init__(self, size=None, mode='sum', **kwargs):
        self.size = size
        self.mode = mode
        super(Position_Embedding, self).__init__(**kwargs)
        
    def call(self, x):
        if (self.size == None) or (self.mode == 'sum'):
            self.size = int(x.shape[-1])
        batch_size,seq_len = K.shape(x)[0],K.shape(x)[1]
        position_j = 1. / K.pow(10000., \
                                 2 * K.arange(self.size / 2, dtype='float32' \
                               ) / self.size)
        position_j = K.expand_dims(position_j, 0)
        position_i = K.cumsum(K.ones_like(x[:,:,0]), 1)-1 
        position_i = K.expand_dims(position_i, 2)
        position_ij = K.dot(position_i, position_j)
        position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
        if self.mode == 'sum':
            return position_ij + x
        elif self.mode == 'concat':
            return K.concatenate([position_ij, x], 2)
        
    def compute_output_shape(self, input_shape):
        if self.mode == 'sum':
            return input_shape
        elif self.mode == 'concat':
            return (input_shape[0], input_shape[1], input_shape[2]+self.size)


'''
output dimention: [batch_size, time_step, nb_head*size_per_head]
every word can be represented as a vector [nb_head*size_per_head]
'''
class Attention(Layer):

    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.output_dim = nb_head*size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.WQ = self.add_weight(name='WQ', 
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK', 
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV', 
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)
        
    def Mask(self, inputs, seq_len, mode='mul'):
        if seq_len == None:
            return inputs
        else:
            mask = K.one_hot(seq_len[:,0], K.shape(inputs)[1])
            mask = 1 - K.cumsum(mask, 1)
            for _ in range(len(inputs.shape)-2):
                mask = K.expand_dims(mask, 2)
            if mode == 'mul':
                return inputs * mask
            if mode == 'add':
                return inputs - (1 - mask) * 1e12
                
    def call(self, x):
        if len(x) == 3:
            Q_seq,K_seq,V_seq = x
            Q_len,V_len = None,None
        elif len(x) == 5:
            Q_seq,K_seq,V_seq,Q_len,V_len = x
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = K.permute_dimensions(V_seq, (0,2,1,3))
        A = K.batch_dot(Q_seq, K_seq, axes=[3,3]) / self.size_per_head**0.5
        A = K.permute_dimensions(A, (0,3,2,1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0,3,2,1))    
        A = K.softmax(A)
        O_seq = K.batch_dot(A, V_seq, axes=[3,2])
        O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq
        
    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.output_dim)

In [20]:
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('/Users/victor/Documents/Models/cc.es.300.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

loading word embeddings...

747it [00:00, 7464.76it/s]




2000001it [05:25, 6138.48it/s]


In [21]:
print('preparing embedding matrix...')
EMBEDDING = 300
words_not_found = []
embedding_matrix = np.random.uniform(low=-.25,high=.25,size=(n_words+2, EMBEDDING))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print (len(words_not_found))

preparing embedding matrix...
42579


In [23]:
# Model definition
input_word = Input(shape=(MAX_LEN,))
out_word = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=False)(input_word)  # default: 20-dim embedding


#lstm1 = Bidirectional(LSTM(units=150, return_sequences=True))(out_word)  # variational biLSTM
#conv1 = Conv1D(kernel_size=3, filters=300, padding='same', strides=1)(out_word)
#conv2 = Conv1D(kernel_size=5, filters=300, padding='same', strides=1)(out_word)
#conv3 = Conv1D(kernel_size=7, filters=300, padding='same', strides=1)(out_word)

#model = Concatenate()([lstm1,conv1,conv2,conv3])

#model = Bidirectional(LSTM(units=150, return_sequences=True))(model)  # variational biLSTM
#model = Conv1D(kernel_size=3, filters=300, padding='same', strides=1)(model)

model = Bidirectional(LSTM(units=100, return_sequences=True))(out_word)

model = Position_Embedding()(model)
model = Attention(nb_head=10,size_per_head=10)([model,model,model])

#model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
#model = BatchNormalization()(model)
#model = Bidirectional(LSTM(units=100, activation='relu', return_sequences=True))(model)
model = TimeDistributed(Dense(50))(model)

encoding = Model(input_word, model)

crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output

model = Model(input_word, out)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 300, 300)     24048300    input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 300, 200)     320800      embedding_2[0][0]                
__________________________________________________________________________________________________
position__embedding_2 (Position (None, 300, 200)     0           bidirectional_2[0][0]            
__________________________________________________________________________________________________
attention_

In [24]:
EPOCHS=30
#reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.2,
#                              patience=3, min_lr=0.005)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0002, 
                                           patience=5, verbose=0, mode='min')
history = model.fit(np.array(trainX_word), np.array(trainy), batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=2,
                    validation_data=(np.array(valX_word),np.array(valy)),
                   callbacks = [early_stop])

Instructions for updating:
Use tf.cast instead.
Train on 2315 samples, validate on 1086 samples
Epoch 1/30
 - 85s - loss: 0.2415 - crf_viterbi_accuracy: 0.9239 - val_loss: 0.0365 - val_crf_viterbi_accuracy: 0.9930
Epoch 2/30
 - 84s - loss: 0.0293 - crf_viterbi_accuracy: 0.9942 - val_loss: 0.0298 - val_crf_viterbi_accuracy: 0.9939
Epoch 3/30
 - 84s - loss: 0.0270 - crf_viterbi_accuracy: 0.9944 - val_loss: 0.0275 - val_crf_viterbi_accuracy: 0.9940
Epoch 4/30
 - 82s - loss: 0.0206 - crf_viterbi_accuracy: 0.9944 - val_loss: 0.0199 - val_crf_viterbi_accuracy: 0.9938
Epoch 5/30
 - 80s - loss: 0.0120 - crf_viterbi_accuracy: 0.9955 - val_loss: 0.0146 - val_crf_viterbi_accuracy: 0.9957
Epoch 6/30
 - 87s - loss: 0.0068 - crf_viterbi_accuracy: 0.9971 - val_loss: 0.0143 - val_crf_viterbi_accuracy: 0.9962
Epoch 7/30
 - 82s - loss: 0.0046 - crf_viterbi_accuracy: 0.9982 - val_loss: 0.0171 - val_crf_viterbi_accuracy: 0.9964
Epoch 8/30
 - 81s - loss: 0.0032 - crf_viterbi_accuracy: 0.9988 - val_loss: 0.

In [25]:
with open("../models/model_bilstm_cnn_transformer_withoutfasttext.json", "w") as output:
    output.write(model.to_json())
    
model.save_weights("../models/model_bilstm_cnn_transformer_withoutfasttext.h5", overwrite=True)

In [27]:
pred_cat = model.predict(valX_word)
pred = np.argmax(pred_cat, axis=-1)
valy_true = np.argmax(valy, -1)

In [28]:
from sklearn_crfsuite.metrics import flat_classification_report

# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
valy_true_tag = [[idx2tag[i] for i in row] for row in valy_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=valy_true_tag)
print(report)

  'precision', 'predicted', average, warn_for)


                  precision    recall  f1-score   support

   NORMALIZABLES       0.91      0.60      0.72      1048
NO_NORMALIZABLES       0.00      0.00      0.00        13
           OTHER       0.99      1.00      0.99     47099
             PAD       1.00      1.00      1.00    276777
       PROTEINAS       0.82      0.69      0.75       836
         UNCLEAR       1.00      0.04      0.07        27

       micro avg       1.00      1.00      1.00    325800
       macro avg       0.79      0.55      0.59    325800
    weighted avg       1.00      1.00      1.00    325800



In [29]:
val['tag_pred'] = ''
for i, value in enumerate(tqdm(val.doc_sent.unique())):
    if len(val[val.doc_sent == value]) <= MAX_LEN:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i][:len(val[val.doc_sent == value])]
    else:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i] + ['OTHER']*(len(val[val.doc_sent == value]) - MAX_LEN)
        
val.to_csv('../data/val_submission/bilstm_transformer_withoutfasttext.csv',index=False)


100%|██████████| 1086/1086 [00:30<00:00, 32.66it/s]


In [31]:
# Model definition
input_word = Input(shape=(MAX_LEN,))
out_word = Embedding(input_dim=n_words+2, output_dim=EMBEDDING,weights=[embedding_matrix],trainable=True, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=False)(input_word)  # default: 20-dim embedding


#lstm1 = Bidirectional(LSTM(units=150, return_sequences=True))(out_word)  # variational biLSTM
#conv1 = Conv1D(kernel_size=3, filters=300, padding='same', strides=1)(out_word)
#conv2 = Conv1D(kernel_size=5, filters=300, padding='same', strides=1)(out_word)
#conv3 = Conv1D(kernel_size=7, filters=300, padding='same', strides=1)(out_word)

#model = Concatenate()([lstm1,conv1,conv2,conv3])

#model = Bidirectional(LSTM(units=150, return_sequences=True))(model)  # variational biLSTM
#model = Conv1D(kernel_size=3, filters=300, padding='same', strides=1)(model)

model = Bidirectional(LSTM(units=100, return_sequences=True))(out_word)

model = Position_Embedding()(out_word)
model = Attention(nb_head=10,size_per_head=10)([model,model,model])

#model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
#model = BatchNormalization()(model)
#model = Bidirectional(LSTM(units=100, activation='relu', return_sequences=True))(model)
model = TimeDistributed(Dense(50))(model)

encoding = Model(input_word, model)

crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output

model = Model(input_word, out)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 300, 300)     24048300    input_3[0][0]                    
__________________________________________________________________________________________________
position__embedding_3 (Position (None, 300, 300)     0           embedding_3[0][0]                
__________________________________________________________________________________________________
attention_3 (Attention)         (None, 300, 100)     90000       position__embedding_3[0][0]      
                                                                 position__embedding_3[0][0]      
          

In [32]:
EPOCHS=30
#reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.2,
#                              patience=3, min_lr=0.005)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0002, 
                                           patience=5, verbose=0, mode='min')
history = model.fit(np.array(trainX_word), np.array(trainy), batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=2,
                    validation_data=(np.array(valX_word),np.array(valy)),
                   callbacks = [early_stop])

Train on 2315 samples, validate on 1086 samples
Epoch 1/30
 - 62s - loss: 0.0973 - crf_viterbi_accuracy: 0.9755 - val_loss: 0.0301 - val_crf_viterbi_accuracy: 0.9940
Epoch 2/30
 - 52s - loss: 0.0228 - crf_viterbi_accuracy: 0.9947 - val_loss: 0.0181 - val_crf_viterbi_accuracy: 0.9951
Epoch 3/30
 - 53s - loss: 0.0108 - crf_viterbi_accuracy: 0.9967 - val_loss: 0.0143 - val_crf_viterbi_accuracy: 0.9959
Epoch 4/30
 - 48s - loss: 0.0064 - crf_viterbi_accuracy: 0.9981 - val_loss: 0.0132 - val_crf_viterbi_accuracy: 0.9969
Epoch 5/30
 - 51s - loss: 0.0041 - crf_viterbi_accuracy: 0.9990 - val_loss: 0.0149 - val_crf_viterbi_accuracy: 0.9969
Epoch 6/30
 - 51s - loss: 0.0031 - crf_viterbi_accuracy: 0.9992 - val_loss: 0.0134 - val_crf_viterbi_accuracy: 0.9970
Epoch 7/30
 - 49s - loss: 0.0025 - crf_viterbi_accuracy: 0.9994 - val_loss: 0.0150 - val_crf_viterbi_accuracy: 0.9975
Epoch 8/30
 - 53s - loss: 0.0020 - crf_viterbi_accuracy: 0.9995 - val_loss: 0.0154 - val_crf_viterbi_accuracy: 0.9973
Epoch 9/

In [33]:
with open("../models/model_bilstm_cnn_transformer_withfasttext.json", "w") as output:
    output.write(model.to_json())
    
model.save_weights("../models/model_bilstm_cnn_transformer_withfasttext.h5", overwrite=True)

In [35]:
pred_cat = model.predict(valX_word)
pred = np.argmax(pred_cat, axis=-1)
valy_true = np.argmax(valy, -1)

In [36]:
pred_tag = [[idx2tag[i] for i in row] for row in pred]
valy_true_tag = [[idx2tag[i] for i in row] for row in valy_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=valy_true_tag)
print(report)

  'precision', 'predicted', average, warn_for)


                  precision    recall  f1-score   support

   NORMALIZABLES       0.67      0.70      0.68      1048
NO_NORMALIZABLES       0.00      0.00      0.00        13
           OTHER       0.99      0.99      0.99     47099
             PAD       1.00      1.00      1.00    276777
       PROTEINAS       0.68      0.72      0.70       836
         UNCLEAR       0.71      0.19      0.29        27

       micro avg       1.00      1.00      1.00    325800
       macro avg       0.68      0.60      0.61    325800
    weighted avg       1.00      1.00      1.00    325800



In [37]:
val['tag_pred'] = ''
for i, value in enumerate(tqdm(val.doc_sent.unique())):
    if len(val[val.doc_sent == value]) <= MAX_LEN:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i][:len(val[val.doc_sent == value])]
    else:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i] + ['OTHER']*(len(val[val.doc_sent == value]) - MAX_LEN)
        
val.to_csv('../data/val_submission/bilstm_transformer_withfasttext.csv',index=False)

100%|██████████| 1086/1086 [00:28<00:00, 37.66it/s]


#### Testing

In [20]:
# Model definition
input_word = Input(shape=(MAX_LEN,))
out_word = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=False)(input_word)  # default: 20-dim embedding


#lstm1 = Bidirectional(LSTM(units=150, return_sequences=True))(out_word)  # variational biLSTM
#conv1 = Conv1D(kernel_size=3, filters=300, padding='same', strides=1)(out_word)
#conv2 = Conv1D(kernel_size=5, filters=300, padding='same', strides=1)(out_word)
#conv3 = Conv1D(kernel_size=7, filters=300, padding='same', strides=1)(out_word)

#model = Concatenate()([lstm1,conv1,conv2,conv3])

#model = Bidirectional(LSTM(units=150, return_sequences=True))(model)  # variational biLSTM
#model = Conv1D(kernel_size=3, filters=300, padding='same', strides=1)(model)

model = Bidirectional(LSTM(units=100, return_sequences=True))(out_word)

model = Position_Embedding()(model)
model = Attention(nb_head=10,size_per_head=10)([model,model,model])

#model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
#model = BatchNormalization()(model)
#model = Bidirectional(LSTM(units=100, activation='relu', return_sequences=True))(model)
model = TimeDistributed(Dense(50))(model)

encoding = Model(input_word, model)

crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output

model = Model(input_word, out)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()

Instructions for updating:
Colocations handled automatically by placer.




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 300)     24048300    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 300, 200)     320800      embedding_1[0][0]                
__________________________________________________________________________________________________
position__embedding_1 (Position (None, 300, 200)     0           bidirectional_1[0][0]            
__________________________________________________________________________________________________
attention_

In [23]:
train_valX = np.concatenate([np.array(trainX_word),np.array(valX_word)],axis=0)
train_valy = np.concatenate([np.array(trainy),np.array(valy)],axis=0)

print (train_valX.shape, train_valy.shape)

(3401, 300) (3401, 300, 6)


In [24]:
EPOCHS=15
#reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.2,
#                              patience=3, min_lr=0.005)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0002, 
                                           patience=5, verbose=0, mode='min')

history = model.fit(train_valX, train_valy, batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=2,validation_split=.1,
                   callbacks = [early_stop])

Instructions for updating:
Use tf.cast instead.
Train on 3060 samples, validate on 341 samples
Epoch 1/15
 - 132s - loss: 0.1391 - crf_viterbi_accuracy: 0.9519 - val_loss: 0.0342 - val_crf_viterbi_accuracy: 0.9942
Epoch 2/15
 - 125s - loss: 0.0303 - crf_viterbi_accuracy: 0.9943 - val_loss: 0.0303 - val_crf_viterbi_accuracy: 0.9944
Epoch 3/15
 - 127s - loss: 0.0251 - crf_viterbi_accuracy: 0.9944 - val_loss: 0.0206 - val_crf_viterbi_accuracy: 0.9944
Epoch 4/15
 - 126s - loss: 0.0140 - crf_viterbi_accuracy: 0.9954 - val_loss: 0.0145 - val_crf_viterbi_accuracy: 0.9960
Epoch 5/15
 - 129s - loss: 0.0078 - crf_viterbi_accuracy: 0.9977 - val_loss: 0.0133 - val_crf_viterbi_accuracy: 0.9976
Epoch 6/15
 - 135s - loss: 0.0049 - crf_viterbi_accuracy: 0.9989 - val_loss: 0.0125 - val_crf_viterbi_accuracy: 0.9980
Epoch 7/15
 - 131s - loss: 0.0034 - crf_viterbi_accuracy: 0.9994 - val_loss: 0.0127 - val_crf_viterbi_accuracy: 0.9982
Epoch 8/15
 - 126s - loss: 0.0026 - crf_viterbi_accuracy: 0.9996 - val_l

In [25]:
model.save_weights("../models/model_bilstm_cnn_transformer_withoutfasttext_test.h5", overwrite=True)

In [26]:
test_without_tag = test[pd.notnull(test.tag) == False]

testX = np.load('../data/testX.npy')
'''
testX = [[word2idx.get(w,1) for w in list(test_without_tag[test_without_tag.doc_sent == s].word)] for s in test_without_tag.doc_sent.unique()]
# Padding each sentence to have the same lenght
testX = pad_sequences(maxlen=MAX_LEN, sequences=testX, padding="post", value=word2idx["PAD"])
'''
print (testX.shape)

(17962, 300)


In [27]:
pred_cat = model.predict(testX)
pred = np.argmax(pred_cat, axis=-1)
pred_tag = [[idx2tag[i] for i in row] for row in pred]

In [28]:
test_without_tag['tag'] = ''
output = []
for i, val in enumerate(tqdm(test_without_tag.doc_sent.unique())):
    if len(test_without_tag[test_without_tag.doc_sent == val]) <= MAX_LEN:
        output += pred_tag[i][:len(test_without_tag[test_without_tag.doc_sent == val])]
    else:
        output += pred_tag[i] + ['OTHER']*(len(test_without_tag[test_without_tag.doc_sent == val]) - MAX_LEN)
        
test_without_tag['tag'] = output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
100%|██████████| 17962/17962 [26:50<00:00, 11.18it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [29]:
test.loc[pd.notnull(test.tag) == False,'tag'] = test_without_tag.tag
test = test.drop_duplicates(['doc_sent','word','n1','n2'])

In [30]:
test.to_csv('../data/test_submission/bilstm_transformer.csv',index=False)