In [1]:
import tensorflow as tf
import keras

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
import os, csv, math, codecs

import spacy

Using TensorFlow backend.


#### Data loading

Read training, dev and validation data. Dataset are in below format

document id | sentence number | word | NER tag


In [2]:
train = pd.read_csv("../data/subtask1_train.csv")
val = pd.read_csv("../data/subtask1_dev.csv")
test = pd.read_csv('../data/subtask1_test.csv')

In [3]:
train['doc_sent'] = train.apply(lambda x: "{}_{}".format(str(x['document']),str(x['sentence'])), axis=1)
val['doc_sent'] = val.apply(lambda x: "{}_{}".format(str(x['document']),str(x['sentence'])), axis=1)
test['doc_sent'] = test.apply(lambda x: "{}_{}".format(str(x['document']),str(x['sentence'])), axis=1)

In [4]:
train = train[train.tag != 'NULL']
val = val[val.tag != 'NULL']
train = train[pd.notnull(train.tag)]
val = val[pd.notnull(val.tag)]

In [5]:
import re
print (train.shape, val.shape)
#train['word'] = train['word'].apply(lambda x: re.sub(r'[^\w]','',x))
#train['wordlen'] = train.word.apply(lambda x: len(x))
#train = train[train.wordlen >= 2]
#print (train.shape)
#val['word'] = val['word'].apply(lambda x: re.sub(r'[^\w]','',x))
#val['wordlen'] = val.word.apply(lambda x: len(x))
#val = val[val.wordlen >= 2]
print (val.shape)
test.replace(np.nan,'',inplace=True)
#test['word'] = test['word'].apply(lambda x: x.lower())
#test['word'] = test['word'].apply(lambda x: re.sub(r'[^\w]','',str(x)))
#test['wordlen'] = test.word.apply(lambda x: len(x))
#test = test[test.wordlen >= 2]
print (test.shape)

(97244, 6) (49051, 8)
(49051, 8)
(872149, 7)


In [6]:
train.head(3)

Unnamed: 0,document,sentence,word,tag,pos,doc_sent
0,S0004-06142008000100008-1,0,presentamos,OTHER,VERB,S0004-06142008000100008-1_0
1,S0004-06142008000100008-1,0,caso,OTHER,NOUN,S0004-06142008000100008-1_0
2,S0004-06142008000100008-1,0,mujer,OTHER,NOUN,S0004-06142008000100008-1_0


In [7]:
test.head(5)

Unnamed: 0,document,sentence,n1,n2,word,pos,doc_sent
0,S0004-06142008000100008-1,0,0,11,presentamos,VERB,S0004-06142008000100008-1_0
1,S0004-06142008000100008-1,0,15,19,caso,NOUN,S0004-06142008000100008-1_0
2,S0004-06142008000100008-1,0,27,32,mujer,NOUN,S0004-06142008000100008-1_0
3,S0004-06142008000100008-1,0,36,38,30,NUM,S0004-06142008000100008-1_0
4,S0004-06142008000100008-1,0,39,43,años,NOUN,S0004-06142008000100008-1_0


In [8]:
test = pd.merge(test,pd.concat([train,val],axis=0),how='left')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [9]:
test.tag.value_counts()

OTHER               47127
NORMALIZABLES        1048
PROTEINAS             836
UNCLEAR                27
NO_NORMALIZABLES       13
Name: tag, dtype: int64

In [10]:
print (len(test[pd.notnull(test.tag)]), len(test))

49051 872149


In [11]:
train.groupby(['document','sentence'])['word'].nunique().describe()

count    2315.000000
mean       38.097624
std        33.411630
min         1.000000
25%        16.000000
50%        29.000000
75%        50.000000
max       359.000000
Name: word, dtype: float64

In [12]:
test.groupby(['document','sentence'])['word'].nunique().describe()

count    19048.000000
mean        41.185426
std         37.742500
min          1.000000
25%         17.000000
50%         31.000000
75%         53.000000
max        454.000000
Name: word, dtype: float64

In [13]:
val.groupby(['document','sentence'])['word'].nunique().describe()

count    1086.000000
mean       40.960405
std        34.282256
min         1.000000
25%        18.000000
50%        32.000000
75%        52.000000
max       264.000000
Name: word, dtype: float64

In [14]:
test.word.nunique()

80159

In [15]:
BATCH_SIZE = 32
EPOCHS = 20
MAX_LEN = 300
EMBEDDING = 300
MAX_NB_WORDS = 80000
n_tags = train.tag.nunique()
words = list(set(test.word))
n_words = len(words)
print (n_words, n_tags)

80159 5


In [16]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, BatchNormalization, GRU, CuDNNLSTM, CuDNNGRU, Concatenate
from keras_contrib.layers import CRF
from keras.callbacks import EarlyStopping, ModelCheckpoint

#### Data preprocessing for LSTM model

Convert the input sentences into sequence of words with maximum length as 300. For outputs we one hot encode. Additionally, we add 'PAD' to shorter input texts, as well as in the outputs for TAGs.

In [17]:
#words = list(set(train.word))
poss = list(set(test.pos))
tags = list(set(train.tag))

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding

pos2idx = {w: i + 2 for i, w in enumerate(poss)}
pos2idx["UNK"] = 1 # Padding
pos2idx["PAD"] = 0 # Padding

# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}

idx2pos = {i: w for w, i in pos2idx.items()}

# Vocabulary Key:Label/Tag -> Value:tag_index
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}

In [18]:
pos2idx

{'SCONJ': 2,
 'NUM': 3,
 'PROPN': 4,
 'SPACE': 5,
 'VERB': 6,
 'INTJ': 7,
 'CONJ': 8,
 'PUNCT': 9,
 'PRON': 10,
 'ADJ': 11,
 'DET': 12,
 'NOUN': 13,
 'SYM': 14,
 'ADV': 15,
 'ADP': 16,
 'AUX': 17,
 'UNK': 1,
 'PAD': 0}

In [19]:
# Convert each sentence from list of Token to list of word_index
trainX = [[word2idx[w] for w in list(train[train.doc_sent == s].word)] for s in train.doc_sent.unique()]
# Padding each sentence to have the same lenght
trainX = pad_sequences(maxlen=MAX_LEN, sequences=trainX, padding="post", value=word2idx["PAD"])

valX = [[word2idx.get(w,1) for w in list(val[val.doc_sent == s].word)] for s in val.doc_sent.unique()]
# Padding each sentence to have the same lenght
valX = pad_sequences(maxlen=MAX_LEN, sequences=valX, padding="post", value=word2idx["PAD"])

trainX_pos = [[pos2idx[w] for w in list(train[train.doc_sent == s].pos)] for s in train.doc_sent.unique()]
# Padding each sentence to have the same lenght
trainX_pos = pad_sequences(maxlen=MAX_LEN, sequences=trainX_pos, padding="post", value=pos2idx["PAD"])

valX_pos = [[pos2idx.get(w,1) for w in list(val[val.doc_sent == s].pos)] for s in val.doc_sent.unique()]
# Padding each sentence to have the same lenght
valX_pos = pad_sequences(maxlen=MAX_LEN, sequences=valX_pos, padding="post", value=pos2idx["PAD"])

trainy = [[tag2idx[w] for w in list(train[train.doc_sent == s].tag)] for s in train.doc_sent.unique()]
# Padding each sentence to have the same lenght
trainy = pad_sequences(maxlen=MAX_LEN, sequences=trainy, padding="post", value=word2idx["PAD"])

valy = [[tag2idx[w] for w in list(val[val.doc_sent == s].tag)] for s in val.doc_sent.unique()]
# Padding each sentence to have the same lenght
valy = pad_sequences(maxlen=MAX_LEN, sequences=valy, padding="post", value=word2idx["PAD"])

# One-Hot encode
trainy = [to_categorical(i, num_classes=n_tags+1) for i in trainy]  # n_tags+1(PAD)
valy = [to_categorical(i, num_classes=n_tags+1) for i in valy]  # n_tags+1(PAD)

print (np.array(trainX).shape, np.array(valX).shape, np.array(trainy).shape, np.array(valy).shape)

(2315, 300) (1086, 300) (2315, 300, 6) (1086, 300, 6)


In [25]:
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('/Users/victor/Documents/Models/cc.es.300.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

1853it [00:00, 8736.63it/s]

loading word embeddings...


2000001it [02:38, 12628.75it/s]


In [26]:
print('preparing embedding matrix...')
EMBEDDING = 300
words_not_found = []
embedding_matrix = np.random.uniform(low=-.25,high=.25,size=(n_words+2, EMBEDDING))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print (len(words_not_found))

preparing embedding matrix...
42579


In [33]:
# Model definition
input1 = Input(shape=(MAX_LEN,))
in_word = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input1)  # default: 20-dim embedding

input2 = Input(shape=(MAX_LEN,))
in_pos = Embedding(input_dim=len(pos2idx), output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input2)  # default: 20-dim embedding

model1 = Bidirectional(LSTM(units=100, return_sequences=True))(in_word)  # variational biLSTM
model2 = Bidirectional(LSTM(units=100, return_sequences=True))(in_pos)

model = Concatenate()([model1,model2])

#model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
#model = BatchNormalization()(model)
#model = Bidirectional(LSTM(units=100, activation='relu', return_sequences=True))(model)
model = TimeDistributed(Dense(50))(model)
crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output

model = Model([input1,input2], out)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 300, 300)     24048300    input_9[0][0]                    
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 300, 300)     5400        input_10[0][0]                   
__________________________________________________________________________________________________
bidirectio

In [34]:
EPOCHS=15
#reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.2,
#                              patience=3, min_lr=0.005)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0002, 
                                           patience=5, verbose=0, mode='min')
history = model.fit([np.array(trainX),np.array(trainX_pos)], np.array(trainy), batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=2,
                    validation_data=([np.array(valX),np.array(valX_pos)],np.array(valy)),
                   callbacks = [early_stop])

Instructions for updating:
Use tf.cast instead.
Train on 2315 samples, validate on 1086 samples
Epoch 1/15
 - 109s - loss: 32.3614 - crf_viterbi_accuracy: 0.9490 - val_loss: 26.2058 - val_crf_viterbi_accuracy: 0.9650
Epoch 2/15
 - 102s - loss: 32.1360 - crf_viterbi_accuracy: 0.9788 - val_loss: 26.1591 - val_crf_viterbi_accuracy: 0.9836
Epoch 3/15
 - 98s - loss: 32.0930 - crf_viterbi_accuracy: 0.9937 - val_loss: 26.1645 - val_crf_viterbi_accuracy: 0.9854
Epoch 4/15
 - 99s - loss: 32.0829 - crf_viterbi_accuracy: 0.9974 - val_loss: 26.1678 - val_crf_viterbi_accuracy: 0.9864
Epoch 5/15
 - 99s - loss: 32.0801 - crf_viterbi_accuracy: 0.9983 - val_loss: 26.1785 - val_crf_viterbi_accuracy: 0.9865
Epoch 6/15
 - 101s - loss: 32.0782 - crf_viterbi_accuracy: 0.9991 - val_loss: 26.1837 - val_crf_viterbi_accuracy: 0.9865
Epoch 7/15
 - 105s - loss: 32.0774 - crf_viterbi_accuracy: 0.9994 - val_loss: 26.1827 - val_crf_viterbi_accuracy: 0.9865


In [35]:
with open("../models/model_bilstm_pos_withoutfasttext.json", "w") as output:
    output.write(model.to_json())
    
model.save_weights("../models/model_bilstm_pos_withoutfasttext.h5", overwrite=True)

In [36]:
pred_cat = model.predict([valX,valX_pos])
pred = np.argmax(pred_cat, axis=-1)
valy_true = np.argmax(valy, -1)

In [37]:
from sklearn_crfsuite.metrics import flat_classification_report

# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
valy_true_tag = [[idx2tag[i] for i in row] for row in valy_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=valy_true_tag)
print(report)

  'precision', 'predicted', average, warn_for)


                  precision    recall  f1-score   support

   NORMALIZABLES       0.96      0.69      0.80      1048
NO_NORMALIZABLES       0.00      0.00      0.00        13
           OTHER       0.99      1.00      0.99     47099
             PAD       1.00      1.00      1.00    276777
       PROTEINAS       0.87      0.73      0.79       836
         UNCLEAR       0.46      0.22      0.30        27

       micro avg       1.00      1.00      1.00    325800
       macro avg       0.71      0.61      0.65    325800
    weighted avg       1.00      1.00      1.00    325800



In [38]:
val['tag_pred'] = ''
for i, value in enumerate(tqdm(val.doc_sent.unique())):
    if len(val[val.doc_sent == value]) <= MAX_LEN:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i][:len(val[val.doc_sent == value])]
    else:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i] + ['OTHER']*(len(val[val.doc_sent == value]) - MAX_LEN)
        
val.to_csv('../data/val_submission/bilstm_pos_withoutfasttext.csv',index=False)


100%|██████████| 1086/1086 [00:29<00:00, 36.31it/s]


In [40]:
# Model definition
input1 = Input(shape=(MAX_LEN,))
in_word = Embedding(input_dim=n_words+2, output_dim=EMBEDDING,weights=[embedding_matrix],trainable=True, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input1)  # default: 20-dim embedding

input2 = Input(shape=(MAX_LEN,))
in_pos = Embedding(input_dim=len(pos2idx), output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input2)  # default: 20-dim embedding

model1 = Bidirectional(LSTM(units=100, return_sequences=True))(in_word)  # variational biLSTM
model2 = Bidirectional(LSTM(units=100, return_sequences=True))(in_pos)

model = Concatenate()([model1,model2])

#model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
#model = BatchNormalization()(model)
#model = Bidirectional(LSTM(units=100, activation='relu', return_sequences=True))(model)
model = TimeDistributed(Dense(50))(model)
crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output

model = Model([input1,input2], out)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 300, 300)     24048300    input_11[0][0]                   
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 300, 300)     5400        input_12[0][0]                   
__________________________________________________________________________________________________
bidirectio

In [41]:
EPOCHS=15
#reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.2,
#                              patience=3, min_lr=0.005)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0002, 
                                           patience=5, verbose=0, mode='min')
history = model.fit([np.array(trainX),np.array(trainX_pos)], np.array(trainy), batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=2,
                    validation_data=([np.array(valX),np.array(valX_pos)],np.array(valy)),
                   callbacks = [early_stop])

Train on 2315 samples, validate on 1086 samples
Epoch 1/15
 - 112s - loss: 32.3184 - crf_viterbi_accuracy: 0.9501 - val_loss: 26.2071 - val_crf_viterbi_accuracy: 0.9615
Epoch 2/15
 - 97s - loss: 32.1420 - crf_viterbi_accuracy: 0.9749 - val_loss: 26.1531 - val_crf_viterbi_accuracy: 0.9819
Epoch 3/15
 - 98s - loss: 32.0971 - crf_viterbi_accuracy: 0.9924 - val_loss: 26.1451 - val_crf_viterbi_accuracy: 0.9853
Epoch 4/15
 - 94s - loss: 32.0845 - crf_viterbi_accuracy: 0.9969 - val_loss: 26.1549 - val_crf_viterbi_accuracy: 0.9862
Epoch 5/15
 - 94s - loss: 32.0806 - crf_viterbi_accuracy: 0.9981 - val_loss: 26.1499 - val_crf_viterbi_accuracy: 0.9864
Epoch 6/15
 - 98s - loss: 32.0789 - crf_viterbi_accuracy: 0.9987 - val_loss: 26.1603 - val_crf_viterbi_accuracy: 0.9868
Epoch 7/15
 - 94s - loss: 32.0773 - crf_viterbi_accuracy: 0.9994 - val_loss: 26.1614 - val_crf_viterbi_accuracy: 0.9868
Epoch 8/15
 - 94s - loss: 32.0766 - crf_viterbi_accuracy: 0.9997 - val_loss: 26.1648 - val_crf_viterbi_accuracy

In [42]:
with open("../models/model_bilstm_pos_withfasttext.json", "w") as output:
    output.write(model.to_json())
    
model.save_weights("../models/model_bilstm_pos_withfasttext.h5", overwrite=True)

In [43]:
pred_cat = model.predict([valX,valX_pos])
pred = np.argmax(pred_cat, axis=-1)
valy_true = np.argmax(valy, -1)

In [44]:
pred_tag = [[idx2tag[i] for i in row] for row in pred]
valy_true_tag = [[idx2tag[i] for i in row] for row in valy_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=valy_true_tag)
print(report)

                  precision    recall  f1-score   support

   NORMALIZABLES       0.94      0.73      0.82      1048
NO_NORMALIZABLES       0.00      0.00      0.00        13
           OTHER       0.99      1.00      0.99     47099
             PAD       1.00      1.00      1.00    276777
       PROTEINAS       0.84      0.75      0.80       836
         UNCLEAR       0.38      0.37      0.38        27

       micro avg       1.00      1.00      1.00    325800
       macro avg       0.69      0.64      0.66    325800
    weighted avg       1.00      1.00      1.00    325800



In [45]:
val['tag_pred'] = ''
for i, value in enumerate(tqdm(val.doc_sent.unique())):
    if len(val[val.doc_sent == value]) <= MAX_LEN:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i][:len(val[val.doc_sent == value])]
    else:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i] + ['OTHER']*(len(val[val.doc_sent == value]) - MAX_LEN)
        
val.to_csv('../data/val_submission/bilstm_pos_withfasttext.csv',index=False)


100%|██████████| 1086/1086 [00:15<00:00, 70.80it/s]


#### Testing

In [20]:
# Model definition
input1 = Input(shape=(MAX_LEN,))
in_word = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input1)  # default: 20-dim embedding

input2 = Input(shape=(MAX_LEN,))
in_pos = Embedding(input_dim=len(pos2idx), output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input2)  # default: 20-dim embedding

model1 = Bidirectional(LSTM(units=100, return_sequences=True))(in_word)  # variational biLSTM
model2 = Bidirectional(LSTM(units=100, return_sequences=True))(in_pos)

model = Concatenate()([model1,model2])

#model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
#model = BatchNormalization()(model)
#model = Bidirectional(LSTM(units=100, activation='relu', return_sequences=True))(model)
model = TimeDistributed(Dense(50))(model)
crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output

model = Model([input1,input2], out)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()

Instructions for updating:
Colocations handled automatically by placer.




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 300)     24048300    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 300, 300)     5400        input_2[0][0]                    
__________________________________________________________________________________________________
bidirectio

In [21]:
train_valX = np.concatenate([np.array(trainX),np.array(valX)],axis=0)
train_val_posX = np.concatenate([np.array(trainX_pos),np.array(valX_pos)],axis=0)
train_valy = np.concatenate([np.array(trainy),np.array(valy)],axis=0)

print (train_valX.shape, train_val_posX.shape, train_valy.shape)

(3401, 300) (3401, 300) (3401, 300, 6)


In [23]:
EPOCHS=15
#reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.2,
#                              patience=3, min_lr=0.005)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0002, 
                                           patience=5, verbose=0, mode='min')
history = model.fit([np.array(train_valX),np.array(train_val_posX)], np.array(train_valy), batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=2,
                    validation_split=.1,
                   callbacks = [early_stop])

Train on 3060 samples, validate on 341 samples
Epoch 1/15
 - 141s - loss: 30.6966 - crf_viterbi_accuracy: 0.9806 - val_loss: 25.9425 - val_crf_viterbi_accuracy: 0.9882
Epoch 2/15
 - 137s - loss: 30.6562 - crf_viterbi_accuracy: 0.9945 - val_loss: 25.9488 - val_crf_viterbi_accuracy: 0.9895
Epoch 3/15
 - 135s - loss: 30.6472 - crf_viterbi_accuracy: 0.9976 - val_loss: 25.9523 - val_crf_viterbi_accuracy: 0.9904
Epoch 4/15
 - 136s - loss: 30.6448 - crf_viterbi_accuracy: 0.9983 - val_loss: 25.9616 - val_crf_viterbi_accuracy: 0.9902
Epoch 5/15
 - 135s - loss: 30.6435 - crf_viterbi_accuracy: 0.9989 - val_loss: 25.9678 - val_crf_viterbi_accuracy: 0.9897
Epoch 6/15
 - 135s - loss: 30.6427 - crf_viterbi_accuracy: 0.9993 - val_loss: 25.9686 - val_crf_viterbi_accuracy: 0.9902


In [24]:
test_without_tag = test[pd.notnull(test.tag) == False]

testX = np.load('../data/testX.npy')
'''
testX = [[word2idx.get(w,1) for w in list(test_without_tag[test_without_tag.doc_sent == s].word)] for s in test_without_tag.doc_sent.unique()]
# Padding each sentence to have the same lenght
testX = pad_sequences(maxlen=MAX_LEN, sequences=testX, padding="post", value=word2idx["PAD"])
'''
testX_pos = [[word2idx.get(w,1) for w in list(test_without_tag[test_without_tag.doc_sent == s].pos)] for s in test_without_tag.doc_sent.unique()]
# Padding each sentence to have the same lenght
testX_pos = pad_sequences(maxlen=MAX_LEN, sequences=testX_pos, padding="post", value=pos2idx["PAD"])

np.save('../data/testX_pos',testX_pos)

print (testX.shape,testX_pos.shape)

(17962, 300) (17962, 300)


In [25]:
pred_cat = model.predict([testX,testX_pos])
pred = np.argmax(pred_cat, axis=-1)
pred_tag = [[idx2tag[i] for i in row] for row in pred]

In [29]:
test_without_tag['tag'] = ''
output = []
for i, val in enumerate(tqdm(test_without_tag.doc_sent.unique())):
    if len(test_without_tag[test_without_tag.doc_sent == val]) <= MAX_LEN:
        output += pred_tag[i][:len(test_without_tag[test_without_tag.doc_sent == val])]
    else:
        output += pred_tag[i] + ['OTHER']*(len(test_without_tag[test_without_tag.doc_sent == val]) - MAX_LEN)
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
100%|██████████| 17962/17962 [44:26<00:00,  5.27it/s]   


In [30]:
test_without_tag['tag'] = output
test.loc[pd.notnull(test.tag) == False,'tag'] = test_without_tag.tag
test = test.drop_duplicates(['doc_sent','word','n1','n2'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
model.save_weights("../models/model_bilstm_pos_withoutfasttext_test.h5", overwrite=True)

In [33]:
test.to_csv('../data/test_submission/bilstm_tag_crf.csv',index=False)