Forked from https://github.com/floydhub/named-entity-recognition-template/blob/master/ner.ipynb

In [1]:
import tensorflow as tf
import keras

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
import os, csv, math, codecs

import spacy

Using TensorFlow backend.


#### Data loading

Read training, dev and validation data. Dataset are in below format

document id | sentence number | word | NER tag


In [2]:
train = pd.read_csv("../data/subtask1_train.csv")
val = pd.read_csv("../data/subtask1_dev.csv")
test = pd.read_csv('../data/subtask1_test.csv')

In [3]:
train['doc_sent'] = train.apply(lambda x: "{}_{}".format(str(x['document']),str(x['sentence'])), axis=1)
val['doc_sent'] = val.apply(lambda x: "{}_{}".format(str(x['document']),str(x['sentence'])), axis=1)
test['doc_sent'] = test.apply(lambda x: "{}_{}".format(str(x['document']),str(x['sentence'])), axis=1)

In [4]:
train = train[train.tag != 'NULL']
val = val[val.tag != 'NULL']
train = train[pd.notnull(train.tag)]
val = val[pd.notnull(val.tag)]

In [5]:
import re
print (train.shape, val.shape)
#train['word'] = train['word'].apply(lambda x: re.sub(r'[^\w]','',x))
#train['wordlen'] = train.word.apply(lambda x: len(x))
#train = train[train.wordlen >= 2]
#print (train.shape)
#val['word'] = val['word'].apply(lambda x: re.sub(r'[^\w]','',x))
#val['wordlen'] = val.word.apply(lambda x: len(x))
#val = val[val.wordlen >= 2]
print (val.shape)
test.replace(np.nan,'',inplace=True)
#test['word'] = test['word'].apply(lambda x: x.lower())
#test['word'] = test['word'].apply(lambda x: re.sub(r'[^\w]','',str(x)))
#test['wordlen'] = test.word.apply(lambda x: len(x))
#test = test[test.wordlen >= 2]
print (test.shape)

(97244, 6) (49051, 8)
(49051, 8)
(872149, 7)


In [6]:
train.head(3)

Unnamed: 0,document,sentence,word,tag,pos,doc_sent
0,S0004-06142008000100008-1,0,presentamos,OTHER,VERB,S0004-06142008000100008-1_0
1,S0004-06142008000100008-1,0,caso,OTHER,NOUN,S0004-06142008000100008-1_0
2,S0004-06142008000100008-1,0,mujer,OTHER,NOUN,S0004-06142008000100008-1_0


In [7]:
test.head(5)

Unnamed: 0,document,sentence,n1,n2,word,pos,doc_sent
0,S0004-06142008000100008-1,0,0,11,presentamos,VERB,S0004-06142008000100008-1_0
1,S0004-06142008000100008-1,0,15,19,caso,NOUN,S0004-06142008000100008-1_0
2,S0004-06142008000100008-1,0,27,32,mujer,NOUN,S0004-06142008000100008-1_0
3,S0004-06142008000100008-1,0,36,38,30,NUM,S0004-06142008000100008-1_0
4,S0004-06142008000100008-1,0,39,43,años,NOUN,S0004-06142008000100008-1_0


In [8]:
test = pd.merge(test,pd.concat([train,val],axis=0),how='left')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [9]:
test.tag.value_counts()

OTHER               47127
NORMALIZABLES        1048
PROTEINAS             836
UNCLEAR                27
NO_NORMALIZABLES       13
Name: tag, dtype: int64

In [10]:
print (len(test[pd.notnull(test.tag)]), len(test))

49051 872149


In [11]:
train.groupby(['document','sentence'])['word'].nunique().describe()

count    2315.000000
mean       38.097624
std        33.411630
min         1.000000
25%        16.000000
50%        29.000000
75%        50.000000
max       359.000000
Name: word, dtype: float64

In [12]:
test.groupby(['document','sentence'])['word'].nunique().describe()

count    19048.000000
mean        41.185426
std         37.742500
min          1.000000
25%         17.000000
50%         31.000000
75%         53.000000
max        454.000000
Name: word, dtype: float64

In [13]:
val.groupby(['document','sentence'])['word'].nunique().describe()

count    1086.000000
mean       40.960405
std        34.282256
min         1.000000
25%        18.000000
50%        32.000000
75%        52.000000
max       264.000000
Name: word, dtype: float64

In [14]:
test.word.nunique()

80159

In [15]:
BATCH_SIZE = 32
EPOCHS = 20
MAX_LEN = 300
EMBEDDING = 300
MAX_NB_WORDS = 80000
n_tags = train.tag.nunique()
words = list(set(test.word))
n_words = len(words)
print (n_words, n_tags)

80159 5


In [16]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, BatchNormalization, GRU, CuDNNLSTM, CuDNNGRU
from keras_contrib.layers import CRF
from keras.callbacks import EarlyStopping, ModelCheckpoint

#### Data preprocessing for LSTM model

Convert the input sentences into sequence of words with maximum length as 300. For outputs we one hot encode. Additionally, we add 'PAD' to shorter input texts, as well as in the outputs for TAGs.

In [17]:
#words = list(set(train.word))
tags = list(set(train.tag))

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding

# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}

# Vocabulary Key:Label/Tag -> Value:tag_index
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}

In [18]:
# Convert each sentence from list of Token to list of word_index
trainX = [[word2idx[w] for w in list(train[train.doc_sent == s].word)] for s in train.doc_sent.unique()]
# Padding each sentence to have the same lenght
trainX = pad_sequences(maxlen=MAX_LEN, sequences=trainX, padding="post", value=word2idx["PAD"])

valX = [[word2idx.get(w,1) for w in list(val[val.doc_sent == s].word)] for s in val.doc_sent.unique()]
# Padding each sentence to have the same lenght
valX = pad_sequences(maxlen=MAX_LEN, sequences=valX, padding="post", value=word2idx["PAD"])

trainy = [[tag2idx[w] for w in list(train[train.doc_sent == s].tag)] for s in train.doc_sent.unique()]
# Padding each sentence to have the same lenght
trainy = pad_sequences(maxlen=MAX_LEN, sequences=trainy, padding="post", value=word2idx["PAD"])

valy = [[tag2idx[w] for w in list(val[val.doc_sent == s].tag)] for s in val.doc_sent.unique()]
# Padding each sentence to have the same lenght
valy = pad_sequences(maxlen=MAX_LEN, sequences=valy, padding="post", value=word2idx["PAD"])

# One-Hot encode
trainy = [to_categorical(i, num_classes=n_tags+1) for i in trainy]  # n_tags+1(PAD)
valy = [to_categorical(i, num_classes=n_tags+1) for i in valy]  # n_tags+1(PAD)

print (np.array(trainX).shape, np.array(valX).shape, np.array(trainy).shape, np.array(valy).shape)

(2315, 300) (1086, 300) (2315, 300, 6) (1086, 300, 6)


In [19]:
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('/Users/victor/Documents/Models/cc.es.300.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

960it [00:00, 9594.02it/s]

loading word embeddings...


2000001it [02:50, 11727.05it/s]


In [20]:
print('preparing embedding matrix...')
EMBEDDING = 300
words_not_found = []
embedding_matrix = np.random.uniform(low=-.25,high=.25,size=(n_words+2, EMBEDDING))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print (len(words_not_found))

preparing embedding matrix...
42579


In [32]:
# Model definition
input = Input(shape=(MAX_LEN,))
model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input)  # default: 20-dim embedding
model = Bidirectional(LSTM(units=200, return_sequences=True))(model)  # variational biLSTM
#model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
#model = BatchNormalization()(model)
#model = Bidirectional(LSTM(units=100, activation='relu', return_sequences=True))(model)
model = TimeDistributed(Dense(50))(model)
crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output

model = Model(input, out)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 300, 300)          24048300  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 300, 400)          801600    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 300, 50)           20050     
_________________________________________________________________
crf_3 (CRF)                  (None, 300, 6)            354       
Total params: 24,870,304
Trainable params: 24,870,304
Non-trainable params: 0
_________________________________________________________________


In [33]:
EPOCHS=15
#reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.2,
#                              patience=3, min_lr=0.005)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0002, 
                                           patience=5, verbose=0, mode='min')
history = model.fit(np.array(trainX), np.array(trainy), batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=2,
                    validation_data=(np.array(valX),np.array(valy)),
                   callbacks = [early_stop])

Train on 2315 samples, validate on 1086 samples
Epoch 1/15
 - 132s - loss: 32.5977 - crf_viterbi_accuracy: 0.8750 - val_loss: 26.2727 - val_crf_viterbi_accuracy: 0.9605
Epoch 2/15
 - 130s - loss: 32.2128 - crf_viterbi_accuracy: 0.9641 - val_loss: 26.1917 - val_crf_viterbi_accuracy: 0.9712
Epoch 3/15
 - 123s - loss: 32.1252 - crf_viterbi_accuracy: 0.9799 - val_loss: 26.1837 - val_crf_viterbi_accuracy: 0.9791
Epoch 4/15
 - 123s - loss: 32.1005 - crf_viterbi_accuracy: 0.9918 - val_loss: 26.1736 - val_crf_viterbi_accuracy: 0.9839
Epoch 5/15
 - 123s - loss: 32.0876 - crf_viterbi_accuracy: 0.9963 - val_loss: 26.1796 - val_crf_viterbi_accuracy: 0.9855
Epoch 6/15
 - 123s - loss: 32.0834 - crf_viterbi_accuracy: 0.9969 - val_loss: 26.1778 - val_crf_viterbi_accuracy: 0.9859
Epoch 7/15
 - 151s - loss: 32.0810 - crf_viterbi_accuracy: 0.9982 - val_loss: 26.1980 - val_crf_viterbi_accuracy: 0.9855
Epoch 8/15
 - 125s - loss: 32.0796 - crf_viterbi_accuracy: 0.9986 - val_loss: 26.1882 - val_crf_viterbi_a

In [34]:
with open("../models/model_bilstm_withoutfasttext.json", "w") as output:
    output.write(model.to_json())
    
model.save_weights("../models/model_bilstm_withoutfasttext.h5", overwrite=True)

In [35]:
pred_cat = model.predict(valX)
pred = np.argmax(pred_cat, axis=-1)
valy_true = np.argmax(valy, -1)

In [36]:
from sklearn_crfsuite.metrics import flat_classification_report

# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
valy_true_tag = [[idx2tag[i] for i in row] for row in valy_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=valy_true_tag)
print(report)

                  precision    recall  f1-score   support

   NORMALIZABLES       0.95      0.70      0.81      1048
NO_NORMALIZABLES       0.00      0.00      0.00        13
           OTHER       0.99      1.00      0.99     47099
             PAD       1.00      1.00      1.00    276777
       PROTEINAS       0.88      0.73      0.79       836
         UNCLEAR       0.80      0.30      0.43        27

       micro avg       1.00      1.00      1.00    325800
       macro avg       0.77      0.62      0.67    325800
    weighted avg       1.00      1.00      1.00    325800



In [None]:
val['tag_pred'] = ''
for i, value in enumerate(tqdm(val.doc_sent.unique())):
    if len(val[val.doc_sent == value]) <= MAX_LEN:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i][:len(val[val.doc_sent == value])]
    else:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i] + ['OTHER']*(len(val[val.doc_sent == value]) - MAX_LEN)
        
val.to_csv('../data/val_submission/bilstm_withoutfasttext.csv',index=False)

In [37]:
# Model definition
input = Input(shape=(MAX_LEN,))
model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING,weights=[embedding_matrix],trainable=True, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input)  # default: 20-dim embedding
model = Bidirectional(LSTM(units=200, return_sequences=True))(model)  # variational biLSTM
#model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
#model = BatchNormalization()(model)
#model = Bidirectional(LSTM(units=100, activation='relu', return_sequences=True))(model)
model = TimeDistributed(Dense(50))(model)
crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output

model = Model(input, out)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 300, 300)          24048300  
_________________________________________________________________
bidirectional_4 (Bidirection (None, 300, 400)          801600    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 300, 50)           20050     
_________________________________________________________________
crf_4 (CRF)                  (None, 300, 6)            354       
Total params: 24,870,304
Trainable params: 24,870,304
Non-trainable params: 0
_________________________________________________________________


In [38]:
EPOCHS=15
#reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.2,
#                              patience=3, min_lr=0.005)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0002, 
                                           patience=5, verbose=0, mode='min')
history = model.fit(np.array(trainX), np.array(trainy), batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=2,
                    validation_data=(np.array(valX),np.array(valy)),
                   callbacks = [early_stop])

Train on 2315 samples, validate on 1086 samples
Epoch 1/15
 - 127s - loss: 32.3987 - crf_viterbi_accuracy: 0.9106 - val_loss: 26.2271 - val_crf_viterbi_accuracy: 0.9617
Epoch 2/15
 - 122s - loss: 32.1623 - crf_viterbi_accuracy: 0.9715 - val_loss: 26.1740 - val_crf_viterbi_accuracy: 0.9786
Epoch 3/15
 - 122s - loss: 32.1059 - crf_viterbi_accuracy: 0.9901 - val_loss: 26.1664 - val_crf_viterbi_accuracy: 0.9839
Epoch 4/15
 - 122s - loss: 32.0876 - crf_viterbi_accuracy: 0.9957 - val_loss: 26.1661 - val_crf_viterbi_accuracy: 0.9847
Epoch 5/15
 - 122s - loss: 32.0820 - crf_viterbi_accuracy: 0.9975 - val_loss: 26.1702 - val_crf_viterbi_accuracy: 0.9854
Epoch 6/15
 - 339s - loss: 32.0792 - crf_viterbi_accuracy: 0.9988 - val_loss: 26.1747 - val_crf_viterbi_accuracy: 0.9855
Epoch 7/15
 - 122s - loss: 32.0789 - crf_viterbi_accuracy: 0.9988 - val_loss: 26.1825 - val_crf_viterbi_accuracy: 0.9854
Epoch 8/15
 - 122s - loss: 32.0774 - crf_viterbi_accuracy: 0.9994 - val_loss: 26.1843 - val_crf_viterbi_a

In [39]:
with open("../models/model_bilstm_withfasttext.json", "w") as output:
    output.write(model.to_json())
    
model.save_weights("../models/model_bilstm_withfasttext.h5", overwrite=True)

In [40]:
pred_cat = model.predict(valX)
pred = np.argmax(pred_cat, axis=-1)
valy_true = np.argmax(valy, -1)

In [41]:
pred_tag = [[idx2tag[i] for i in row] for row in pred]
valy_true_tag = [[idx2tag[i] for i in row] for row in valy_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=valy_true_tag)
print(report)

                  precision    recall  f1-score   support

   NORMALIZABLES       0.93      0.70      0.80      1048
NO_NORMALIZABLES       0.00      0.00      0.00        13
           OTHER       0.99      1.00      0.99     47099
             PAD       1.00      1.00      1.00    276777
       PROTEINAS       0.82      0.76      0.79       836
         UNCLEAR       0.71      0.37      0.49        27

       micro avg       1.00      1.00      1.00    325800
       macro avg       0.74      0.64      0.68    325800
    weighted avg       1.00      1.00      1.00    325800



In [42]:
val['tag_pred'] = ''
for i, value in enumerate(tqdm(val.doc_sent.unique())):
    if len(val[val.doc_sent == value]) <= MAX_LEN:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i][:len(val[val.doc_sent == value])]
    else:
        val.loc[val.doc_sent == value,'tag_pred'] = pred_tag[i] + ['OTHER']*(len(val[val.doc_sent == value]) - MAX_LEN)
        
val.to_csv('../data/val_submission/bilstm_withfasttext.csv',index=False)


100%|██████████| 1086/1086 [00:15<00:00, 69.33it/s]


#### Testing

In [21]:
# Model definition
input = Input(shape=(MAX_LEN,))
model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING,weights=[embedding_matrix],trainable=True, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input)  # default: 20-dim embedding
model = Bidirectional(LSTM(units=200, return_sequences=True))(model)  # variational biLSTM
#model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
#model = BatchNormalization()(model)
#model = Bidirectional(LSTM(units=100, activation='relu', return_sequences=True))(model)
model = TimeDistributed(Dense(50))(model)
crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output

model = Model(input, out)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()

Instructions for updating:
Colocations handled automatically by placer.




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 300, 300)          24048300  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 300, 400)          801600    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 300, 50)           20050     
_________________________________________________________________
crf_1 (CRF)                  (None, 300, 6)            354       
Total params: 24,870,304
Trainable params: 24,870,304
Non-trainable params: 0
_________________________________________________________________


In [22]:
model.load_weights('../models/model_bilstm_withfasttext.h5')

In [25]:
train_valX = np.concatenate([np.array(trainX),np.array(valX)],axis=0)
train_valy = np.concatenate([np.array(trainy),np.array(valy)],axis=0)

print (train_valX.shape, train_valy.shape)

(3401, 300) (3401, 300, 6)


In [27]:
EPOCHS=15
#reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy', factor=0.2,
#                              patience=3, min_lr=0.005)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0002, 
                                           patience=5, verbose=0, mode='min')

history = model.fit(train_valX, train_valy, batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=2,validation_split=.1,
                   callbacks = [early_stop])

Train on 3060 samples, validate on 341 samples
Epoch 1/15
 - 131s - loss: 31.3312 - crf_viterbi_accuracy: 0.8797 - val_loss: 26.0546 - val_crf_viterbi_accuracy: 0.9648
Epoch 2/15
 - 125s - loss: 30.7561 - crf_viterbi_accuracy: 0.9680 - val_loss: 25.9855 - val_crf_viterbi_accuracy: 0.9774
Epoch 3/15
 - 124s - loss: 30.6855 - crf_viterbi_accuracy: 0.9861 - val_loss: 25.9672 - val_crf_viterbi_accuracy: 0.9839
Epoch 4/15
 - 124s - loss: 30.6582 - crf_viterbi_accuracy: 0.9950 - val_loss: 25.9667 - val_crf_viterbi_accuracy: 0.9865
Epoch 5/15
 - 124s - loss: 30.6490 - crf_viterbi_accuracy: 0.9975 - val_loss: 25.9676 - val_crf_viterbi_accuracy: 0.9874
Epoch 6/15
 - 123s - loss: 30.6454 - crf_viterbi_accuracy: 0.9986 - val_loss: 25.9723 - val_crf_viterbi_accuracy: 0.9878
Epoch 7/15
 - 124s - loss: 30.6434 - crf_viterbi_accuracy: 0.9990 - val_loss: 25.9761 - val_crf_viterbi_accuracy: 0.9876
Epoch 8/15
 - 124s - loss: 30.6427 - crf_viterbi_accuracy: 0.9993 - val_loss: 25.9829 - val_crf_viterbi_ac

In [28]:
test_without_tag = test[pd.notnull(test.tag) == False]

testX = np.load('../data/testX.npy')
'''
testX = [[word2idx.get(w,1) for w in list(test_without_tag[test_without_tag.doc_sent == s].word)] for s in test_without_tag.doc_sent.unique()]
# Padding each sentence to have the same lenght
testX = pad_sequences(maxlen=MAX_LEN, sequences=testX, padding="post", value=word2idx["PAD"])
'''
print (testX.shape)

(17962, 300)


In [29]:
pred_cat = model.predict(testX)
pred = np.argmax(pred_cat, axis=-1)
pred_tag = [[idx2tag[i] for i in row] for row in pred]

In [30]:
test_without_tag['tag'] = ''
for i, val in enumerate(tqdm(test_without_tag.doc_sent.unique())):
    if len(test_without_tag[test_without_tag.doc_sent == val]) <= MAX_LEN:
        test_without_tag.loc[test_without_tag.doc_sent == val,'tag'] = pred_tag[i][:len(test_without_tag[test_without_tag.doc_sent == val])]
    else:
        test_without_tag.loc[test_without_tag.doc_sent == val,'tag'] = pred_tag[i] + ['OTHER']*(len(test_without_tag[test_without_tag.doc_sent == val]) - MAX_LEN)
        
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
100%|██████████| 17962/17962 [2:45:30<00:00,  2.89it/s]    


In [31]:
test.loc[pd.notnull(test.tag) == False,'tag'] = test_without_tag.tag
test = test.drop_duplicates(['doc_sent','word','n1','n2'])

In [32]:
model.save_weights("../models/model_bilstm_withfasttext_test.h5", overwrite=True)

In [33]:
test.to_csv('../data/test_submission/bilstm.csv',index=False)