# Deep Learning based Named Entity Recognition from Scratch : Disease Extraction Hackathon

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import unicodedata
 
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import TimeDistributed, Dropout, Bidirectional
 
# Defining Constants
 
# Maximum length of text sentences
MAXLEN = 180
# Number of LSTM units
LSTM_N = 150
# batch size
BS=48

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Reading the training set
data = pd.read_csv("train.csv", encoding="latin1")
data.head(10)

Unnamed: 0,id,Doc_ID,Sent_ID,Word,tag
0,1,1,1,Obesity,O
1,2,1,1,in,O
2,3,1,1,Low-,O
3,4,1,1,and,O
4,5,1,1,Middle-Income,O
5,6,1,1,Countries,O
6,7,1,1,:,O
7,8,1,1,Burden,O
8,9,1,1,",",O
9,10,1,1,Drivers,O


In [3]:
test_data = pd.read_csv("test.csv", encoding="latin1")
test_data.head(10)

Unnamed: 0,id,Doc_ID,Sent_ID,Word
0,4543834,30001,191283,CCCVA
1,4543835,30001,191283,","
2,4543836,30001,191283,MANOVA
3,4543837,30001,191283,","
4,4543838,30001,191283,my
5,4543839,30001,191283,black
6,4543840,30001,191283,hen
7,4543841,30001,191283,.
8,4543842,30001,191284,Comments
9,4543843,30001,191284,on


##  Creating Word & Tag dictionary

In [5]:
print("Number of uniques docs, sentences and words in Training set:\n",data.nunique())
print("\nNumber of uniques docs, sentences and words in Test set:\n",test_data.nunique())
 
# Creating a vocabulary
words = list(set(data["Word"].append(test_data["Word"]).values))
words.append("ENDPAD")
 
# Converting greek characters to ASCII characters eg. 'naïve café' to 'naive cafe'
words = [unicodedata.normalize('NFKD', str(w)).encode('ascii','ignore') for w in words]
n_words = len(words)
print("\nLength of vocabulary = ",n_words)
 
tags = list(set(data["tag"].values))
n_tags = len(tags)
print("\nnumber of tags = ",n_tags)
 
# Creating words to indices dictionary.
word2idx = {w: i for i, w in enumerate(words)}
# Creating tags to indices dictionary.
tag2idx = {t: i for i, t in enumerate(tags)}

Number of uniques docs, sentences and words in Training set:
 id         4543833
Doc_ID       30000
Sent_ID     191282
Word        184505
tag              3
dtype: int64

Number of uniques docs, sentences and words in Test set:
 id         2994463
Doc_ID       20000
Sent_ID     125840
Word        139891
dtype: int64

Length of vocabulary =  257203

number of tags =  3


## Getting Train & Test Sentences

In [8]:
def get_tagged_sentences(data):
    agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(), s["tag"].values.tolist())]
    grouped = data.groupby("Sent_ID").apply(agg_func)
    sentences = [s for s in grouped]
    return sentences
 
def get_test_sentences(data):
    agg_func = lambda s: [w for w in s["Word"].values.tolist()]
    grouped = data.groupby("Sent_ID").apply(agg_func)
    sentences = [s for s in grouped]
    return sentences
# Getting training sentences in a list
sentences = get_tagged_sentences(data)
print("First 2 sentences in a word list format:\n",sentences[0:2])

First 2 sentences in a word list format:
 [[('Obesity', 'O'), ('in', 'O'), ('Low-', 'O'), ('and', 'O'), ('Middle-Income', 'O'), ('Countries', 'O'), (':', 'O'), ('Burden', 'O'), (',', 'O'), ('Drivers', 'O'), (',', 'O'), ('and', 'O'), ('Emerging', 'O'), ('Challenges', 'O'), ('.', 'O')], [('We', 'O'), ('have', 'O'), ('reviewed', 'O'), ('the', 'O'), ('distinctive', 'O'), ('features', 'O'), ('of', 'O'), ('excess', 'O'), ('weight', 'O'), (',', 'O'), ('its', 'O'), ('causes', 'O'), (',', 'O'), ('and', 'O'), ('related', 'O'), ('prevention', 'O'), ('and', 'O'), ('management', 'O'), ('efforts', 'O'), (',', 'O'), ('as', 'O'), ('well', 'O'), ('as', 'O'), ('data', 'O'), ('gaps', 'O'), ('and', 'O'), ('recommendations', 'O'), ('for', 'O'), ('future', 'O'), ('research', 'O'), ('in', 'O'), ('low-', 'O'), ('and', 'O'), ('middle-income', 'O'), ('countries', 'O'), ('(', 'O'), ('LMICs', 'O'), (')', 'O'), ('.', 'O')]]


In [9]:
# Getting test sentences in a list
test_sentences = get_test_sentences(test_data)
print("First 2 sentences in a word list format:\n",test_sentences[0:2])

First 2 sentences in a word list format:
 [['CCCVA', ',', 'MANOVA', ',', 'my', 'black', 'hen', '.'], ['Comments', 'on', 'repeated', 'measures', '.']]


## Feature Extraction for DL Model


In [10]:
# Converting words to indices for test sentences (Features)
# Converting greek characters to ASCII characters in train set eg. 'naïve café' to 'naive cafe'
X = [[word2idx[unicodedata.normalize('NFKD', str(w[0])).
encode('ascii','ignore')] for w in s] for s in sentences]
 
# Converting words to indices for test sentences (Features)
# Converting greek characters to ASCII characters in test-set eg. 'naïve café' to 'naive cafe'
X_test = [[word2idx[unicodedata.normalize('NFKD', str(w)).
encode('ascii','ignore')] for w in s] for s in test_sentences]
 
'''
Padding train and test sentences to 180 words.
Sentences of length greater than 180 words are truncated.
Sentences of length less than 180 words are padded with a high value.
'''
X = pad_sequences(maxlen=MAXLEN, sequences=X, padding="post", value=n_words - 1)
X_test = pad_sequences(maxlen=MAXLEN, sequences=X_test, padding="post", value=n_words - 1)
 
# Converting tags to indices for test sentences (labels)
y = [[tag2idx[w[1]] for w in s] for s in sentences]
# Padding tag labels to 180 words.
y = pad_sequences(maxlen=MAXLEN, sequences=y, padding="post", value=tag2idx["O"])
 
# Making labels in one hot encoded form for DL model
y = [to_categorical(i, num_classes=n_tags) for i in y]

## Building Bidirectional LSTM Model

In [11]:
# 180 dimensional word indices as input
input = Input(shape=(MAXLEN,))
 
# Embedding layer of same length output (180 dim embedding will be generated)
model = Embedding(input_dim=n_words, output_dim=MAXLEN, input_length=MAXLEN)(input)
 
# Adding dropout layer
model = Dropout(0.2)(model)
 
# Bidirectional LSTM to learn from both forward as well as backward context
model = Bidirectional(LSTM(units=LSTM_N, return_sequences=True, recurrent_dropout=0.1))(model)
 
# Adding a TimeDistributedDense, to applying a Dense layer on each 180 timesteps
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model) # softmax output layer
model = Model(input, out)
 
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(X, np.array(y), batch_size=BS, epochs=2, validation_split=0.05, verbose=1)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 181717 samples, validate on 9565 samples
Epoch 1/2


  9312/181717 [>.............................] - ETA: 5:36:05 - loss: 1.1254 - acc: 0.03 - ETA: 3:42:49 - loss: 1.0717 - acc: 0.51 - ETA: 3:04:53 - loss: 1.0188 - acc: 0.67 - ETA: 2:45:25 - loss: 0.9628 - acc: 0.75 - ETA: 2:33:57 - loss: 0.8998 - acc: 0.80 - ETA: 2:26:52 - loss: 0.8323 - acc: 0.83 - ETA: 2:21:09 - loss: 0.7558 - acc: 0.85 - ETA: 2:17:05 - loss: 0.6755 - acc: 0.87 - ETA: 2:13:50 - loss: 0.6042 - acc: 0.88 - ETA: 2:11:33 - loss: 0.5450 - acc: 0.90 - ETA: 2:09:53 - loss: 0.4980 - acc: 0.90 - ETA: 2:08:03 - loss: 0.4576 - acc: 0.91 - ETA: 2:06:46 - loss: 0.4246 - acc: 0.92 - ETA: 2:05:25 - loss: 0.3952 - acc: 0.92 - ETA: 2:04:48 - loss: 0.3715 - acc: 0.93 - ETA: 2:04:17 - loss: 0.3493 - acc: 0.93 - ETA: 2:03:31 - loss: 0.3305 - acc: 0.94 - ETA: 2:02:27 - loss: 0.3132 - acc: 0.94 - ETA: 2:01:44 - loss: 0.2983 - acc: 0.94 - ETA: 2:01:15 - loss: 0.2846 - acc: 0.94 - ETA: 2:00:48 - loss: 0.2721 - acc: 0.95 - ETA: 2:00:54 - loss: 0.2609 - acc: 0.95 - ETA: 2:00:30 - loss: 0.2522

 18624/181717 [==>...........................] - ETA: 2:02:12 - loss: 0.0443 - acc: 0.99 - ETA: 2:02:15 - loss: 0.0441 - acc: 0.99 - ETA: 2:02:12 - loss: 0.0439 - acc: 0.99 - ETA: 2:02:10 - loss: 0.0438 - acc: 0.99 - ETA: 2:02:10 - loss: 0.0436 - acc: 0.99 - ETA: 2:02:12 - loss: 0.0435 - acc: 0.99 - ETA: 2:02:16 - loss: 0.0433 - acc: 0.99 - ETA: 2:02:13 - loss: 0.0432 - acc: 0.99 - ETA: 2:02:15 - loss: 0.0430 - acc: 0.99 - ETA: 2:02:17 - loss: 0.0429 - acc: 0.99 - ETA: 2:02:17 - loss: 0.0427 - acc: 0.99 - ETA: 2:02:13 - loss: 0.0425 - acc: 0.99 - ETA: 2:02:13 - loss: 0.0424 - acc: 0.99 - ETA: 2:02:19 - loss: 0.0422 - acc: 0.99 - ETA: 2:02:21 - loss: 0.0420 - acc: 0.99 - ETA: 2:02:19 - loss: 0.0419 - acc: 0.99 - ETA: 2:02:23 - loss: 0.0418 - acc: 0.99 - ETA: 2:02:26 - loss: 0.0417 - acc: 0.99 - ETA: 2:02:31 - loss: 0.0415 - acc: 0.99 - ETA: 2:02:35 - loss: 0.0414 - acc: 0.99 - ETA: 2:02:37 - loss: 0.0413 - acc: 0.99 - ETA: 2:02:33 - loss: 0.0412 - acc: 0.99 - ETA: 2:02:31 - loss: 0.0411

 27936/181717 [===>..........................] - ETA: 2:01:26 - loss: 0.0277 - acc: 0.99 - ETA: 2:01:24 - loss: 0.0277 - acc: 0.99 - ETA: 2:01:22 - loss: 0.0276 - acc: 0.99 - ETA: 2:01:18 - loss: 0.0276 - acc: 0.99 - ETA: 2:01:14 - loss: 0.0275 - acc: 0.99 - ETA: 2:01:13 - loss: 0.0275 - acc: 0.99 - ETA: 2:01:12 - loss: 0.0275 - acc: 0.99 - ETA: 2:01:09 - loss: 0.0274 - acc: 0.99 - ETA: 2:01:07 - loss: 0.0274 - acc: 0.99 - ETA: 2:01:06 - loss: 0.0273 - acc: 0.99 - ETA: 2:01:05 - loss: 0.0273 - acc: 0.99 - ETA: 2:01:05 - loss: 0.0272 - acc: 0.99 - ETA: 2:01:05 - loss: 0.0272 - acc: 0.99 - ETA: 2:01:04 - loss: 0.0271 - acc: 0.99 - ETA: 2:01:04 - loss: 0.0271 - acc: 0.99 - ETA: 2:01:04 - loss: 0.0270 - acc: 0.99 - ETA: 2:01:00 - loss: 0.0270 - acc: 0.99 - ETA: 2:00:58 - loss: 0.0270 - acc: 0.99 - ETA: 2:00:58 - loss: 0.0269 - acc: 0.99 - ETA: 2:00:59 - loss: 0.0269 - acc: 0.99 - ETA: 2:00:59 - loss: 0.0268 - acc: 0.99 - ETA: 2:00:57 - loss: 0.0268 - acc: 0.99 - ETA: 2:00:54 - loss: 0.0268

 37248/181717 [=====>........................] - ETA: 1:53:20 - loss: 0.0214 - acc: 0.99 - ETA: 1:53:17 - loss: 0.0214 - acc: 0.99 - ETA: 1:53:15 - loss: 0.0214 - acc: 0.99 - ETA: 1:53:13 - loss: 0.0213 - acc: 0.99 - ETA: 1:53:10 - loss: 0.0213 - acc: 0.99 - ETA: 1:53:08 - loss: 0.0213 - acc: 0.99 - ETA: 1:53:05 - loss: 0.0213 - acc: 0.99 - ETA: 1:53:04 - loss: 0.0212 - acc: 0.99 - ETA: 1:53:02 - loss: 0.0212 - acc: 0.99 - ETA: 1:53:00 - loss: 0.0212 - acc: 0.99 - ETA: 1:52:57 - loss: 0.0212 - acc: 0.99 - ETA: 1:52:55 - loss: 0.0211 - acc: 0.99 - ETA: 1:52:52 - loss: 0.0211 - acc: 0.99 - ETA: 1:52:50 - loss: 0.0211 - acc: 0.99 - ETA: 1:52:47 - loss: 0.0211 - acc: 0.99 - ETA: 1:52:45 - loss: 0.0211 - acc: 0.99 - ETA: 1:52:42 - loss: 0.0210 - acc: 0.99 - ETA: 1:52:40 - loss: 0.0210 - acc: 0.99 - ETA: 1:52:37 - loss: 0.0210 - acc: 0.99 - ETA: 1:52:35 - loss: 0.0210 - acc: 0.99 - ETA: 1:52:32 - loss: 0.0209 - acc: 0.99 - ETA: 1:52:29 - loss: 0.0209 - acc: 0.99 - ETA: 1:52:26 - loss: 0.0209































Epoch 2/2


  9312/181717 [>.............................] - ETA: 2:15:31 - loss: 0.0022 - acc: 0.99 - ETA: 2:11:55 - loss: 0.0036 - acc: 0.99 - ETA: 2:09:36 - loss: 0.0036 - acc: 0.99 - ETA: 2:08:25 - loss: 0.0032 - acc: 0.99 - ETA: 2:07:48 - loss: 0.0030 - acc: 0.99 - ETA: 2:07:47 - loss: 0.0029 - acc: 0.99 - ETA: 2:07:15 - loss: 0.0028 - acc: 0.99 - ETA: 2:07:55 - loss: 0.0028 - acc: 0.99 - ETA: 2:08:33 - loss: 0.0026 - acc: 0.99 - ETA: 2:10:17 - loss: 0.0025 - acc: 0.99 - ETA: 2:10:49 - loss: 0.0024 - acc: 0.99 - ETA: 2:10:37 - loss: 0.0023 - acc: 0.99 - ETA: 2:11:57 - loss: 0.0023 - acc: 0.99 - ETA: 2:12:20 - loss: 0.0022 - acc: 0.99 - ETA: 2:13:00 - loss: 0.0023 - acc: 0.99 - ETA: 2:13:29 - loss: 0.0023 - acc: 0.99 - ETA: 2:14:54 - loss: 0.0023 - acc: 0.99 - ETA: 2:14:22 - loss: 0.0022 - acc: 0.99 - ETA: 2:13:54 - loss: 0.0023 - acc: 0.99 - ETA: 2:13:26 - loss: 0.0023 - acc: 0.99 - ETA: 2:12:56 - loss: 0.0022 - acc: 0.99 - ETA: 2:12:43 - loss: 0.0023 - acc: 0.99 - ETA: 2:12:35 - loss: 0.0023

 18624/181717 [==>...........................] - ETA: 2:02:52 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:49 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:46 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:44 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:40 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:36 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:34 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:35 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:38 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:39 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:37 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:34 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:32 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:28 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:25 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:22 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:20 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:18 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:15 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:13 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:10 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:07 - loss: 0.0028 - acc: 0.99 - ETA: 2:02:05 - loss: 0.0028

 27936/181717 [===>..........................] - ETA: 1:57:34 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:32 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:29 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:28 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:27 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:25 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:24 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:26 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:26 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:23 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:21 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:19 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:16 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:14 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:11 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:09 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:08 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:07 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:06 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:07 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:07 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:07 - loss: 0.0029 - acc: 0.99 - ETA: 1:57:07 - loss: 0.0029

 37248/181717 [=====>........................] - ETA: 1:51:52 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:51 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:50 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:47 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:45 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:43 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:41 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:38 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:36 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:34 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:31 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:29 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:26 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:24 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:22 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:19 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:16 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:14 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:11 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:10 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:08 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:05 - loss: 0.0029 - acc: 0.99 - ETA: 1:51:03 - loss: 0.0029

































In [12]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 180)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 180, 180)          46296540  
_________________________________________________________________
dropout_1 (Dropout)          (None, 180, 180)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 180, 300)          397200    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 180, 3)            903       
Total params: 46,694,643
Trainable params: 46,694,643
Non-trainable params: 0
_________________________________________________________________


## Prediction on Test Set

In [13]:
# Predicting on trained model
pred = model.predict(X_test)
print("Predicted Probabilities on Test Set:\n",pred.shape)
# taking tag class with maximum probability
pred_index = np.argmax(pred, axis=-1)
print("Predicted tag indices: \n",pred_index.shape)

Predicted Probabilities on Test Set:
 (125840, 180, 3)
Predicted tag indices: 
 (125840, 180)


In [14]:
# Flatten both the features and predicted tags for submission
ids,tagids = X_test.flatten().tolist(), pred_index.flatten().tolist()
 
# converting each word indices back to words
words_test = [words[ind].decode('utf-8') for ind in ids]
# converting each predicted tag indices back to tags
tags_test = [tags[ind] for ind in tagids]
print("Length of words in Padded test set:",len(words_test))
print("Length of tags in Padded test set:",len(tags_test))
print("\nCheck few of words and predicted tags:\n",words_test[:10],tags_test[:10])

Length of words in Padded test set: 22651200
Length of tags in Padded test set: 22651200

Check few of words and predicted tags:
 ['CCCVA', ',', 'MANOVA', ',', 'my', 'black', 'hen', '.', 'ENDPAD', 'ENDPAD'] ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


## Prepare Submission Data

In [15]:
'''
The task here is to convert padded fixed 180 dimensional predicted tags
to variable length test set sentences.
1. If the sentences have word length shorter than 180,
   then predcited tags are skipped.
2. If the sentences have word length longer than 180,
   then all extra words are tagged with "O" tag class.
'''
 
i=0
j=1
predicted_tags = []
counts = test_data.groupby('Sent_ID')['id'].count().tolist()
 
for index,count in enumerate(counts):
    if count <= MAXLEN:
        predicted_tags.append(tags_test[i:i+count])
    else:
        predicted_tags.append(tags_test[i:i+MAXLEN])
        out = ['O']*(count-MAXLEN)
        predicted_tags.append(out)
 
    i=j*MAXLEN
    j=j+1
 
predictions_final = [item for sublist in predicted_tags for item in sublist]
print("\nLength of test set words and predicted tags should match.")
print("Length of predicted tags:",len(predictions_final))
print("Length of words in test set:",test_data['Word'].size)


Length of test set words and predicted tags should match.
Length of predicted tags: 2994463
Length of words in test set: 2994463


## Writing the Submission File

In [16]:
df = pd.read_csv("sample_submission.csv", encoding="latin1")
# Creating a dataframe in the submission format
df_results = pd.DataFrame({'id':df['id'],'Sent_ID':df['Sent_ID'],'tag':predictions_final})
# writing csv submission file
df_results.to_csv('submission_final.csv',sep=",", index=None)
df_results.head()

Unnamed: 0,id,Sent_ID,tag
0,4543834,191283,O
1,4543835,191283,O
2,4543836,191283,O
3,4543837,191283,O
4,4543838,191283,O
