In [1]:
import numpy as np 
import pandas as pd 
import keras
print(keras.__version__)

from math import nan
from keras.callbacks import ModelCheckpoint

from keras_contrib.layers import CRF



Using TensorFlow backend.


2.3.1


In [2]:
dframe = pd.read_csv("ner_small.csv", encoding = "ISO-8859-1", error_bad_lines=False)

In [3]:
dataset=dframe.drop(['Unnamed: 0', 'lemma', 'next-lemma', 'next-next-lemma', 'next-next-pos',
       'next-next-shape', 'next-next-word', 'next-pos', 'next-shape',
       'next-word', 'prev-iob', 'prev-lemma', 'prev-pos',
       'prev-prev-iob', 'prev-prev-lemma', 'prev-prev-pos', 'prev-prev-shape',
       'prev-prev-word', 'prev-shape', 'prev-word',"pos"],axis=1)

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15591 entries, 0 to 15590
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   sentence_idx  15591 non-null  int64 
 1   shape         15591 non-null  object
 2   word          15591 non-null  object
 3   tag           15591 non-null  object
dtypes: int64(1), object(3)
memory usage: 487.3+ KB


In [5]:
dataset.head()

Unnamed: 0,sentence_idx,shape,word,tag
0,1,capitalized,Thousands,O
1,1,lowercase,of,O
2,1,lowercase,demonstrators,O
3,1,lowercase,have,O
4,1,lowercase,marched,O


In [6]:
dataset=dataset.drop(['shape'],axis=1)

In [7]:
dataset.head()

Unnamed: 0,sentence_idx,word,tag
0,1,Thousands,O
1,1,of,O
2,1,demonstrators,O
3,1,have,O
4,1,marched,O


In [8]:
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["word"].values.tolist(),
                                                        s["tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [9]:
getter = SentenceGetter(dataset)

In [10]:
sentences = getter.sentences

In [11]:
print(sentences[5])

[('The', 'O'), ('party', 'O'), ('is', 'O'), ('divided', 'O'), ('over', 'O'), ('Britain', 'B-gpe'), ("'s", 'O'), ('participation', 'O'), ('in', 'O'), ('the', 'O'), ('Iraq', 'B-geo'), ('conflict', 'O'), ('and', 'O'), ('the', 'O'), ('continued', 'O'), ('deployment', 'O'), ('of', 'O'), ('8,500', 'O'), ('British', 'B-gpe'), ('troops', 'O'), ('in', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')]


In [12]:
maxlen = max([len(s) for s in sentences])
print ('Maximum sequence length:', maxlen)

Maximum sequence length: 62


In [13]:
words = list(set(dataset["word"].values))
words.append("ENDPAD")

In [14]:
n_words = len(words); n_words

3691

In [15]:
tags = []
for tag in set(dataset["tag"].values):
    if tag is nan or isinstance(tag, float):
        tags.append('unk')
    else:
        tags.append(tag)
print(tags)

['I-tim', 'I-eve', 'B-per', 'B-geo', 'B-org', 'B-tim', 'I-geo', 'O', 'I-org', 'I-per', 'I-nat', 'I-art', 'B-nat', 'I-gpe', 'B-art', 'B-gpe', 'B-eve']


In [16]:
n_tags = len(tags); n_tags

17

In [17]:
from future.utils import iteritems
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
idx2tag = {v: k for k, v in iteritems(tag2idx)}

In [18]:
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in sentences]

In [19]:
np.array(X).shape


(702,)

In [20]:
X = pad_sequences(maxlen=140, sequences=X, padding="post",value=n_words - 1)

In [21]:
y_idx = [[tag2idx[w[1]] for w in s] for s in sentences]
print(sentences[100])
print(y_idx[100])

[('The', 'O'), ('Pakistani', 'B-gpe'), ('military', 'O'), ('launched', 'O'), ('its', 'O'), ('offensive', 'O'), ('in', 'O'), ('Orakzai', 'B-geo'), ('to', 'O'), ('hunt', 'O'), ('Taliban', 'B-org'), ('insurgents', 'O'), ('.', 'O')]
[7, 15, 7, 7, 7, 7, 7, 3, 7, 7, 4, 7, 7]


In [22]:
y = pad_sequences(maxlen=140, sequences=y_idx, padding="post", value=tag2idx["O"])
print(y_idx[100])

[7, 15, 7, 7, 7, 7, 7, 3, 7, 7, 4, 7, 7]


In [23]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags) for i in y]

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [25]:
X_train.shape

(561, 140)

In [26]:
X_train[0]

array([3058, 1201, 1811, 2013, 3213, 2521, 1626, 3550, 1651, 3104, 2339,
        749,  660, 3078, 3550, 2521, 1596, 1180, 3524, 2521, 2506, 3550,
       1772, 3549, 3047, 3550, 2821, 3488, 3079, 2354, 3058, 1979, 3690,
       3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690,
       3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690,
       3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690,
       3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690,
       3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690,
       3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690,
       3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690,
       3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690,
       3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690,
       3690, 3690, 3690, 3690, 3690, 3690, 3690, 3690])

In [27]:
y_train[1],y_train[1].shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 (140, 17))

In [28]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k


In [29]:
input = Input(shape=(140,))
word_embedding_size = 300
model = Embedding(input_dim=n_words, output_dim=word_embedding_size, input_length=140)(input)
model = Bidirectional(LSTM(units=word_embedding_size, 
                           return_sequences=True, 
                           dropout=0.5, 
                           recurrent_dropout=0.5, 
                           kernel_initializer=k.initializers.he_normal()))(model)
model = LSTM(units=word_embedding_size * 2, 
             return_sequences=True, 
             dropout=0.5, 
             recurrent_dropout=0.5, 
             kernel_initializer=k.initializers.he_normal())(model)
model = TimeDistributed(Dense(n_tags, activation="relu"))(model)  # previously softmax output layer

crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output


In [30]:
model = Model(input, out)


In [31]:
adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)
#model.compile(optimizer=adam, loss="categorical_crossentropy", metrics=["accuracy"])
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])




In [32]:
history = model.fit(X_train, np.array(y_train), batch_size=256, epochs=3, validation_split=0.2, verbose=1)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 448 samples, validate on 113 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [33]:
p = model.predict(np.array([X_test[0]]))
p = np.argmax(p, axis=-1)
print(p)

[[7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
  7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
  7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
  7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]]


In [34]:
gt = np.argmax(y_test[0], axis=-1)
print(gt)
print("{:14}: ({:5}): {}".format("Word", "True", "Pred"))
for idx, (w,pred) in enumerate(zip(X_test[0],p[0])):
    #
    print("{:14}: ({:5}): {}".format(words[w],idx2tag[gt[idx]],tags[pred]))


[ 7  7 15  7  7  7  7  7  3  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7]
Word          : (True ): Pred
However       : (O    ): O
,             : (O    ): O
U.S.          : (B-gpe): O
forces        : (O    ): O
have          : (O    ): O
yet           : (O    ): O
to            : (O    ): O
enter         : (O    ): O
Somalia       : (B-geo): O
,             : (O    ): O
which         : (O    ): O
has           : (O    ): O
been          : (O    ): O
without       : (O    ): O
a             : (O    ): O
functioning   : (O    ): O
central       : (O    ): O
government    : (O    ): O
for           : (O    ): O
more          : (O    ): O
tha