In [1]:
import pandas as pd
import numpy as np
from collections import Iterable
from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI, conlltags2tree
from nltk.stem.snowball import SnowballStemmer
from nltk import pos_tag, word_tokenize
from spacy import displacy
# import keras.backend.tensorflow_backend as K
import keras.backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras_contrib import losses, metrics
from keras import backend



class EntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        
        return iob_triplets

    
def to_conll_iob(annotated_sentence):
    """
    `annotated_sentence` = list of triplets [(w1, t1, iob1), ...]
    Transform a pseudo-IOB notation: O, PERSON, PERSON, O, O, LOCATION, O
    to proper IOB notation: O, B-PERSON, I-PERSON, O, O, B-LOCATION, O
    """
    proper_iob_tokens = []
    for idx, annotated_token in enumerate(annotated_sentence):
        tag, word, ner = annotated_token

        if ner != 'O':
            if idx == 0:
                ner = "B-" + ner
            elif annotated_sentence[idx - 1][2] == ner:
                ner = "I-" + ner
            else:
                ner = "B-" + ner
        proper_iob_tokens.append((tag, word, ner))
    return proper_iob_tokens


def features(tokens, index, history):
    """
    'tokens'  = a POS-tagged sentence [(w1, t1), ...]
    'index'   = the index of the token we want to extract features for
    'history' = the previous predicted IOB tags
    """

    # init the stemmer
    stemmer = SnowballStemmer('english')

    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'),
                                                                                    ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)

    # shift the index with 2, to accommodate the padding
    index += 2

    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]
    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all([True for c in word if c in string.ascii_lowercase])

    allcaps = word == word.capitalize()
    capitalized = word[0] in string.ascii_uppercase

    prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase

    nextallcaps = prevword == prevword.capitalize()
    nextcapitalized = prevword[0] in string.ascii_uppercase

    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,

        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,

        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,

        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,

        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,

        'prev-iob': previob,

        'contains-dash': contains_dash,
        'contains-dot': contains_dot,

        'all-caps': allcaps,
        'capitalized': capitalized,

        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,

        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }

Using TensorFlow backend.


In [2]:
data_file_path = '/Users/TonY/Desktop/sfl/Dataset_01-29-2019.txt'
data = pd.read_csv(data_file_path, delimiter='\t',
                   header=0, index_col=0,
                   names=['Sentence', 'Word', 'POS', 'Tag'],
                   encoding='unicode_escape')

# Check datatype
data.dtypes
# Change float to int
data.Sentence = data.Sentence.astype(int)
# Check NA
data.isna().any()

Sentence    False
Word        False
POS         False
Tag         False
dtype: bool

In [43]:
sentence_arr = data.Sentence.unique()
token = []
for sentence in sentence_arr:
    temp = data[data['Sentence'] == sentence]
    curr_index = temp.index
    one_sentence = []
    for index in curr_index:
        one_sentence.append((temp.Word[index], temp.POS[index], temp.Tag[index]))
    token.append(one_sentence)
    
sentences = token


In [36]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words)
tags = list(set(data["Tag"].values))
n_tags = len(tags)

max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
# print(word2idx)
tag2idx = {t: i for i, t in enumerate(tags)}
# print(tag2idx)

{'I-org': 0, 'I-eve': 1, 'I-per': 2, 'B-eve': 3, 'B-per': 4, 'B-gpe': 5, 'I-tim': 6, 'B-nat': 7, 'I-nat': 8, 'I-gpe': 9, 'B-geo': 10, 'B-art': 11, 'I-geo': 12, 'B-tim': 13, 'O': 14, 'I-art': 15, 'B-org': 16}


In [5]:
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words-1)
X

array([[4818, 4742, 1671, ..., 8766, 8766, 8766],
       [3090, 4742, 4903, ..., 8766, 8766, 8766],
       [7732, 6988, 1776, ..., 8766, 8766, 8766],
       ...,
       [7562, 3348, 4458, ..., 8766, 8766, 8766],
       [5868, 6390, 1153, ..., 8766, 8766, 8766],
       [7562, 4165, 7086, ..., 8766, 8766, 8766]], dtype=int32)

In [44]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]
# print(y)
y = [to_categorical(i, num_classes=n_tags) for i in y]

In [46]:
print(X_tr.shape)
print(X_tr)

(2699, 75)
[[3786 8707 7086 ... 8766 8766 8766]
 [7562 1910  519 ... 8766 8766 8766]
 [4395 2703 6121 ... 8766 8766 8766]
 ...
 [7973 7510 7847 ... 8766 8766 8766]
 [ 557 2327 2612 ... 8766 8766 8766]
 [2162  481 6106 ... 8766 8766 8766]]


In [42]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

np.array(y_tr)

array([array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [19]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output

In [20]:
model = Model(input, out)
model.compile(optimizer="rmsprop", loss=losses.crf_loss, metrics=[metrics.crf_accuracy])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 75)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 75, 20)            175360    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 75, 100)           28400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 75, 50)            5050      
_________________________________________________________________
crf_1 (CRF)                  (None, 75, 17)            1190      
Total params: 210,000
Trainable params: 210,000
Non-trainable params: 0
_________________________________________________________________


In [28]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5, validation_split=0.1, verbose=1)

Train on 2429 samples, validate on 270 samples
Epoch 1/5


ValueError: setting an array element with a sequence.