# BiLSTM

Import pandas + numpy and load data

In [0]:
import pandas as pd
import numpy as np

# Load the dataset and keep only the four columns the rest of the notebook uses.
data = pd.read_csv(
    "Dataset.csv",
    encoding="latin1",
)[["Sentence #", "Word", "POS", "Tag"]]

In [0]:
%tensorflow_version 1.x

In [0]:
# Seed NumPy's global random generator with 1.
from numpy.random import seed
seed(1)
# Seed TensorFlow with 1.
# NOTE(review): a top-level `tensorflow.set_random_seed` is presumably the
# TF 1.x API selected by the `%tensorflow_version 1.x` cell — confirm before
# running on a newer TensorFlow.
from tensorflow import set_random_seed
set_random_seed(1)

Make a list of all words in the data

In [0]:
# Vocabulary: every distinct value of the "Word" column.
# A bare list(set(...)) has an iteration order that can change between
# interpreter runs (string hashing is randomised), which would give every word
# a different index on each run. Sorting makes the word -> index mapping
# reproducible. key=str keeps the sort from failing if the column holds
# non-string values (e.g. NaN for cells pandas parsed as missing).
words = sorted(set(data["Word"].values), key=str)
# Padding token; appended last so that its index is len(words) - 1.
words.append("ENDPAD")

In [0]:
# Vocabulary size; this count includes the "ENDPAD" entry appended to `words`.
n_words = len(words)

In [0]:
# Distinct values of the "Tag" column.
# Sorted so that the tag -> index mapping is the same on every run: the
# iteration order of a bare set of strings can change between interpreter
# runs, and later cells depend on specific tag indices (the fold
# stratification tests for tag index 0). key=str tolerates non-string values.
tags = sorted(set(data["Tag"].values), key=str)

In [0]:
# Number of distinct tag values; used as the number of output classes.
n_tags = len(tags)

Sentence getter

In [0]:
class SentenceGetter(object):
    """Group a row-per-token DataFrame into per-sentence lists of triples.

    Each sentence is a list of ``(word, pos, tag)`` tuples built from the
    "Word", "POS" and "Tag" columns of the rows sharing one "Sentence #" value.
    """

    def __init__(self, data):
        """Group ``data`` by its "Sentence #" column.

        Args:
            data: DataFrame with "Sentence #", "Word", "POS" and "Tag" columns.
        """
        # Number used by get_next() to build the next "Sentence: N" key.
        self.n_sent = 1
        self.data = data
        # Not read anywhere in this class; kept so existing callers still see it.
        self.empty = False
        # One entry per sentence, indexed by the "Sentence #" value.
        self.grouped = self.data.groupby("Sentence #").apply(self._to_triples)
        # All sentences, in the order the grouped result iterates.
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_triples(group):
        """Return the rows of one sentence group as (word, pos, tag) tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence, or ``None`` when there is none.

        Sentences are looked up by the key ``"Sentence: N"`` where N starts at
        1 and is advanced by one after every successful call.

        Returns:
            The list of (word, pos, tag) tuples for the sentence, or ``None``
            if no group has the expected key.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # (including KeyboardInterrupt) is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [0]:
# Group the loaded rows into sentences.
getter = SentenceGetter(data)

In [0]:
# List of sentences, each a list of (word, POS, tag) tuples.
sentences = getter.sentences

Enumerate and pad sentences

In [0]:
# Sequence length passed to pad_sequences and used as the model's input length.
max_len = 75
# Lookup tables: word / tag -> its position in `words` / `tags`.
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))

In [0]:
from keras.preprocessing.sequence import pad_sequences
# Encode every sentence as a list of word indices (X) and of tag indices (y).
X = [[word2idx[word] for word, _pos, _tag in sent] for sent in sentences]
y = [[tag2idx[tag] for _word, _pos, tag in sent] for sent in sentences]

In [0]:
# Pad at the end ("post") up to max_len:
#   - word sequences with the last vocabulary index (n_words - 1),
#   - tag sequences with the index of the "O" tag.
X = pad_sequences(
    sequences=X,
    maxlen=max_len,
    padding="post",
    value=n_words - 1,
)
y = pad_sequences(
    sequences=y,
    maxlen=max_len,
    padding="post",
    value=tag2idx["O"],
)

Transform to categorical

In [0]:
from keras.utils import to_categorical
# One-hot encode each padded tag sequence; y remains a list with one array
# per sentence.
y = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y]

In [0]:
!pip install seqeval

In [0]:
# Inverse of tag2idx: integer index -> tag string.
idx2tag = {idx: tag for tag, idx in tag2idx.items()}


def pred2label(pred):
    """Convert per-token score vectors into tag strings.

    Args:
        pred: iterable of sequences; every element of a sequence is a vector
            whose argmax is looked up in ``idx2tag``.

    Returns:
        A list holding, for each input sequence, the list of its tag strings.
        Any "PAD" substring in a tag name is replaced by "O".
    """
    return [
        [idx2tag[np.argmax(scores)].replace("PAD", "O") for scores in sequence]
        for sequence in pred
    ]
    


The model

In [0]:
import os
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import StratifiedKFold
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from nervaluate import Evaluator

# Cross-validation: 5 shuffled, stratified folds with a fixed split seed.
skf = StratifiedKFold(5, shuffle=True, random_state=42)

# Per-fold nervaluate results and the out-of-sample (oos) arrays of each fold.
resultslist = []
oos_y = []
oos_pred = []
oos_x = []

# Stack the per-sentence one-hot matrices once instead of rebuilding the
# array for every fold.
y_arr = np.array(y)

# Stratification label per sentence: 1 if any token (padding included) has
# tag index 0, otherwise 0. Which tag that is depends on the order of `tags`.
strat_labels = (y_arr.argmax(axis=-1) == 0).any(axis=1).astype(int)

fold = 0
for train, test in skf.split(X, strat_labels):
    fold += 1
    print(f"Fold #{fold}")

    x_train = X[train]
    y_train = y_arr[train]
    x_test = X[test]
    y_test = y_arr[test]

    # A fresh, untrained model for every fold.
    # `inputs` / `hidden` are used instead of `input` / `model` so the builtin
    # input() is not shadowed and `model` only ever names the Keras Model.
    inputs = Input(shape=(max_len,))
    hidden = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(inputs)
    hidden = Dropout(0.1)(hidden)
    hidden = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(hidden)
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(hidden)  # softmax output layer
    model = Model(inputs, out)
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

    model.fit(x_train, y_train, verbose=0, batch_size=32, epochs=5)

    pred = model.predict(x_test)

    # Decode targets and predictions once; both reports below use them.
    fold_true = pred2label(y_test)
    fold_pred = pred2label(pred)

    print(classification_report(fold_true, fold_pred))
    # NOTE(review): tags=[''] presumably matches labels that carry no entity
    # type after the B/I prefix — confirm against the dataset's tag set.
    evaluator = Evaluator(fold_true, fold_pred, tags=[''], loader='list')
    results, results_per_tag = evaluator.evaluate()
    resultslist.append(results)

    oos_y.append(y_test)
    oos_x.append(x_test)
    oos_pred.append(pred)


# Build the oos prediction list: join the held-out parts of all folds.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
oos_x = np.concatenate(oos_x)


Results

In [0]:
# Calculate the standard deviation (SD) of the scores for the B and I tags

#np.std(np.array(['Fill in the scores']))

In [0]:
# Standard deviation, over the entries of resultslist (one per fold), of the
# precision stored under the 'partial' key. Only 'partial' precision is
# computed here; change the keys to get the figure for another schema or
# metric (the original note also mentions 'exact').

np.std(np.array([r['partial']['precision'] for r in resultslist]))

In [0]:
# Decode the concatenated out-of-sample predictions and targets into tag strings.
pred_labels = pred2label(oos_pred)
test_labels = pred2label(oos_y)

In [0]:
# seqeval report computed on the held-out sentences of all folds combined.
print(classification_report(test_labels, pred_labels))

In [0]:
# nervaluate evaluation on the held-out sentences of all folds combined.
# NOTE(review): tags=[''] presumably matches labels that carry no entity type
# after the B/I prefix — confirm against the dataset's tag set.
evaluator = Evaluator(test_labels, pred_labels, tags= [''], loader='list')
results, results_per_tag = evaluator.evaluate()

In [0]:
# Display the overall results returned by evaluator.evaluate() as the cell output.
results