# Named Entity Recognition - LSTM

---

Created By: Xavier De Carvalho  
Created On: 12/08/2021  
Updated By: N/A  
Updated On: N/A  
Version: NER0.0.01

### Requirements

---

##### Data Set - `ner_dataset`     
[Get the data set from Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus)     

##### Essential Info About Tagged Items:
- geo = Geographical Entity
- org = Organization
- per = Person
- tim = Time Indicator
- art = Artifact
- eve = Event
- nat = Natural Phenomenon

##### Required Hardware     
- GPU     

##### Required Python Packages     
- Numpy
- Pandas
- Scikit-learn
    - Model_Selection
- Matplotlib     
    - PyPlot
- Tensorflow
- Livelossplot

### Install Dependencies If Needed

---

NOTE: This might not be required if you're running your notebook instance in the cloud! 
<br><br>
Delete the cell below if this is the case...

In [None]:
# Import the sys dependency
# import sys
# Install dependencies
# !{sys.executable} -m pip install numpy
# !{sys.executable} -m pip install matplotlib
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install scikit-learn
# !{sys.executable} -m pip install tensorflow
# !{sys.executable} -m pip install livelossplot

### Import Packages

---

In [None]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from livelossplot.tf_keras import PlotLossesCallback
from sklearn.model_selection import train_test_split
# Confirm packages have been imported
print("Packages imported!")

# Create random seed
# Seed both NumPy and TensorFlow: seeding NumPy alone leaves TensorFlow's own
# random generator (used when the Keras layers are built and trained)
# unseeded, so repeated runs would not start from the same state.
# NOTE(review): this does not by itself guarantee bit-identical results on GPU.
np.random.seed(0)
tf.random.set_seed(0)
print("Random seed created!")

# Set pyplot style
plt.style.use("ggplot")
print("Pyplot style selected!")

# Tensorflow details: report the installed version and any visible GPUs
print(
    f'''
    Tensorflow-
        Tensorflow version:     {tf.__version__}
        GPU detected:           {tf.config.list_physical_devices('GPU')}
    '''
)

### Import Dataset

---

In [None]:
# Read from CSV
data = pd.read_csv('ner_dataset.csv', encoding='latin1')
# Fill null values by carrying the last non-null value in each column forward
# (presumably the "Sentence #" label that is only present on some rows --
# confirm against the CSV).
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases.
data = data.ffill()
# Show first (n) values in the dataset
data.head(20)

In [None]:
# Count the distinct words and distinct tags, then report both totals
unique_word_count = data['Word'].nunique()
unique_tag_count = data['Tag'].nunique()
totals_report = f'''
    Totals-
        Unique words in corpus:     {unique_word_count}
        Unique tags in corpus:      {unique_tag_count}
    '''
print(totals_report)

### Add Padding Token to Words

---

In [None]:
# Build the vocabulary in a deterministic order. Iterating a set of strings
# yields a different order in each interpreter session, which would give every
# run a different word -> index assignment. Sorting fixes the order;
# key=str keeps the sort from failing if a non-string value is present.
words = sorted(set(data["Word"].values), key=str)
# Add a padding token to the end of the vocabulary
words.append("ENDPAD")
# Get number of words in set (includes the ENDPAD entry)
num_words = len(words)

# Create tags list, sorted for the same reproducibility reason
tags = sorted(set(data["Tag"].values), key=str)
# Get number of tags in set
num_tags = len(tags)

# Verify ENDPAD is appended to Words and that Tags has not changed
print(
    f'''
        Number of words:    {num_words}
        Number of tags:     {num_tags}
    '''
)

### Retrieve sentences and their corresponding tags

---

In [None]:
# Retrieve sentences and their corresponding tags
class SentenceGetter:
    """Group a token-level DataFrame into per-sentence lists of tokens.

    The DataFrame must provide the columns "Sentence #", "Word", "POS" and
    "Tag", one row per token.

    Attributes:
        n_sent: Counter initialised to 1; not read anywhere in this class.
        data: The DataFrame passed to the constructor.
        grouped: Series indexed by "Sentence #" whose values are lists of
            (word, pos, tag) tuples for that sentence.
        sentences: The values of ``grouped`` as a plain list, in the order
            produced by the groupby (sorted by the "Sentence #" key, which
            is the pandas default).
    """

    def __init__(self, data):
        """Build the per-sentence token lists from ``data``."""
        self.n_sent = 1
        self.data = data
        # Select the three token columns explicitly so apply() never operates
        # on the grouping column; newer pandas versions warn about that.
        self.grouped = (
            self.data
            .groupby("Sentence #")[["Word", "POS", "Tag"]]
            .apply(self._collect_tokens)
        )
        self.sentences = list(self.grouped)

    @staticmethod
    def _collect_tokens(group):
        """Return the rows of one sentence as a list of (word, pos, tag) tuples."""
        return list(
            zip(
                group["Word"].values.tolist(),
                group["POS"].values.tolist(),
                group["Tag"].values.tolist(),
            )
        )

In [None]:
# Initialize getter: groups the token rows of `data` by their "Sentence #" value
getter = SentenceGetter(data)
# Get Sentences: one entry per sentence, each a list of (word, POS, tag) tuples
sentences = getter.sentences

In [None]:
# Show first sentence as its list of (word, POS, tag) tuples
sentences[0]

### Define mappings between Sentences and Tags

---

In [None]:
# Map every vocabulary entry to a unique integer id, counting from 1
# (so id 0 is never assigned to any entry of `words`)
word2idx = {word: position for position, word in enumerate(words, start=1)}
# Map every tag to a unique integer id, counting from 0
tag2idx = dict(zip(tags, range(len(tags))))

In [None]:
# Validate word and tag indexes
# word2idx, tag2idx

### Pad Input Sentences

---

In [None]:
# Plot the distribution of sentence lengths (tokens per sentence) in 50 bins
sentence_lengths = list(map(len, sentences))
plt.hist(sentence_lengths, bins=50)
plt.show()

In [None]:
# Set the max length (every sentence is padded or truncated to this many tokens)
max_len = 50

# Index written into padded word positions.
# word2idx numbers the vocabulary from 1, so 0 is the only index below
# num_words that no real word uses. The previous pad value, num_words-1, is
# the index word2idx assigns to the last real word (ENDPAD itself maps to
# num_words), which made padding indistinguishable from that word.
# 0 also decodes to "ENDPAD" through words[w-1], i.e. words[-1].
pad_word_idx = 0

# Create feature matrix
X = [[word2idx[w[0]] for w in s] for s in sentences] # Numerical representation of our words
X = pad_sequences(
    maxlen=max_len,
    sequences=X,
    padding='post',
    value=pad_word_idx
)

# Create target vector; padded positions are labelled with the "O" tag
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(
    maxlen=max_len,
    sequences=y,
    padding='post',
    value=tag2idx["O"]
)
# One-hot encode the tag ids of each padded sentence
y = [to_categorical(i, num_classes=num_tags) for i in y]

### Create Train/Test Splits

---

In [None]:
# Hold out one tenth of the sentences as the test set; the fixed random_state
# makes the split repeatable
split_options = {"test_size": 1/10, "random_state": 1}
x_train, x_test, y_train, y_test = train_test_split(X, y, **split_options)

### Build Bidirectional LSTM Model

---

In [None]:
# Word embeddings
input_word = Input(shape=(max_len,))
# Size of each word vector. It is kept equal to max_len, as before, but the two
# are independent hyperparameters, so it gets its own name.
embedding_dim = max_len
# input_length is omitted: the sequence length is already fixed by
# Input(shape=(max_len,)), and the argument is deprecated in Keras 3.
embedded = Embedding(input_dim=num_words, output_dim=embedding_dim)(input_word)

# Spatial dropout layer
regularized = SpatialDropout1D(0.1)(embedded)

# Bidirectional LSTM; return_sequences=True keeps one output per time step
# NOTE(review): per the Keras LSTM docs a non-zero recurrent_dropout rules out
# the cuDNN kernel -- confirm the slower GPU path is acceptable.
encoded = Bidirectional(
    LSTM(
        units=100,
        return_sequences=True,
        recurrent_dropout=0.1
    )
)(regularized)

# Apply dense layer to each time step (softmax over the num_tags classes)
out = TimeDistributed(Dense(num_tags, activation='softmax'))(encoded)

# Combine layers
model = Model(input_word, out)

# Model Summary
model.summary()

### Compile the LSTM Model

---

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# and accuracy as the reported metric
compile_settings = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_settings)

### Train the model

---

In [None]:
# Set early stopping
# restore_best_weights=True: with patience=1 training stops one epoch after
# the best val_accuracy, so without restoring, the model evaluated afterwards
# would carry the weights of that worse final epoch.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=1, # Increase this value with higher epochs
    verbose=0,
    mode='max',
    restore_best_weights=True
) # Stop early if the model is not improving with each new epoch

# Create callbacks list
callbacks = [PlotLossesCallback(), early_stopping] # PlotLossesCallback lets us view the model updates live in the notebook

# Start training the model; validation_split takes 2/10 of the training data
# for validation
history = model.fit(
    x_train,
    np.array(y_train),
    validation_split=2/10,
    batch_size=32, # Can increase this when using more powerful GPUs
    epochs=3, # Avoid hardcoding this unless you need a fast output to test
    verbose=1,
    callbacks=callbacks
)

### Evaluate Model

---

In [None]:
# Score the trained model on the held-out test split
test_targets = np.array(y_test)
model.evaluate(x_test, test_targets)

In [None]:
# Pick one random sentence from the test set
i = np.random.randint(0, x_test.shape[0]) # Random index to get values from test set
p = model.predict(np.array([x_test[i]])) # Per-token class probabilities for a batch of one sentence
p = np.argmax(p, axis=-1) # Most probable tag id per token

# Get True Values
# Only row i is decoded; the whole one-hot test set no longer has to be
# converted and arg-maxed just to read a single sentence.
y_true = np.argmax(y_test[i], axis=-1)

# Create validation table |...words...|...actual...|...prediction...|
print(
    "{:15}{:5}\t {}\n".format("Word", "True", "Pred")
)
print("-"*30)
# words[w-1] undoes the +1 offset used when the word indices were assigned.
# NOTE(review): padded positions are decoded through the same lookup; verify
# that the padding index maps to "ENDPAD" rather than to a real word.
for w, true, pred in zip(x_test[i], y_true, p[0]):
    print("{:15}{:5}\t {}\n".format(words[w-1], tags[true], tags[pred]))