In [9]:
!nvidia-smi

Sun Jan 24 05:54:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
# Importing Essential Libraries
import pandas as pd 
import numpy as np 
from itertools import chain

from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import Optimizer

import tensorflow 
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [2]:
# Read the token-level NER dataset (one row per word) and preview the first rows.
data = pd.read_csv(
    '/content/drive/MyDrive/Named Entity Recognition/ner_dataset.csv',
    encoding='unicode_escape',
)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
# Extracting the mappings that are required to train the neural network
def get_dict_map(data, token_or_tag):
    """Build forward and reverse index lookups for words or tags.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a 'Word' column and a 'Tag' column.
    token_or_tag : str
        'token' builds the vocabulary from the 'Word' column; any other
        value builds it from the 'Tag' column.

    Returns
    -------
    tuple of (dict, dict)
        ``tok2idx`` maps each distinct value to an integer index and
        ``idx2tok`` is the inverse mapping.
    """
    column = 'Word' if token_or_tag == 'token' else 'Tag'

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # indices are the same on every run. Iterating a set() of strings depends
    # on per-process hash randomisation, which made the mapping (and anything
    # trained on it) change between kernel restarts.
    vocab = list(dict.fromkeys(data[column].to_list()))

    idx2tok = dict(enumerate(vocab))
    tok2idx = {tok: idx for idx, tok in idx2tok.items()}

    return tok2idx, idx2tok

# Build the lookup tables in both directions: value -> index and index -> value,
# once for the words and once for the tags.
token2idx, idx2token= get_dict_map(data, 'token')
tag2idx, idx2tag= get_dict_map(data, 'tag')

In [4]:
# Transforming the columns to extract the sequential data

# Encode every word and tag with its integer index.
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)

# "Sentence #" is only filled on the first row of each sentence; forward-fill
# so every row carries its sentence label. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill').
data_fillna = data.ffill(axis=0)

# Groupby and collect columns: one row per sentence, each cell holding a list.
# The columns are selected with a list; selecting several columns with a bare
# tuple is deprecated and rejected by pandas >= 2.0.
data_group = (
    data_fillna
    .groupby(["Sentence #"], as_index=False)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']]
    .agg(lambda x: list(x))
)

  if __name__ == '__main__':


In [5]:
def get_pad_train_test(data_group, data):
    """Pad the encoded sentences and split them into train/validation/test sets.

    Parameters
    ----------
    data_group : pandas.DataFrame
        One row per sentence with list-valued 'Word_idx' and 'Tag_idx' columns.
    data : pandas.DataFrame
        Token-level frame; its 'Word' column determines the vocabulary size.

    Returns
    -------
    tuple
        ``(train_tokens, val_tokens, test_tokens, train_tags, val_tags,
        test_tags)``. Token entries are padded int32 sequences; tag entries
        are one-hot matrices of shape (maxlen, n_tags).

    Notes
    -----
    Reads the module-level ``tag2idx`` mapping for the number of tags and for
    the index of the "O" tag used to pad the label sequences.
    """
    # Vocabulary size; real word indices run from 0 to n_token - 1.
    n_token = len(set(data["Word"].to_list()))

    # Pad tokens (X var). The pad id is n_token, the first index not used by a
    # real word. Padding with n_token - 1 made padding indistinguishable from
    # the word that owns that index. The embedding layer must therefore be
    # built with input_dim >= n_token + 1.
    tokens = data_group['Word_idx'].to_list()
    maxlen = max(len(s) for s in tokens)
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token)

    # Pad Tags (y var) with the "O" tag and convert them to one hot encoding
    tags = data_group['Tag_idx'].to_list()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    # Splitting into train, test and validation sets: 10% is held out for test,
    # then 25% of the remainder becomes the validation set.
    tokens_, test_tokens, tags_, test_tags = train_test_split(
        pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(
        tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2020)

    # The second entry reports the train *tags*; it used to repeat the tokens.
    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tags length:', len(train_tags),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

# Pad the grouped sentences and split them into train / validation / test sets.
train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test(data_group, data)

train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


In [6]:
# Dimensions used to build the network:
#   n_tags       - number of distinct tags, i.e. the width of the output layer
#   output_dim   - embedding width, also reused as the LSTM unit count
#   input_length - length of the longest sentence in the grouped data
#   input_dim    - embedding vocabulary size: distinct words plus one extra index
n_tags = len(tag2idx)
output_dim = 64
input_length = max(map(len, data_group['Word_idx'].to_list()))
input_dim = len(set(data['Word'].to_list())) + 1

In [7]:
# Creating a function that builds and compiles the model and prints a summary of each layer

def get_bilstm_lstm_model():
    """Build and compile the Embedding -> BiLSTM -> LSTM -> Dense tagger.

    Reads the module-level ``input_dim``, ``output_dim``, ``input_length`` and
    ``n_tags`` values, prints the layer summary, and returns the model.

    Returns
    -------
    tensorflow.keras.Sequential
        The compiled model, emitting one ``n_tags``-way probability
        distribution per time step.
    """
    model = Sequential([
        # Embedding layer: word index -> dense vector of size output_dim
        Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),

        # Bidirectional LSTM; forward and backward outputs are concatenated
        Bidirectional(
            LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            merge_mode='concat',
        ),

        # Second, unidirectional LSTM over the full sequence
        LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5),

        # Per-time-step classifier. softmax is required here because
        # categorical_crossentropy expects a probability distribution over the
        # tags; relu yields unnormalised scores that can be zero for every tag.
        TimeDistributed(Dense(n_tags, activation="softmax")),
    ])

    # Compile model with the default Adam optimizer settings
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [8]:
# Function to train the model

def train_model(X, y, model, epochs=25, batch_size=1000, validation_split=0.2):
    """Fit ``model`` on ``(X, y)`` and return the training loss of every epoch.

    Parameters
    ----------
    X, y
        Training inputs and targets, passed straight through to ``model.fit``.
    model
        Object exposing a Keras-style ``fit`` method whose return value has a
        ``history`` dict.
    epochs : int, optional
        Number of training epochs (default 25, the previously hard-coded value).
    batch_size : int, optional
        Samples per gradient update (default 1000).
    validation_split : float, optional
        Fraction of the data handed to ``fit`` as validation data (default 0.2).

    Returns
    -------
    list
        One training-loss value per epoch, in order.
    """
    # A single fit() call covers all epochs; re-entering fit() once per epoch
    # only repeated the setup work and labelled every log line "Epoch 1/1".
    hist = model.fit(
        X,
        y,
        batch_size=batch_size,
        verbose=1,
        epochs=epochs,
        validation_split=validation_split,
    )
    # No 'loss' entry is recorded when zero epochs were requested.
    return list(hist.history.get('loss', []))

In [10]:
# Main driver: build the network, draw its architecture, then train it and
# keep the per-epoch training loss in the `results` frame.
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results = pd.DataFrame({
    'with_add_lstm': train_model(train_tokens, np.array(train_tags), model_bilstm_lstm),
})

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 104, 64)           2251456   
_________________________________________________________________
bidirectional (Bidirectional (None, 104, 128)          66048     
_________________________________________________________________
lstm_1 (LSTM)                (None, 104, 64)           49408     
_________________________________________________________________
time_distributed (TimeDistri (None, 104, 17)           1105      
Total params: 2,368,017
Trainable params: 2,368,017
Non-trainable params: 0
_________________________________________________________________


In [18]:
import spacy
import en_core_web_sm
from spacy import displacy 
# Load the 'en_core_web_sm' spaCy pipeline by name.
nlp= spacy.load('en_core_web_sm')
# Run the pipeline over a short sample passage (the result is presumably a spaCy Doc).
text= nlp('Hi my name is Vaibhav Verma. \n I am a Junior Data Scientist. \n I work as an intern at Ineuron. ')
# Render the processed passage with the entity ('ent') visualiser, inline in the notebook.
displacy.render(text, style='ent', jupyter=True)