## Import libraries

In [1]:
import numpy as np 
from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate, Activation
from tensorflow.keras.utils import Progbar
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import numpy as np
import random
from keras.preprocessing.sequence import pad_sequences

### Change the number of epochs to 70 according to the paper

In [2]:
epochs = 70
glove_pre = "/datasets/glove.6B.100d.txt"

## Preprocess functions

In [3]:
def readfile(filename):
    '''
    read file
    return format :
    [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ]
    '''
    f = open(filename)
    sentences = []
    sentence = []
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n":
            if len(sentence) > 0:
                sentences.append(sentence)
                sentence = []
            continue
        splits = line.split(' ')
        sentence.append([splits[0],splits[-1]])

    if len(sentence) >0:
        sentences.append(sentence)
        sentence = []

    return sentences

def getCasing(word, caseLookup):   
    casing = 'other'
    
    numDigits = 0
    for char in word:
        if char.isdigit():
            numDigits += 1
            
    digitFraction = numDigits / float(len(word))
    
    if word.isdigit(): #Is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower(): #All lower case
        casing = 'allLower'
    elif word.isupper(): #All upper case
        casing = 'allUpper'
    elif word[0].isupper(): #is a title, initial char upper, then all lower
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'

    return caseLookup[casing]
    
def createBatches(data):
    l = []
    for i in data:
        l.append(len(i[0]))
    l = set(l)
    batches = []
    batch_len = []
    z = 0
    for i in l:
        for batch in data:
            if len(batch[0]) == i:
                batches.append(batch)
                z += 1
        batch_len.append(z)
        
    return batches,batch_len

def createBatches(data):
    l = []
    for i in data:
        l.append(len(i[0]))
    l = set(l)
    batches = []
    batch_len = []
    z = 0
    for i in l:
        for batch in data:
            if len(batch[0]) == i:
                batches.append(batch)
                z += 1
        batch_len.append(z)
    return batches,batch_len

def createMatrices(sentences, word2Idx, label2Idx, case2Idx,char2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
        
    dataset = []
    
    wordCount = 0
    unknownWordCount = 0
    
    for sentence in sentences:
        wordIndices = []    
        caseIndices = []
        charIndices = []
        labelIndices = []
        
        for word,char,label in sentence:  
            wordCount += 1
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]                 
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1
            charIdx = []
            for x in char:
                try:
                    charIdx.append(char2Idx[x])
                except:
                    charIdx.append(char2Idx['UNKNOWN'])
            #Get the label and map to int            
            wordIndices.append(wordIdx)
            caseIndices.append(getCasing(word, case2Idx))
            charIndices.append(charIdx)
            labelIndices.append(label2Idx[label])
           
        dataset.append([wordIndices, caseIndices, charIndices, labelIndices]) 
        
    return dataset

def iterate_minibatches(dataset,batch_len): 
    start = 0
    for i in batch_len:
        tokens = []
        caseing = []
        char = []
        labels = []
        data = dataset[start:i]
        start = i
        for dt in data:
            t,c,ch,l = dt
            l = np.expand_dims(l,-1)
            tokens.append(t)
            caseing.append(c)
            char.append(ch)
            labels.append(l)
        yield np.asarray(labels),np.asarray(tokens),np.asarray(caseing),np.asarray(char)

def addCharInformatioin(Sentences):
    for i,sentence in enumerate(Sentences):
        for j,data in enumerate(sentence):
            chars = [c for c in data[0]]
            Sentences[i][j] = [data[0],chars,data[1]]
    return Sentences

def padding(Sentences):
    maxlen = 52
    for sentence in Sentences:
        char = sentence[2]
        for x in char:
            maxlen = max(maxlen,len(x))
    for i,sentence in enumerate(Sentences):
        Sentences[i][2] = pad_sequences(Sentences[i][2],52,padding='post')
    return Sentences

In [4]:
!mkdir datasets
!wget -P /datasets/ "https://raw.githubusercontent.com/kamalkraj/Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs/master/data/train.txt"
!wget -P /datasets/ "https://raw.githubusercontent.com/kamalkraj/Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs/master/data/valid.txt"
!wget -P /datasets/ "https://raw.githubusercontent.com/kamalkraj/Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs/master/data/test.txt"

--2021-09-20 20:41:36--  https://raw.githubusercontent.com/kamalkraj/Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs/master/data/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3283420 (3.1M) [text/plain]
Saving to: ‘/datasets/train.txt’


2021-09-20 20:41:36 (67.1 MB/s) - ‘/datasets/train.txt’ saved [3283420/3283420]

--2021-09-20 20:41:36--  https://raw.githubusercontent.com/kamalkraj/Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs/master/data/valid.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 827443

In [4]:
trainSentences = readfile("/datasets/train.txt")
devSentences = readfile("/datasets/valid.txt")
testSentences = readfile("/datasets/test.txt")

trainSentences = addCharInformatioin(trainSentences)
devSentences = addCharInformatioin(devSentences)
testSentences = addCharInformatioin(testSentences)
trainSentences[0]

[['EU', ['E', 'U'], 'B-ORG\n'],
 ['rejects', ['r', 'e', 'j', 'e', 'c', 't', 's'], 'O\n'],
 ['German', ['G', 'e', 'r', 'm', 'a', 'n'], 'B-MISC\n'],
 ['call', ['c', 'a', 'l', 'l'], 'O\n'],
 ['to', ['t', 'o'], 'O\n'],
 ['boycott', ['b', 'o', 'y', 'c', 'o', 't', 't'], 'O\n'],
 ['British', ['B', 'r', 'i', 't', 'i', 's', 'h'], 'B-MISC\n'],
 ['lamb', ['l', 'a', 'm', 'b'], 'O\n'],
 ['.', ['.'], 'O\n']]

In [6]:
!wget -P /datasets/ "nlp.stanford.edu/data/glove.6B.zip"
!unzip /datasets/glove.6B.zip -d /datasets/

--2021-09-20 20:41:39--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-09-20 20:41:39--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-09-20 20:41:39--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘/datasets/glove.6B.zip

In [5]:
labelSet = set()
words = {}

for dataset in [trainSentences, devSentences, testSentences]:
    for sentence in dataset:
        for token,char,label in sentence:
            labelSet.add(label)
            words[token.lower()] = True

# :: Create a mapping for the labels ::
label2Idx = {}
for label in labelSet:
    label2Idx[label] = len(label2Idx)

# :: Hard coded case lookup ::
case2Idx = {'numeric': 0, 'allLower':1, 'allUpper':2, 'initialUpper':3, 'other':4, 'mainly_numeric':5, 'contains_digit': 6, 'PADDING_TOKEN':7}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

# :: Read in word embeddings ::
word2Idx = {}
wordEmbeddings = []

fEmbeddings = open(glove_pre, encoding="utf-8")

for line in fEmbeddings:
    split = line.strip().split(" ")
    word = split[0]
    
    if len(word2Idx) == 0: #Add padding+unknown
        word2Idx["PADDING_TOKEN"] = len(word2Idx)
        vector = np.zeros(len(split)-1) #Zero vector vor 'PADDING' word
        wordEmbeddings.append(vector)
        
        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split)-1)
        wordEmbeddings.append(vector)

    if split[0].lower() in words:
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[split[0]] = len(word2Idx)
        
wordEmbeddings = np.array(wordEmbeddings)

char2Idx = {"PADDING":0, "UNKNOWN":1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c] = len(char2Idx)

In [6]:
list(char2Idx.items())[:5]

[('PADDING', 0), ('UNKNOWN', 1), (' ', 2), ('0', 3), ('1', 4)]

In [7]:
train_set = padding(createMatrices(trainSentences,word2Idx,  label2Idx, case2Idx,char2Idx))
dev_set = padding(createMatrices(devSentences,word2Idx, label2Idx, case2Idx,char2Idx))
test_set = padding(createMatrices(testSentences, word2Idx, label2Idx, case2Idx,char2Idx))

idx2Label = {v: k for k, v in label2Idx.items()}

train_batch,train_batch_len = createBatches(train_set)
dev_batch,dev_batch_len = createBatches(dev_set)
test_batch,test_batch_len = createBatches(test_set)

In [8]:
print(word2Idx)
print(char2Idx)
print(case2Idx)

{'PADDING': 0, 'UNKNOWN': 1, ' ': 2, '0': 3, '1': 4, '2': 5, '3': 6, '4': 7, '5': 8, '6': 9, '7': 10, '8': 11, '9': 12, 'a': 13, 'b': 14, 'c': 15, 'd': 16, 'e': 17, 'f': 18, 'g': 19, 'h': 20, 'i': 21, 'j': 22, 'k': 23, 'l': 24, 'm': 25, 'n': 26, 'o': 27, 'p': 28, 'q': 29, 'r': 30, 's': 31, 't': 32, 'u': 33, 'v': 34, 'w': 35, 'x': 36, 'y': 37, 'z': 38, 'A': 39, 'B': 40, 'C': 41, 'D': 42, 'E': 43, 'F': 44, 'G': 45, 'H': 46, 'I': 47, 'J': 48, 'K': 49, 'L': 50, 'M': 51, 'N': 52, 'O': 53, 'P': 54, 'Q': 55, 'R': 56, 'S': 57, 'T': 58, 'U': 59, 'V': 60, 'W': 61, 'X': 62, 'Y': 63, 'Z': 64, '.': 65, ',': 66, '-': 67, '_': 68, '(': 69, ')': 70, '[': 71, ']': 72, '{': 73, '}': 74, '!': 75, '?': 76, ':': 77, ';': 78, '#': 79, "'": 80, '"': 81, '/': 82, '\\': 83, '%': 84, '$': 85, '`': 86, '&': 87, '=': 88, '*': 89, '+': 90, '@': 91, '^': 92, '~': 93, '|': 94}
{'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3, 'other': 4, 'mainly_numeric': 5, 'contains_digit': 6, 'PADDING_TOKEN': 7}


## Build and Train the model

In [9]:
words_input = Input(shape=(None,),dtype='int32',name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],  weights=[wordEmbeddings], trainable=False)(words_input)

casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input)

character_input=Input(shape=(None,52,),name='char_input')

embed_char_out=TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)

dropout= Dropout(0.5)(embed_char_out)
conv1d_out= TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout)
maxpool_out=TimeDistributed(MaxPooling1D(52))(conv1d_out)

char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)

output = concatenate([words, casing,char])
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)

# output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)
output = TimeDistributed(Dense(len(label2Idx)), name='before_softmax')(output)
output = Activation('softmax')(output)
model = Model(inputs=[words_input, casing_input,character_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()

# plot_model(model, to_file='model.png')
layer_name = 'before_softmax'
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         [(None, None, 52)]   0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 30) 2850        char_input[0][0]                 
__________________________________________________________________________________________________
dropout (Dropout)               (None, None, 52, 30) 0           char_embedding[0][0]             
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, None, 52, 30) 2730        dropout[0][0]                    
______________________________________________________________________________________________

### Train the model 
Don't run these 3 sections, it will take about 1 hour to train the model.

In [None]:
for epoch in range(epochs):    
    print("Epoch %d/%d"%(epoch,epochs))
    a = Progbar(len(train_batch_len))
    for i,batch in enumerate(iterate_minibatches(train_batch,train_batch_len)):
        labels, tokens, casing,char = batch       
        model.train_on_batch([tokens, casing,char], labels)
        a.update(i)
    a.update(i+1)
    print(' ')

Epoch 0/70
 
Epoch 1/70
 
Epoch 2/70
 
Epoch 3/70
 
Epoch 4/70
 
Epoch 5/70
 
Epoch 6/70
 
Epoch 7/70
 
Epoch 8/70
 
Epoch 9/70
 
Epoch 10/70
 
Epoch 11/70
 
Epoch 12/70
 
Epoch 13/70
 
Epoch 14/70
 
Epoch 15/70
 
Epoch 16/70
 
Epoch 17/70
 
Epoch 18/70
 
Epoch 19/70
 
Epoch 20/70
 
Epoch 21/70
 
Epoch 22/70
 
Epoch 23/70
 
Epoch 24/70
 
Epoch 25/70
 
Epoch 26/70
 
Epoch 27/70
 
Epoch 28/70
 
Epoch 29/70
 
Epoch 30/70
 
Epoch 31/70
 
Epoch 32/70
 
Epoch 33/70
 
Epoch 34/70
 
Epoch 35/70
 
Epoch 36/70
 
Epoch 37/70
 
Epoch 38/70
 
Epoch 39/70
 
Epoch 40/70
 
Epoch 41/70
 
Epoch 42/70
 
Epoch 43/70
 
Epoch 44/70
 
Epoch 45/70
 
Epoch 46/70
 
Epoch 47/70
 
Epoch 48/70
 
Epoch 49/70
 
Epoch 50/70
 
Epoch 51/70
 
Epoch 52/70
 
Epoch 53/70
 
Epoch 54/70
 
Epoch 55/70
 
Epoch 56/70
 
Epoch 57/70
 
Epoch 58/70
 
Epoch 59/70
 
Epoch 60/70
 
Epoch 61/70
 
Epoch 62/70
 
Epoch 63/70
 
Epoch 64/70
 
Epoch 65/70
 
Epoch 66/70
 
Epoch 67/70
 
Epoch 68/70
 
Epoch 69/70
 


In [None]:
model_json = model.to_json()
with open("NER.json", "w") as json_file:
    json_file.write(model_json)
    
# serialize weights to HDF5
model.save_weights("NER.h5")
print("Saved model to disk")

Saved model to disk


### Load the model weights

In [10]:
from keras.models import model_from_json
json_file = open('NER.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)

# load weights into new model
model.load_weights("NER.h5")
print("Loaded model from disk")

Loaded model from disk


## Make prediction

In [11]:
def tag_dataset(dataset):
    correctLabels = []
    predLabels = []
    before_softmaxs=[]
    b = Progbar(len(dataset),verbose=0)
    for i,data in enumerate(dataset):    
        tokens, casing,char, labels = data
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing,char], verbose=False)[0]
        before_softmax = intermediate_layer_model.predict([tokens, casing,char])[0]
        pred = pred.argmax(axis=-1) #Predict the classes            
        correctLabels.append(labels)
        predLabels.append(pred)
        before_softmaxs.append(before_softmax)
        b.update(i)
    b.update(i+1)
    return predLabels, correctLabels, before_softmaxs

In [12]:
def get_NER_embedding(l):
  new_list = []
  for s in l:
    to_add = []
    for word in s:
      #print("word:",word)
      to_add.append([word,"O\n"])
    new_list.append(to_add)
  new_list = addCharInformatioin(new_list)
  emb_set = padding(createMatrices(new_list,word2Idx,  label2Idx, case2Idx,char2Idx))
  emb_batch,_ = createBatches(emb_set)
  _, _, before = tag_dataset(emb_batch)
  return before

In [13]:
import nltk
nltk.download('punkt')

import re
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate
drive = None
def authenticate():
  global drive
  
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)

#Download files
def downloadFiles(fileIds):
  authenticate()
  
  for fileId in fileIds:    
    
    downloaded = drive.CreateFile({"id": fileId[1]})
    downloaded.GetContentFile(fileId[0])

In [17]:
#Download file if not existing
try:
  _ = open("RNN-for-Joint-NLU-master/README.md", "r")
except:
  downloadFiles([["RNN-for-Joint-NLU-master.zip", "1S3Ojq4WOa3kDTgZaSWi-ASymYaqq-spU"]])
  !unzip "RNN-for-Joint-NLU-master.zip" 

Archive:  RNN-for-Joint-NLU-master.zip
36e7cb62d66b6eb6957b1090a054e8ddc756d6fc
   creating: RNN-for-Joint-NLU-master/
  inflating: RNN-for-Joint-NLU-master/.gitignore  
  inflating: RNN-for-Joint-NLU-master/README.md  
  inflating: RNN-for-Joint-NLU-master/data.py  
   creating: RNN-for-Joint-NLU-master/dataset/
  inflating: RNN-for-Joint-NLU-master/dataset/atis-2.dev.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/atis-2.dev.w-intent.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/atis-2.train.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/atis-2.train.w-intent.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/atis.test.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/atis.test.w-intent.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/atis.train.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/atis.train.w-intent.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/sample.iob  
  inflating: RNN-for-Joint-NLU-master/main.py  
  inflating: RNN-for-Joint-NLU-maste

In [18]:
# Download file if not existing
try:
  _ = open("RNN-for-Joint-NLU-master/dataset/conda/conda.dev.iob", "r")
except:
  downloadFiles([["conda_low_dataset2.zip", "1iUNzJBB0MF6eN0ySGQCJNHnk7kb_uiqk"]])
  !unzip "conda_low_dataset2.zip" -d "RNN-for-Joint-NLU-master/dataset/"

Archive:  conda_low_dataset2.zip
  inflating: RNN-for-Joint-NLU-master/dataset/conda/conda.dev.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/conda/conda.dev.w-intent.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/conda/conda.train.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/conda/conda.train.w-intent.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/low/low.dev.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/low/low.dev.w-intent.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/low/low.test.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/low/low.test.w-intent.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/low/low.train.iob  
  inflating: RNN-for-Joint-NLU-master/dataset/low/low.train.w-intent.iob  


In [14]:
def data_pipeline(data):
    data = [t[:-1] for t in data] 
    data = [[t.split("\t")[0].split(" "), t.split("\t")[1].split(" ")[:-1], t.split("\t")[1].split(" ")[-1]] for t in
            data]
    data = [[t[0][1:-1], t[1][:], t[2]] for t in data] 
    seq_in, seq_out, intent = list(zip(*data))

    return seq_in, seq_out, intent

CONDA

In [20]:
train_data =  open("./RNN-for-Joint-NLU-master/dataset/conda/conda.train.w-intent.iob", "r").readlines()
dev_data = open("./RNN-for-Joint-NLU-master/dataset/conda/conda.dev.w-intent.iob", "r").readlines()
test_data = dev_data
print("#"*100)
print("Training data: "+str(len(train_data)))
print("Development data: "+str(len(dev_data)))
print("Testing data: "+str(len(test_data)))
print("#"*100)

train_seq, train_slot, train_intent = data_pipeline(train_data)
dev_seq, dev_slot, dev_intent = data_pipeline(dev_data)
test_seq, test_slot, test_intent = data_pipeline(test_data)

print("#"*100)
print("Data example:")
print("Sequence: ")
print(train_seq[0])
print()
print("Intent: ")
print(train_intent[0])
print()
print("Slots: ")
print(train_slot[0])
print("#"*100)

####################################################################################################
Training data: 26078
Development data: 8705
Testing data: 8705
####################################################################################################
####################################################################################################
Data example:
Sequence: 
['wow']

Intent: 
O

Slots: 
['O']
####################################################################################################


LOL

In [15]:
train_data =  open("./RNN-for-Joint-NLU-master/dataset/low/low.train.w-intent.iob", "r").readlines()
dev_data=open("./RNN-for-Joint-NLU-master/dataset/low/low.dev.w-intent.iob", "r").readlines()
test_data = open("./RNN-for-Joint-NLU-master/dataset/low/low.test.w-intent.iob", "r").readlines()
print("#"*100)
print("Training data: "+str(len(train_data)))
print("Development data: "+str(len(dev_data)))
print("Testing data: "+str(len(test_data)))
print("#"*100)

train_seq, train_slot, train_intent = data_pipeline(train_data)
dev_seq, dev_slot, dev_intent = data_pipeline(dev_data)
test_seq, test_slot, test_intent = data_pipeline(test_data)

print("#"*100)
print("Data example:")
print("Sequence: ")
print(train_seq[0])
print()
print("Intent: ")
print(train_intent[0])
print()
print("Slots: ")
print(train_slot[0])
print("#"*100)

####################################################################################################
Training data: 29358
Development data: 3258
Testing data: 3628
####################################################################################################
####################################################################################################
Data example:
Sequence: 
['nothinh', 'come', ':D']

Intent: 
O

Slots: 
['O', 'O', 'O']
####################################################################################################


In [16]:
# remove ''
for seq in train_seq:
  while '' in seq:
    seq.remove('')

for seq in dev_seq:
  while '' in seq:
    seq.remove('')

for seq in test_seq:
  while '' in seq:
    seq.remove('')
train_seq[:5]

(['nothinh', 'come', ':D'],
 ['malph,', 'would', 'you', 'gank', 'bot?'],
 ['no', 'flame', 'just', 'rage'],
 ['i', 'do'],
 ['no', 'mana', ':/'])

Tokenizer for J-BERT

In [17]:
# install
!pip install pytorch-pretrained-bert pytorch-nlp

# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertModel
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)



'Tesla T4'

In [18]:
train_seq[:10]

(['nothinh', 'come', ':D'],
 ['malph,', 'would', 'you', 'gank', 'bot?'],
 ['no', 'flame', 'just', 'rage'],
 ['i', 'do'],
 ['no', 'mana', ':/'],
 ['like', 'soraka'],
 ['np'],
 ['Its', 'my', 'fault', 'really...'],
 ['coming'],
 ['ok'])

In [31]:
# Tokenize with BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

sentences = []
#for query in train_seq:
#for query in dev_seq:
for query in test_seq:
  tokens = []
  tokens.append('[CLS]')
  for word in query:
      word_tokens = tokenizer.tokenize(word)
      if not word_tokens:
          word_tokens = '[UNK]'  # For handling the bad-encoded word
      tokens.extend(word_tokens)
  tokens.append('[SEP]')
  sentences.append(tokens)
print(sentences)

[['[CLS]', 'guys', 'together', '[SEP]'], ['[CLS]', 'how', 'dare', 'we', 'go', '0', '-', '2', 'feeder', '##s', 'bronze', 'kids', '[SEP]'], ['[CLS]', 'g', '##j', 're', '##tar', '##d', '[SEP]'], ['[CLS]', 'yu', '##p', 'ign', '##ite', 'to', 'ks', 'no', '##ob', 'act', '[SEP]'], ['[CLS]', 'no', 'flash', 'th', '##resh', '[SEP]'], ['[CLS]', 'pu', '##ss', 'y', 'e', '##z', '[SEP]'], ['[CLS]', 'w', '##p', 'rep', '##rot', 'sha', '##co', '[SEP]'], ['[CLS]', '4', 'bot', 'can', "'", 't', 'do', 'sh', '##t', '[SEP]'], ['[CLS]', 'where', 'is', 'bu', '##l', '##wark', '?', '[SEP]'], ['[CLS]', 'they', 'get', '1', 'kill', '[SEP]'], ['[CLS]', 'im', 'a', 'no', '##ob', '.', '.', '.', '[SEP]'], ['[CLS]', 'not', 'lying', 'just', 'bt', '##w', '[SEP]'], ['[CLS]', 'how', 'is', 'she', 'l', '##v', '##l', '6', '?', '[SEP]'], ['[CLS]', 'kay', '##le', 'thinks', 'we', 'are', 'all', 'no', '##ob', '##s', 'only', 'he', 'plays', 'like', 'a', 'pro', '[SEP]'], ['[CLS]', 'reg', '##roup', '[SEP]'], ['[CLS]', 'ye', '##a', 'do', '

Tokenizer for Transformer models

In [32]:
#sentences = ["I like you yes yes no no know I like you","I know"]
#sentences = [[token for token in sent] for sent in train_seq]
#sentences = [[token for token in sent] for sent in dev_seq]
#sentences = [[token for token in sent] for sent in test_seq]
#print(sentences[:20])

In [33]:
# check the maximum length of the training and testing tokens
sent_length = [len(sent) for sent in sentences]
print(max(sent_length))
print(min(sent_length))

76
3


In [34]:
# change to your target dataset's max length
MAX_LEN = 50 # 18 for conda and 28 for lol transformers models

### Get the embeddings before softmax

In [35]:
import time

start = time.time()
NER_embedding_list = []
for sent in sentences:
  #print(sent)
  this_array = get_NER_embedding([sent])[0].tolist()
  if(len(sent)>MAX_LEN):
    this_array = this_array[:MAX_LEN]
  if(len(sent)<MAX_LEN):
    for i in range(MAX_LEN-len(sent)):
      this_array.append([0]*9)
  NER_embedding_list.append(this_array)
end = time.time()
print("Time taken is: {:.2f} min ".format((end-start)/60))

Time taken is: 5.33 min 


In [36]:
NER_embedding = np.array(NER_embedding_list)
NER_embedding.shape

(3628, 50, 9)

### Save the embeddings

In [37]:
#np.save('CONDA_train_NER_embedding.npy', NER_embedding)
#np.save('CONDA_dev_NER_embedding.npy', NER_embedding)
#np.save('LOL_train_NER_embedding.npy', NER_embedding)
#np.save('LOL_dev_NER_embedding.npy', NER_embedding)
np.save('LOL_test_NER_embedding.npy', NER_embedding)