In [20]:
import os, sys, pickle, glob
from tqdm import tqdm

# Random librairies
import numpy as np
import pandas as pd

import keras
import tensorflow as tf

# Keras and tensorflow librairies
from keras.models import Model, load_model
from keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Reshape, Bidirectional, concatenate, Flatten, Layer
from keras_contrib.layers import CRF
from keras.utils import plot_model, pad_sequences, to_categorical
from keras import backend as K
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy

# Sklearn librairies
from sklearn.model_selection import train_test_split

In [5]:
print("Keras version: ", keras.__version__)
print("Tensorflow version: ", tf.__version__)

Keras version:  3.4.1
Tensorflow version:  2.16.1


In [22]:
def custom_slice(x, start, size):
    if K.backend() == 'tensorflow':
        return x[:, start[1]:start[1] + size[1]]
    else:
        raise ValueError("Unsupported backend")

# Patch the custom slice method into Keras backend
K.slice = custom_slice

class CustomCRF(CRF):
    def compute_mask(self, inputs, mask=None):
        if mask is not None and self.learn_mode == 'join':
            return tf.reduce_any(mask, axis=1)  # Use tf.reduce_any instead of K.any
        return mask

# Custom layer to wrap tf.reduce_any
class MaskAny(Layer):
    def call(self, inputs):
        return tf.reduce_any(inputs, axis=1, keepdims=True)

### **Read and Describe Data**

In [7]:
def read_dataset():
    data = pd.read_csv('./ner_dataset/ner_dataset.csv', encoding='latin1')

    # remove white spaces from column names
    data.columns = data.columns.str.strip()

    print(data.columns)
    # print(data.columns)
    # Group by 'Sentence #' and aggregate
    grouped_data = data.groupby('Sentence #').agg({
        'Word': lambda x: ' '.join(x),  # Join words into a single sentence
        'Tag': lambda x: list(x),       # Collect tags into a list
        'Intent': lambda x: x     # Collect intents into a list
    }).reset_index()  # Reset index to make 'Sentence #' a regular column

    return data, grouped_data


def prepare_data(dataframe):
    dataset = []
    for index, row in dataframe.iterrows():
        sentence = row['Word']
        tags = row['Tag']
        intents = row['Intent'][0]
        dataset.append((sentence, tags, intents))

    return dataset

data, grouped_data = read_dataset()

prepared_dataset = prepare_data(grouped_data)

print("Columns:", data.columns, grouped_data.columns)

data.head()
grouped_data.head()

Index(['Sentence #', 'Word', 'Tag', 'Intent'], dtype='object')
Columns: Index(['Sentence #', 'Word', 'Tag', 'Intent'], dtype='object') Index(['Sentence #', 'Word', 'Tag', 'Intent'], dtype='object')


Unnamed: 0,Sentence #,Word,Tag,Intent
0,0,is approved equals clustering algorithms,"[ B-VAR, I-VAR, O, B-VAL, I-VAL]","[ variable_declaration, variable_declaration,..."
1,1,file path equals 2023,"[ B-VAR, I-VAR, O, B-VAL]","[ variable_declaration, variable_declaration,..."
2,2,user id equals REGRESSION MODELS,"[ B-VAR, I-VAR, O, B-VAL, I-VAL]","[ variable_declaration, variable_declaration,..."
3,3,temp equals car bus train plane bicycle,"[ B-VAR, O, B-VAL, B-VAL, B-VAL, B-VAL, ...","[ variable_declaration, variable_declaration,..."
4,4,time elapsed equals -555555,"[ B-VAR, I-VAR, O, B-VAL]","[ variable_declaration, variable_declaration,..."


In [8]:
print("Number of unique words in the dataset:", len(data['Word'].unique()) )# number of unique words in the dataset
print("Number of unique tags in the dataset:", len(data['Tag'].unique())) # number of unique tags in the dataset
print("Number of unique intents in the dataset:", len(data['Intent'].unique())) #number of unique intents in the dataset

print("Unique tags in the dataset:", data['Tag'].unique())

Number of unique words in the dataset: 304
Number of unique tags in the dataset: 6
Number of unique intents in the dataset: 1
Unique tags in the dataset: [' B-VAR' ' I-VAR' ' O' ' B-VAL' ' I-VAL' ' B-TYPE']


In [9]:
#Getting unique words and labels from data
words = list(data['Word'].unique())
tags = list(data['Tag'].unique())
# Dictionary word:index pair
# word is key and its value is corresponding index
word_to_index = {word.strip() : i + 2 for i, word in enumerate(words)}
word_to_index["UNK"] = 1
word_to_index["PAD"] = 0

# Dictionary lable:index pair
# label is key and value is index.
tag_to_index = {tag.strip() : i + 1 for i, tag in enumerate(tags)}
tag_to_index["PAD"] = 0

idx2word = {i: word for word, i in word_to_index.items()}
idx2tag = {i: tag for tag, i in tag_to_index.items()}

In [10]:
print("The word deckare is identified by the index: {}".format(word_to_index["declare"]))
print("The label B-VAR for the variable is identified by the index: {}".format(tag_to_index["B-VAR"]))

The word deckare is identified by the index: 79
The label B-VAR for the variable is identified by the index: 1


In [11]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class Sentence(object):
    def __init__(self, df):
        self.n_sent = 0
        self.df = df
        self.empty = False
        agg = lambda s : [(word.strip(), tag.strip(), intent.strip()) for word, tag, intent in zip(s['Word'].values.tolist(),
                                                       s['Tag'].values.tolist(),
                                                       s['Intent'].values.tolist())]
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent +=1
            return s
        except:
            return None
    
    def records_to_tuples(self):
        dataset = []

        grouped_data = data.groupby('Sentence #').agg({
        'Word': lambda x: ' '.join(x),  # Join words into a single sentence
        'Tag': lambda x: list(x.str.strip()),       # Collect tags into a list
        'Intent': lambda x: x.str.strip()     # Collect intents into a list
        }).reset_index()

        for _, row in grouped_data.iterrows():
            sentence = row['Word'][1:]
            tags = row['Tag']
            intents = row['Intent'][0]
            dataset.append((sentence, tags, intents))

        return dataset
        
#Displaying one full sentence
getter = Sentence(data)
sentences = [''.join([s[0] for s in sentence]) for sentence in getter.sentences]
sentences[0]

#sentence with its pos and tag.
sent = getter.get_text()
print(sent)

sentences = getter.sentences
print(sentences[0])

other_sentences = getter.records_to_tuples()

print(other_sentences[0])

[('is', 'B-VAR', 'variable_declaration'), ('approved', 'I-VAR', 'variable_declaration'), ('equals', 'O', 'variable_declaration'), ('clustering', 'B-VAL', 'variable_declaration'), ('algorithms', 'I-VAL', 'variable_declaration')]
[('is', 'B-VAR', 'variable_declaration'), ('approved', 'I-VAR', 'variable_declaration'), ('equals', 'O', 'variable_declaration'), ('clustering', 'B-VAL', 'variable_declaration'), ('algorithms', 'I-VAL', 'variable_declaration')]
('is  approved  equals  clustering  algorithms', ['B-VAR', 'I-VAR', 'O', 'B-VAL', 'I-VAL'], 'variable_declaration')


### **Data Preparation For Training**

In [28]:
# We should convert each sentence to integers
#Getting unique words and labels from data -> our vocab
words = list(data['Word'].unique())
tags = list(data['Tag'].unique())

# 1. Each word to integer
# word is key and its value is corresponding index
word_to_index = {word.strip() : i + 2 for i, word in enumerate(words)}
word_to_index['UNK'] = 1
word_to_index['PAD'] = 0

vocab_size = len(word_to_index)
print("Vocab size:", vocab_size)

# 2. Each label to integer
# label is key and value is index.
tag_to_index = {tag.strip() : i + 1 for i, tag in enumerate(tags)}
tag_to_index['PAD'] = 0

print("Tag to index:", tag_to_index)

# conver _ to index to index_ to word
index_to_word = dict(sorted({i: word for word, i in word_to_index.items()}.items(), key=lambda item: item[0]))
index_to_tag = dict(sorted({i: tag for tag, i in tag_to_index.items()}.items(), key=lambda item: item[0]))

print("Index to tag:", index_to_tag)

# 3. Each example to a list of integers
training_data = [([word_to_index.get(word, word_to_index['UNK']) for word in sentence.split()], 
                   [tag_to_index[tag] for tag in tags], 
                   intent.replace('_', ' ')) for sentence, tags, intent in other_sentences]

print("Training data size:", len(training_data))
print("Training data sample:", training_data[0:5])

# 4. Each sentence should be padded to have same length
padded_sentences = pad_sequences([sentence for sentence, _, _ in training_data], padding='post')
padded_tags = pad_sequences([tags for _, tags, _ in training_data], padding='post')

categorial_tags = [to_categorical(tags, num_classes=len(tag_to_index)) for tags in padded_tags]

print("Padded sentences shape:", padded_sentences.shape)
print("Padded tags shape:", padded_tags.shape)
print("Categorial tags shape:", np.array(categorial_tags).shape)

Vocab size: 306
Tag to index: {'B-VAR': 1, 'I-VAR': 2, 'O': 3, 'B-VAL': 4, 'I-VAL': 5, 'B-TYPE': 6, 'PAD': 0}
Index to tag: {0: 'PAD', 1: 'B-VAR', 2: 'I-VAR', 3: 'O', 4: 'B-VAL', 5: 'I-VAL', 6: 'B-TYPE'}
Training data size: 148
Training data sample: [([2, 3, 4, 5, 6], [1, 2, 3, 4, 5], 'variable declaration'), ([7, 8, 4, 9], [1, 2, 3, 4], 'variable declaration'), ([10, 11, 4, 12, 13], [1, 2, 3, 4, 5], 'variable declaration'), ([14, 4, 15, 16, 17, 18, 19], [1, 3, 4, 4, 4, 4, 4], 'variable declaration'), ([20, 21, 4, 22], [1, 2, 3, 4], 'variable declaration')]
Padded sentences shape: (148, 20)
Padded tags shape: (148, 20)
Categorial tags shape: (148, 20, 7)


In [26]:
x_train, x_test, y_train, y_test = train_test_split(padded_sentences, categorial_tags, test_size = 0.1)

print("Size of training input data: ", x_train.shape)
print("Size of training labels: ", np.array(y_train).shape)
print("Size of testing input data: ", x_test.shape)
print("Size of testing labels: ", np.array(y_test).shape)

Size of training input data:  (133, 20)
Size of training labels:  (133, 20, 7)
Size of testing input data:  (15, 20)
Size of testing labels:  (15, 20, 7)


### **Model Definition**

In [14]:
# Model Parameters
epochs = 10
max_sentence_length = max([len(sentence.split()) for sentence, _, _ in prepared_dataset])
print("Max sentence length:", max_sentence_length)
number_of_tags = len(tag_to_index) 
vocab_size = len(word_to_index)
word_embedding = 100
intent_embedding = 50

Max sentence length: 20


In [34]:
input = Input(shape=(max_sentence_length,)) # should be the embeding of the word along with the intent
embedding_layer = Embedding(input_dim=vocab_size, output_dim=word_embedding, mask_zero=True)(input) # word embedding layer
bilst_layer = Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.2))(embedding_layer) # BI-LSTM layer
dense_layer = TimeDistributed(Dense(units=number_of_tags ,activation='relu'))(bilst_layer)
crf = CustomCRF(number_of_tags)
output = crf(dense_layer)

final_model = Model(inputs=input ,outputs=output)

final_model.compile(optimizer='rmsprop', loss=crf_loss, metrics=[crf_accuracy, 'accuracy'])

final_model.summary()

In [35]:
final_model.fit(x_train, np.array(y_train), epochs=epochs, validation_split=0.1)

Epoch 1/10


OperatorNotAllowedInGraphError: Exception encountered when calling TimeDistributed.call().

[1mUsing a symbolic `tf.Tensor` as a Python `bool` is not allowed. You can attempt the following resolutions to the problem: If you are running in Graph mode, use Eager execution mode or decorate this function with @tf.function. If you are using AutoGraph, you can try decorating this function with @tf.function. If that does not work, then you may be using an unsupported feature or your source code may not be visible to AutoGraph. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/autograph/g3doc/reference/limitations.md#access-to-source-code for more information.[0m

Arguments received by TimeDistributed.call():
  • inputs=tf.Tensor(shape=(None, 20, 100), dtype=float32)
  • training=True
  • mask=tf.Tensor(shape=(None, 20), dtype=bool)

: 