In [3]:
# IMPORTS
import pandas as pd
import numpy as np

import tensorflow as tf

import keras

# from keras.preprocessing.sequence import pad_sequences
# from keras.utils import to_categorical
# from keras.layers import LSTM, Dense, TimeDistributed, Embedding, Bidirectional
# from keras.models import Model
# from tensorflow.python.keras.layers import Input
# import tensorflow_addons as tfa

from tensorflow_addons.layers import CRF

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import f1_score
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from keras import preprocessing
import pickle

In [5]:
print(tf.__version__)
print(keras.__version__)

2.14.1
2.14.0


#### Dataset Preparation

In [6]:
def read_dataset():
    data = pd.read_csv('./ner_dataset/ner_dataset.csv', encoding='latin1')

    # remove white spaces from column names
    data.columns = data.columns.str.strip()

    print(data.columns)
    # print(data.columns)
    # Group by 'Sentence #' and aggregate
    grouped_data = data.groupby('Sentence #').agg({
        'Word': lambda x: ' '.join(x),  # Join words into a single sentence
        'Tag': lambda x: list(x),       # Collect tags into a list
        'Intent': lambda x: x     # Collect intents into a list
    }).reset_index()  # Reset index to make 'Sentence #' a regular column

    return data, grouped_data


def prepare_data(dataframe):
    dataset = []
    for index, row in dataframe.iterrows():
        sentence = row['Word']
        tags = row['Tag']
        intents = row['Intent'][0]
        dataset.append((sentence, tags, intents))

    return dataset

data, grouped_data = read_dataset()

prepared_dataset = prepare_data(grouped_data)

print("Columns:", data.columns, grouped_data.columns)

data.head()
grouped_data.head()

Index(['Sentence #', 'Word', 'Tag', 'Intent'], dtype='object')
Columns: Index(['Sentence #', 'Word', 'Tag', 'Intent'], dtype='object') Index(['Sentence #', 'Word', 'Tag', 'Intent'], dtype='object')


Unnamed: 0,Sentence #,Word,Tag,Intent
0,0,is approved equals clustering algorithms,"[ B-VAR, I-VAR, O, B-VAL, I-VAL]","[ variable_declaration, variable_declaration,..."
1,1,file path equals 2023,"[ B-VAR, I-VAR, O, B-VAL]","[ variable_declaration, variable_declaration,..."
2,2,user id equals REGRESSION MODELS,"[ B-VAR, I-VAR, O, B-VAL, I-VAL]","[ variable_declaration, variable_declaration,..."
3,3,temp equals car bus train plane bicycle,"[ B-VAR, O, B-VAL, B-VAL, B-VAL, B-VAL, ...","[ variable_declaration, variable_declaration,..."
4,4,time elapsed equals -555555,"[ B-VAR, I-VAR, O, B-VAL]","[ variable_declaration, variable_declaration,..."


In [7]:
print("Number of unique words in the dataset:", len(data['Word'].unique()) )# number of unique words in the dataset
print("Number of unique tags in the dataset:", len(data['Tag'].unique())) # number of unique tags in the dataset
print("Number of unique intents in the dataset:", len(data['Intent'].unique())) #number of unique intents in the dataset

print("Unique tags in the dataset:", data['Tag'].unique())

Number of unique words in the dataset: 304
Number of unique tags in the dataset: 6
Number of unique intents in the dataset: 1
Unique tags in the dataset: [' B-VAR' ' I-VAR' ' O' ' B-VAL' ' I-VAL' ' B-TYPE']


In [8]:
# check for missing values
data.isnull().sum() # no missing values

Sentence #    0
Word          0
Tag           0
Intent        0
dtype: int64

In [9]:

# Dictionary word:index pair

# Dictionary lable:index pair


In [10]:
print("The word declare is identified by the index: {}".format(word_to_index["declare"]))
print("The label B-VAR for the variable is identified by the index: {}".format(tag_to_index["B-VAR"]))

The word deckare is identified by the index: 79
The label B-VAR for the variable is identified by the index: 1


In [11]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class Sentence(object):
    def __init__(self, df):
        self.n_sent = 0
        self.df = df
        self.empty = False
        agg = lambda s : [(word.strip(), tag.strip(), intent.strip()) for word, tag, intent in zip(s['Word'].values.tolist(),
                                                       s['Tag'].values.tolist(),
                                                       s['Intent'].values.tolist())]
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent +=1
            return s
        except:
            return None
    
    def records_to_tuples(self):
        dataset = []

        grouped_data = data.groupby('Sentence #').agg({
        'Word': lambda x: ' '.join(x),  # Join words into a single sentence
        'Tag': lambda x: list(x),       # Collect tags into a list
        'Intent': lambda x: x     # Collect intents into a list
        }).reset_index()

        for index, row in grouped_data.iterrows():
            sentence = row['Word']
            tags = row['Tag']
            intents = row['Intent'][0]
            dataset.append((sentence, tags, intents))

        return dataset
        
#Displaying one full sentence
getter = Sentence(data)
sentences = [''.join([s[0] for s in sentence]) for sentence in getter.sentences]
sentences[0]

#sentence with its pos and tag.
sent = getter.get_text()
print(sent)

[('is', 'B-VAR', 'variable_declaration'), ('approved', 'I-VAR', 'variable_declaration'), ('equals', 'O', 'variable_declaration'), ('clustering', 'B-VAL', 'variable_declaration'), ('algorithms', 'I-VAL', 'variable_declaration')]


In [12]:
sentences = getter.sentences
print(sentences[0])

[('is', 'B-VAR', 'variable_declaration'), ('approved', 'I-VAR', 'variable_declaration'), ('equals', 'O', 'variable_declaration'), ('clustering', 'B-VAL', 'variable_declaration'), ('algorithms', 'I-VAL', 'variable_declaration')]


In [13]:
other_sentences = getter.records_to_tuples()

print(other_sentences[0])

(' is  approved  equals  clustering  algorithms', [' B-VAR', ' I-VAR', ' O', ' B-VAL', ' I-VAL'], ' variable_declaration')


In [14]:
# Deine parameters for the model
batch_size = 32
epochs = 10
max_sentence_length = max([len(sentence.split()) for sentence, _, _ in prepared_dataset])
print("Max sentence length:", max_sentence_length)

word_embedding = 100
intent_embedding = 50

Max sentence length: 20


[[('is', 'B-VAR', 'variable_declaration'), ('approved', 'I-VAR', 'variable_declaration'), ('equals', 'O', 'variable_declaration'), ('clustering', 'B-VAL', 'variable_declaration'), ('algorithms', 'I-VAL', 'variable_declaration')], [('file', 'B-VAR', 'variable_declaration'), ('path', 'I-VAR', 'variable_declaration'), ('equals', 'O', 'variable_declaration'), ('2023', 'B-VAL', 'variable_declaration')], [('user', 'B-VAR', 'variable_declaration'), ('id', 'I-VAR', 'variable_declaration'), ('equals', 'O', 'variable_declaration'), ('REGRESSION', 'B-VAL', 'variable_declaration'), ('MODELS', 'I-VAL', 'variable_declaration')], [('temp', 'B-VAR', 'variable_declaration'), ('equals', 'O', 'variable_declaration'), ('car', 'B-VAL', 'variable_declaration'), ('bus', 'B-VAL', 'variable_declaration'), ('train', 'B-VAL', 'variable_declaration'), ('plane', 'B-VAL', 'variable_declaration'), ('bicycle', 'B-VAL', 'variable_declaration')], [('time', 'B-VAR', 'variable_declaration'), ('elapsed', 'I-VAR', 'variabl

In [16]:
# Convert label to index
y = [[tag_to_index[w[1]] for w in s] for s in sentences]

# padding
y = keras.utils.pad_sequences(maxlen = max_sentence_length, sequences = y, padding = "post", value = tag_to_index["PAD"])

In [17]:
num_tag = data['Tag'].nunique()
print("Number of tags:", num_tag)
# One hot encoded labels
y = [keras.utils.to_categorical(i, num_classes = num_tag + 1) for i in y]

Number of tags: 6


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)

In [19]:
print("Size of training input data : ", X_train.shape)
print("Size of training output data : ", np.array(y_train).shape)
print("Size of testing input data : ", X_test.shape)
print("Size of testing output data : ", np.array(y_test).shape)

Size of training input data :  (125, 20)
Size of training output data :  (125, 20, 7)
Size of testing input data :  (23, 20)
Size of testing output data :  (23, 20, 7)


In [20]:
# Let's check the first sentence before and after processing.
print('*****Before Processing first sentence : *****\n', ' '.join([w[0] for w in sentences[0]]))
print('*****After Processing first sentence : *****\n ', X[0])

*****Before Processing first sentence : *****
 is approved equals clustering algorithms
*****After Processing first sentence : *****
  [2 3 4 5 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [21]:
# First label before and after processing.
print('*****Before Processing first sentence : *****\n', ' '.join([w[1] for w in sentences[0]]))
print('*****After Processing first sentence : *****\n ', y[0])

*****Before Processing first sentence : *****
 B-VAR I-VAR O B-VAL I-VAL
*****After Processing first sentence : *****
  [[0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]]


In [22]:

num_tags = data['Tag'].nunique()

# Define custom CRF loss function
def crf_loss(y_true, y_pred):
    crf_layer = tfa.layers.CRF(num_tags + 1)
    # Get log likelihood loss
    log_likelihood, _ = crf_layer(y_true, y_pred)
    return -log_likelihood

# Define custom accuracy metric for CRF
def crf_accuracy(y_true, y_pred):
    crf_layer = tfa.layers.CRF(num_tags + 1)
    # Predict tags
    pred_tags, _ = crf_layer.viterbi_decoding(y_pred)
    true_tags = tf.argmax(y_true, axis=-1)
    # Calculate accuracy excluding padding
    mask = tf.not_equal(true_tags, 0)
    matches = tf.equal(pred_tags, true_tags)
    matches = tf.logical_and(matches, mask)
    accuracy = tf.reduce_sum(tf.cast(matches, tf.float32)) / tf.reduce_sum(tf.cast(mask, tf.float32))
    return accuracy

In [29]:
# Model architecture
input = keras.layers.Input(shape = (max_sentence_length,))
model = keras.layers.Embedding(input_dim = len(words) + 2, output_dim = word_embedding, input_length = max_sentence_length, mask_zero = True)(input)
model = keras.layers.Bidirectional(keras.layers.LSTM(units = 50, return_sequences=True, recurrent_dropout=0.1))(model)
model = keras.layers.TimeDistributed(keras.layers.Dense(50, activation="relu"))(model)
crf = CRF(num_tags+1)  # CRF layer
out = crf(model)  # output

model = keras.models.Model(input, out)
model.compile(optimizer="rmsprop")

model.summary()

AttributeError: 'CRF' object has no attribute 'loss'

: 

In [None]:
checkpointer = keras.callbacks.ModelCheckpoint(filepath = 'model.h5',
                       verbose = 0,
                       mode = 'auto',
                       save_best_only = True,
                       monitor='val_loss')

In [28]:
history = model.fit(X_train, np.array(y_train), batch_size=batch_size, epochs=epochs, validation_split=0.1)

Epoch 1/10


ValueError: in user code:

    File "c:\Users\yazmi\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\yazmi\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\yazmi\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\yazmi\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1128, in train_step
        self._validate_target_and_loss(y, loss)
    File "c:\Users\yazmi\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1092, in _validate_target_and_loss
        raise ValueError(

    ValueError: No loss found. You may have forgotten to provide a `loss` argument in the `compile()` method.
