### Trial 1 : RNN (LSTM) 
A recurrent neural network (LSTM) can handle sequential data (sentences) effectively. The model consists of:
1. Embedding layer
2. Bidirectional LSTM layer (sequence processing)
3. Two dense layers for classification


In [2]:
# IMPORTS
import utils
from tensorflow import keras
import numpy as np

#### 1. Load cleaned dataset

In [3]:
# Location of the cleaned intent dataset, relative to the notebook's working directory.
dataset_path = './intent_detection_dataset/intents_pattern.json'

# load_data yields the distinct intent names, the example sentences, and one intent
# label per sentence (corpus_intents is indexed in parallel with corpus further below).
unique_intents, corpus, corpus_intents = utils.load_data(dataset_path)

# Every sentence needs exactly one label; fail early if the dataset is misaligned.
assert len(corpus) == len(corpus_intents), 'corpus and corpus_intents must have the same length'

# print shapes and sizes of the dataset
print('Number of unique intents:', len(unique_intents))
print('Number of examples:', len(corpus))
print('Number of intent labels:', len(corpus_intents))

# print samples of the dataset
print('Unique intents:', unique_intents)
print('Samples:', corpus[:5])

Number of unique intents: 17
Number of examples: 1459
Number of examples: 1459
Unique intents: ['Variable Declaration', 'Function Declaration', 'Class Declaration', 'Assignment Operation', 'Conditional Statement', 'Iterative Statement', 'Array Operation', 'Bitwise Operation', 'Mathematical Operation', 'Membership Operation', 'Casting', 'IO Operation', 'Assertion', 'Libraries', 'File System', 'IDE Operation', 'Comments']
Samples: ['is approved equals clustering algorithms', 'file path equals', 'user id equals REGRESSION MODELS', 'temp equals car bus train plane bicycle', 'time elapsed equals']


#### 2. Tokenizing and Embedding 

In [4]:
# Word-level tokenizer. filters='' turns off the default character filtering, and any
# word that was not seen while fitting is mapped to the '<unk>' token.
tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='')

# Build the vocabulary (tokenizer.word_index) from the training sentences.
tokenizer.fit_on_texts(corpus)

# Vocabulary size: one more than the number of entries in word_index.
vocab_size = 1 + len(tokenizer.word_index)

# Encode every sentence as a list of word indices ...
sequences = tokenizer.texts_to_sequences(corpus)

# ... and left-pad ('pre') each list so that all rows share the length of the longest sentence.
padded_sequences = keras.preprocessing.sequence.pad_sequences(sequences, padding='pre')

# Report the tokenizer properties and one encoded example.
print('Vocabulary:', tokenizer.word_index)
print('Vocabulary Size:', vocab_size)
print('Shape of Input Sequence (# of examples, longest sequence length):', padded_sequences.shape)
print('Sample Input Sequence:', padded_sequences[0])

Vocabulary: {'<unk>': 1, 'the': 2, 'and': 3, 'a': 4, 'to': 5, 'is': 6, 'user': 7, 'in': 8, 'name': 9, 'value': 10, 'equal': 11, 'variable': 12, 'if': 13, 'list': 14, 'with': 15, 'for': 16, 'it': 17, 'check': 18, 'than': 19, 'new': 20, 'loop': 21, 'whether': 22, 'while': 23, 'array': 24, 'bitwise': 25, 'make': 26, 'not': 27, 'from': 28, 'as': 29, 'set': 30, 'parameters': 31, 'that': 32, 'type': 33, 'of': 34, 'create': 35, 'algorithm': 36, 'date': 37, 'equals': 38, 'time': 39, 'or': 40, 'product': 41, 'on': 42, 'define': 43, 'declare': 44, 'are': 45, 'initialize': 46, 'less': 47, 'write': 48, 'data': 49, 'under': 50, 'range': 51, 'assign': 52, 'step': 53, 'index': 54, 'return': 55, 'returns': 56, 'get': 57, 'class': 58, 'key': 59, 'greater': 60, 'id': 61, 'retrieve': 62, 'dictionary': 63, 'labelled': 64, 'iterate': 65, 'identified': 66, 'called': 67, 'config': 68, 'shift': 69, 'count': 70, 'use': 71, 'end': 72, 'amount': 73, 'contact': 74, 'method': 75, 'power': 76, 'add': 77, 'using': 7

#### 3. Feature Extraction
Map each intent to an integer index, then one-hot encode the intent of every sentence.

In [5]:
# intent name -> class index, following the order of unique_intents
intent_to_index = dict(zip(unique_intents, range(len(unique_intents))))

# class index of the intent attached to each sentence (parallel to corpus_intents)
corpus_intent_mapped_to_index = list(map(intent_to_index.__getitem__, corpus_intents))

# how many classes a sentence can be classified into
number_of_classes = len(intent_to_index)

# reverse lookup: class index -> intent name (used to decode predictions)
index_to_intent = {position: name for name, position in intent_to_index.items()}

print(index_to_intent)

# One-hot targets: one row per sentence, one column per class, with a 1 in the column
# of that sentence's intent.
one_hot_encoded_intents = keras.utils.to_categorical(
    corpus_intent_mapped_to_index,
    num_classes=number_of_classes,
)

print('Categorial vector shape:', one_hot_encoded_intents.shape)

{0: 'Variable Declaration', 1: 'Function Declaration', 2: 'Class Declaration', 3: 'Assignment Operation', 4: 'Conditional Statement', 5: 'Iterative Statement', 6: 'Array Operation', 7: 'Bitwise Operation', 8: 'Mathematical Operation', 9: 'Membership Operation', 10: 'Casting', 11: 'IO Operation', 12: 'Assertion', 13: 'Libraries', 14: 'File System', 15: 'IDE Operation', 16: 'Comments'}
Categorial vector shape: (1459, 17)


#### 4. Building the Model

In [6]:
# The model input is the padded sequences, so the input dimension is the padded
# sequence length (it was previously set to the number of intents by mistake).
# NOTE: the misspelled names are kept because they are notebook-level globals.
input_dimenstion = padded_sequences.shape[1]

# The output is the one-hot encoded intents: one unit per class.
output_dimenstion = one_hot_encoded_intents.shape[1]

print(f"Input Dimension: {input_dimenstion}, Output Dimension: {output_dimenstion}")

# Model description
# The model is a sequential model that consists of:
# 1. An embedding layer that converts each word index to a dense vector of fixed size
# 2. A Bidirectional LSTM layer (input dropout 0.2) that reads the sequence in both directions
# 3. A Dense layer with `lstm_units` (64) units and ReLU activation
# 4. A Dropout layer with a dropout rate of 0.4
# 5. A Dense layer with one unit per intent and softmax activation for multi-class classification

# define parameters
epochs = 100

# Size of the dense vector the embedding layer produces for each input token (word),
# i.e. the number of dimensions in which each word is represented.
embedding_dimension = 128

# Units per LSTM direction; the bidirectional wrapper concatenates both directions,
# so the layer outputs 2 * lstm_units features.
lstm_units = 64

# Adam step size.
learning_rate = 0.001

model = keras.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dimension),
    keras.layers.Bidirectional(keras.layers.LSTM(lstm_units, dropout=0.2)),
    keras.layers.Dense(lstm_units, activation='relu'),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(output_dimenstion, activation='softmax')
])

# `learning_rate` is the supported argument name; `lr` is deprecated.
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()


Input Dimension: 17, Output Dimension: 17





Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         82560     
                                                                 
 bidirectional (Bidirection  (None, 128)               98816     
 al)                                                             
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 17)                1105      
                                                                 
Total params: 190737 (745.07 KB)
Trainable params: 190737 (745.07 KB)
Non-trainable params: 0 (0.00 Byte)
________________

#### 5. Train the Model

In [7]:
# Train on the whole dataset for `epochs` passes; verbose=0 silences the per-epoch log.
# The call is the cell's last expression, so the returned History object is displayed.
model.fit(
    x=padded_sequences,
    y=one_hot_encoded_intents,
    epochs=epochs,
    verbose=0,
)













<keras.src.callbacks.History at 0x2040a4ea140>

#### 6. Testing the Model

In [12]:
# Read one sentence from the user and predict its intent with the trained model.
user_input = input("Sentence:")

# Encode the sentence with the tokenizer fitted on the training corpus
# (unknown words become the '<unk>' index).
test_sequences = tokenizer.texts_to_sequences([user_input])

# Pad (or truncate) to exactly the sequence length used for training. Without `maxlen`
# a single sentence is "padded" only to its own length, which the model never saw
# during training, and an empty sentence would produce a zero-length sequence.
# Sentences longer than the training length lose their leading words (default truncating='pre').
test_padded_sequences = keras.preprocessing.sequence.pad_sequences(
    test_sequences,
    maxlen=padded_sequences.shape[1],
    padding='pre',
)

# predictions has one row of class probabilities per input sentence.
predictions = model.predict(test_padded_sequences)

# Take the most probable class of the single input row and decode it to its intent name.
predicted_intent_index = int(np.argmax(predictions[0]))
predicted_intent = index_to_intent[predicted_intent_index]

print(f"Predicted Intent: {predicted_intent}")

Predicted Intent: Function Declaration
