### Trial 1 : RNN (LSTM) 
can handle sequential data (sentences) effectively
1. Embedding layer
2. LSTM layer (sequence processing)
3. Two dense layers for classification


In [26]:
# IMPORTS
import utils
import keras
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [27]:
if torch.cuda.is_available():
    device = torch.device('cuda')

device

device(type='cuda')

#### 1. Load cleaned dataset

In [28]:
dataset_path = './intent_detection_dataset/more_intents_pattern.json'
unique_intents, corpus, corpus_intents = utils.load_data(dataset_path)

# print shapes and sizes of the dataset
print('Number of unique intents:', len(unique_intents))
# print('Number of responses:', len(responses))
print('Number of examples:', len(corpus))
print('Number of examples:', len(corpus_intents))

# print samples of the dataset
print('Unique intents:', unique_intents)
# print('Responses:', responses)
print('Samples:', corpus[:5])

data = list(zip(corpus, corpus_intents))

print("Training Sample: ", data[0])

Number of unique intents: 24
Number of examples: 2870
Number of examples: 2870
Unique intents: ['Variable Declaration', 'Constant Declaration', 'Function Declaration', 'Class Declaration', 'Assignment Operation', 'Conditional Statement', 'For Loop', 'While Loop', 'Array Operation', 'Bitwise Operation', 'Mathematical Operation', 'Membership Operation', 'Casting', 'Input', 'Output', 'Assertion', 'Libraries', 'File System', 'IDE Operation', 'Comment', 'Activate Mouse', 'Activate Interactive', 'Interactive Commands', 'Git Operation']
Samples: ['make start time as double and initialize', 'declare min value as integer and value', 'define settings as boolean and value false', 'define y as integer and assign to', 'initialize k as string and initialize it with Code Review']
Training Sample:  ('make start time as double and initialize', 'Variable Declaration')


#### 2. Tokenizing and Embedding 

In [29]:
# create a tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<unk>')

# fit the tokenizer on the corpus -> updates internal vocabulary based on corpus
tokenizer.fit_on_texts(corpus)

# convert the corpus to sequences of integers -> each word is replaced by its index in the vocabulary for each sentence
sequences = tokenizer.texts_to_sequences(corpus)

# pad the sequences to the same length -> add padding tokens to the beginning of each sequence to fit the longest sequence
padded_sequences = keras.preprocessing.sequence.pad_sequences(sequences, padding='pre')

# get the number of unique words (vocabulary size)
vocab_size = len(tokenizer.word_index) + 1

# print the tokenizer properties
print('Vocabulary:', tokenizer.word_index)
print('Vocabulary Size:', vocab_size)
print('Shape of Input Sequence (# of examples, longest sequence length):', padded_sequences.shape)
print('Sample Input Sequence:', padded_sequences[0])

Vocabulary Size: 794
Shape of Input Sequence (# of examples, longest sequence length): (2870, 20)
Sample Input Sequence: [  0   0   0   0   0   0   0   0   0   0   0   0   0  31 116  62  18 138
   3  29]


#### 3. Feature Extraction
map intents to indices

In [30]:
# dictionary that maps each intent to a unique index
intent_to_index = {intent: index for index, intent in enumerate(unique_intents)}

# list for each sentence mapped to its corresponding intent index 
corpus_intent_mapped_to_index = [intent_to_index[intent] for intent in corpus_intents]

# the number of classes to classify a sentence into
number_of_classes = len(intent_to_index)

# convert intent_to_index to index_to_intent 
index_to_intent = {index: intent for intent, index in intent_to_index.items()} 

print(index_to_intent)

# one hot encoding for the intents -> length of each vector is equal to the number of classes
# each sequence in the dataset is represented as a one-hot encoded vector that represents the intent of the sequence
targets = keras.utils.to_categorical(corpus_intent_mapped_to_index, number_of_classes)

print('Categorial vector shape:', targets.shape)

{0: 'Variable Declaration', 1: 'Constant Declaration', 2: 'Function Declaration', 3: 'Class Declaration', 4: 'Assignment Operation', 5: 'Conditional Statement', 6: 'For Loop', 7: 'While Loop', 8: 'Array Operation', 9: 'Bitwise Operation', 10: 'Mathematical Operation', 11: 'Membership Operation', 12: 'Casting', 13: 'Input', 14: 'Output', 15: 'Assertion', 16: 'Libraries', 17: 'File System', 18: 'IDE Operation', 19: 'Comment', 20: 'Activate Mouse', 21: 'Activate Interactive', 22: 'Interactive Commands', 23: 'Git Operation'}
Categorial vector shape: (2870, 24)


In [31]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, targets, test_size=0.2, random_state=42, shuffle=True)

train_data = list(zip(x_train, y_train))
test_data = list(zip(x_test, y_test))

print('Number of training examples:', len(train_data))
print('Number of testing examples:', len(test_data))

Number of training examples: 2296
Number of testing examples: 574


#### 4. Building the Model

In [32]:
# the input is the the padded sequences with the target value being the one-hot encoded intents
input_dimenstion = len(unique_intents)

# the output is the one-hot encoded intents
output_dimenstion = targets.shape[1]

print(f"Input Dimension: {input_dimenstion}, Output Dimension: {output_dimenstion}")

# Model description
# The model is a sequential model that consists of:
# 1. An embedding layer that converts the input sequences to dense vectors of fixed size
# 2. A Bidirectional LSTM layer that processes the input sequences in both directions
# 3. A Dense layer with 64 units and ReLU activation function
# 4. A Dropout layer with a dropout rate of 0.5
# 5. A Dense layer with the output dimension and softmax activation function for multi-class classification

# define parameters
epochs = 100

# the embedding dimension is the size of the vector for which each word is represented
# the embedding layer of a neural network, output_dim refers to the size of the dense vectors that the layer will generate for each input token (word). 
# essentially, it is the number of dimensions in which each word will be represented.
embedding_dimension = 128

# lstm units
lstm_units = 64

model = keras.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dimension),
    keras.layers.Bidirectional(keras.layers.LSTM(lstm_units, dropout=0.2)),
    keras.layers.Dense(lstm_units, activation='relu'),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(output_dimenstion, activation='softmax')
])

optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy', 'f1_score', 'precision', 'recall'])

model.summary()

Input Dimension: 24, Output Dimension: 24


#### 5. Train the Model

In [33]:
model.fit(x_train, y_train, epochs=epochs, verbose=1)

<keras.src.callbacks.history.History at 0x17ad277bac0>

In [34]:
predictions = model.predict(x_test)

predicted_intents = [index_to_intent[np.argmax(prediction)] for prediction in predictions]
true_intents = [index_to_intent[np.argmax(intent)] for intent in y_test]

accuracy = accuracy_score(true_intents, predicted_intents)
precision = precision_score(true_intents, predicted_intents, average='weighted')
recall = recall_score(true_intents, predicted_intents, average='weighted')
f1 = f1_score(true_intents, predicted_intents, average='weighted')

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step
Accuracy: 0.9895470383275261
Precision: 0.9897414496717633
Recall: 0.9895470383275261
F1 Score: 0.9895159962806788


In [35]:
model.evaluate(x_test, y_test)

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9903 - f1_score: 0.9545 - loss: 0.0894 - precision: 0.9903 - recall: 0.9903


[0.11303664743900299,
 0.9895470142364502,
 <tf.Tensor: shape=(24,), dtype=float32, numpy=
 array([0.9894736 , 1.        , 1.        , 1.        , 0.9166666 ,
        1.        , 0.9767441 , 1.        , 0.9545454 , 0.9756097 ,
        1.        , 1.        , 0.96551716, 1.        , 1.        ,
        1.        , 1.        , 0.9599999 , 0.9906541 , 0.9523809 ,
        1.        , 1.        , 1.        , 1.        ], dtype=float32)>,
 0.9895470142364502,
 0.9895470142364502]

#### 6. Testing the Model

In [51]:
user_input = input("Sentence:")

test_sequences = tokenizer.texts_to_sequences([user_input])

test_padded_sequences = keras.preprocessing.sequence.pad_sequences(test_sequences, padding='pre')

predictions = model.predict(test_padded_sequences)

predicted_intent_index = np.argmax(predictions)

predicted_intent = index_to_intent[predicted_intent_index]

print(f"Predicted Intent: {predicted_intent}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Predicted Intent: Input


In [52]:
model.save("./models/intent_detection_model.keras")