### Trial 1 : RNN (LSTM) 
can handle sequential data (sentences) effectively
1. Embedding layer
2. LSTM layer (sequence processing)
3. Two dense layers for classification


In [1]:
# IMPORTS
import utils
import keras
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# %pip install keras2onnx

In [3]:
if torch.cuda.is_available():
    device = torch.device('cuda')

device

device(type='cuda')

#### 1. Load cleaned dataset

In [4]:
import json

def load_data(file_path):
    '''
    This function loads the data from the file_path

    Given the path of the dataset this function reads and returns the intents and the corpus of the dataset

    Args:
        - file_path (str) : path of the dataset

    Returns:
        - unique_intents (list[str]) : list of unique intents in the dataset
        - corpus (list[str]) : list of all the sentences in the dataset
        - corpus_intents (list[str]) : list of intents for each sentence in the dataset
        - responses (list[str]) : list of responses for each intent in the dataset
    '''
    unique_intents = []
    corpus = []
    corpus_intents = []
    # responses = []

    with open(file_path, 'r') as f:
        dataset = json.load(f)

        print(dataset.keys())
        # intents = dataset['intents']

        for intent, values in dataset.items():
            print(intent.lower())
            print(len(values))
            if intent not in unique_intents:
                unique_intents.append(intent)
            for sentence in values:
                corpus.append(utils.clean(sentence))
                corpus_intents.append(intent)

    return unique_intents, corpus, corpus_intents

In [5]:
dataset_path = './intent_detection_dataset/final_intents_dataset.json'
unique_intents, corpus, corpus_intents = load_data(dataset_path)

# print shapes and sizes of the dataset
print('Number of unique intents:', len(unique_intents))
# print('Number of responses:', len(responses))
print('Number of examples:', len(corpus))
print('Number of examples:', len(corpus_intents))

# print samples of the dataset
print('Unique intents:', unique_intents)
# print('Responses:', responses)
print('Samples:', corpus[:5])

data = list(zip(corpus, corpus_intents))

print("Training Sample: ", data[0])

dict_keys(['Variable Declaration', 'Constant Declaration', 'Function Declaration', 'Class Declaration', 'Assignment Operation', 'Conditional Operation', 'For Loop', 'While Loop', 'Bitwise Operation', 'Mathematical Operation', 'Membership Operation', 'Casting', 'Input', 'Output', 'Assertion', 'Libraries', 'File System', 'IDE Operation', 'Comment', 'Activate Mouse', 'Activate Interactive', 'Interactive Commands', 'Git Operation', 'Exit Block'])
variable declaration
239
constant declaration
200
function declaration
90
class declaration
30
assignment operation
80
conditional operation
160
for loop
120
while loop
60
bitwise operation
80
mathematical operation
370
membership operation
160
casting
60
input
50
output
120
assertion
40
libraries
47
file system
150
ide operation
292
comment
50
activate mouse
40
activate interactive
30
interactive commands
220
git operation
80
exit block
25
Number of unique intents: 24
Number of examples: 2793
Number of examples: 2793
Unique intents: ['Variable De

#### 2. Tokenizing and Embedding 

In [6]:
# create a tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<unk>')

# fit the tokenizer on the corpus -> updates internal vocabulary based on corpus
tokenizer.fit_on_texts(corpus)

# convert the corpus to sequences of integers -> each word is replaced by its index in the vocabulary for each sentence
sequences = tokenizer.texts_to_sequences(corpus)

# pad the sequences to the same length -> add padding tokens to the beginning of each sequence to fit the longest sequence
padded_sequences = keras.preprocessing.sequence.pad_sequences(sequences, padding='pre')

# get the number of unique words (vocabulary size)
vocab_size = len(tokenizer.word_index) + 1

# print the tokenizer properties
print('Vocabulary:', tokenizer.word_index)
print('Vocabulary Size:', vocab_size)
print('Shape of Input Sequence (# of examples, longest sequence length):', padded_sequences.shape)
print('Sample Input Sequence:', padded_sequences[0])

Vocabulary Size: 765
Shape of Input Sequence (# of examples, longest sequence length): (2793, 20)
Sample Input Sequence: [  0   0   0   0   0   0   0   0   0   0   0   0   0  31  98  61  17 137
   3  29]


#### 3. Feature Extraction
map intents to indices

In [7]:
# dictionary that maps each intent to a unique index
intent_to_index = {intent: index for index, intent in enumerate(unique_intents)}

# list for each sentence mapped to its corresponding intent index 
corpus_intent_mapped_to_index = [intent_to_index[intent] for intent in corpus_intents]

# the number of classes to classify a sentence into
number_of_classes = len(intent_to_index)

# convert intent_to_index to index_to_intent 
index_to_intent = {index: intent for intent, index in intent_to_index.items()} 

print(index_to_intent)

# one hot encoding for the intents -> length of each vector is equal to the number of classes
# each sequence in the dataset is represented as a one-hot encoded vector that represents the intent of the sequence
targets = keras.utils.to_categorical(corpus_intent_mapped_to_index, number_of_classes)

print('Categorial vector shape:', targets.shape)

{0: 'Variable Declaration', 1: 'Constant Declaration', 2: 'Function Declaration', 3: 'Class Declaration', 4: 'Assignment Operation', 5: 'Conditional Operation', 6: 'For Loop', 7: 'While Loop', 8: 'Bitwise Operation', 9: 'Mathematical Operation', 10: 'Membership Operation', 11: 'Casting', 12: 'Input', 13: 'Output', 14: 'Assertion', 15: 'Libraries', 16: 'File System', 17: 'IDE Operation', 18: 'Comment', 19: 'Activate Mouse', 20: 'Activate Interactive', 21: 'Interactive Commands', 22: 'Git Operation', 23: 'Exit Block'}
Categorial vector shape: (2793, 24)


In [8]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, targets, test_size=0.2, random_state=42, shuffle=True)

train_data = list(zip(x_train, y_train))
test_data = list(zip(x_test, y_test))

print('Number of training examples:', len(train_data))
print('Number of testing examples:', len(test_data))

Number of training examples: 2234
Number of testing examples: 559


#### 4. Building the Model

In [9]:
# the input is the the padded sequences with the target value being the one-hot encoded intents
input_dimenstion = len(unique_intents)

# the output is the one-hot encoded intents
output_dimenstion = targets.shape[1]

print(f"Input Dimension: {input_dimenstion}, Output Dimension: {output_dimenstion}")

# Model description
# The model is a sequential model that consists of:
# 1. An embedding layer that converts the input sequences to dense vectors of fixed size
# 2. A Bidirectional LSTM layer that processes the input sequences in both directions
# 3. A Dense layer with 64 units and ReLU activation function
# 4. A Dropout layer with a dropout rate of 0.5
# 5. A Dense layer with the output dimension and softmax activation function for multi-class classification

# define parameters
epochs = 75

# the embedding dimension is the size of the vector for which each word is represented
# the embedding layer of a neural network, output_dim refers to the size of the dense vectors that the layer will generate for each input token (word). 
# essentially, it is the number of dimensions in which each word will be represented.
embedding_dimension = 150

# lstm units
lstm_units = 64

model = keras.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dimension),
    keras.layers.Bidirectional(keras.layers.LSTM(lstm_units, dropout=0.2)),
    keras.layers.Dense(lstm_units, activation='relu'),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(output_dimenstion, activation='softmax')
])

optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy', 'f1_score', 'precision', 'recall'])

model.summary()

Input Dimension: 24, Output Dimension: 24


#### 5. Train the Model

In [10]:
model.fit(padded_sequences, targets, epochs=epochs, verbose=1)

Epoch 1/75
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.1331 - f1_score: 0.0411 - loss: 2.9527 - precision: 0.1684 - recall: 0.0015   
Epoch 2/75
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.4584 - f1_score: 0.2245 - loss: 1.7776 - precision: 0.8068 - recall: 0.2294
Epoch 3/75
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.7525 - f1_score: 0.4883 - loss: 0.8846 - precision: 0.9354 - recall: 0.6176
Epoch 4/75
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8564 - f1_score: 0.6477 - loss: 0.4763 - precision: 0.9524 - recall: 0.7724
Epoch 5/75
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9227 - f1_score: 0.8282 - loss: 0.3015 - precision: 0.9623 - recall: 0.8598
Epoch 6/75
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9649 - f1_score: 0.8965

<keras.src.callbacks.history.History at 0x1714b0bd360>

In [11]:
# x_train.shape

In [12]:
# print("Model Input Shape:", model.input_shape)
# print("X_test shape", x_test.shape)
# print(x_test.dtype)
# print(x_test)

In [13]:
# Run a simple prediction with a smaller subset
# sample_input = x_test[:1]  # Take a single sample
# print("Sample input shape:", sample_input.shape)
# print("Sample input shape:", sample_input.shape)
# sample_prediction = model.predict(sample_input)
# print("Sample prediction:", sample_prediction)

# # Check the model summary
# model.summary()

In [14]:
# try:
#     predictions = model.predict(x_test)
#     print("Predictions shape:", predictions.shape)
# except Exception as e:
#     print("Error during prediction on entire dataset:", e)
# print("Checking for NaNs in x_test:", np.isnan(x_test).any())
# print("Checking for infs in x_test:", np.isinf(x_test).any())
# print("x_test mean:", np.mean(x_test))
# print("x_test std deviation:", np.std(x_test))

In [15]:
# predictions = []
# for test_example in x_test:
#     test_example = test_example.reshape(1, -1)
#     print("Test example shape:", test_example.shape)
#     predictions = model.predict(test_example)

# print(x_test.shape)
# # x_test = x_test.reshape(x_test.shape[0], x_test.shape[1])
# # print(x_test.shape)
# # print(x_test.reshape(1, -1).shape)
# # x_test_final = []
# # for test in x_test:
# #     test = test.reshape(1, -1)
# #     print(test.shape)
# #     x_test_final.append(test)

# # print("X_test_final shape:", x_test_final[0].shape)

# print(x_test[0])
# print(x_test[:1])
# print(x_test[:1].shape)

# print(x_test.shape)

#### 6. Testing the Model

In [24]:
user = input("Sentence:")

test_sequences = tokenizer.texts_to_sequences([user])

test_padded_sequences = keras.preprocessing.sequence.pad_sequences(test_sequences, padding='pre')

predictions = model.predict(test_padded_sequences)

predicted_intent_index = np.argmax(predictions)

predicted_intent = index_to_intent[predicted_intent_index]

print(f"Predicted Intent: {predicted_intent}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Predicted Intent: Libraries


In [17]:
# model.save("./models/full_intent_detection_model.h5")
model.save("./models/full_intent_detection_model.keras")

In [18]:
# import tensorflow as tf
# import tf2onnx

# # Load your TensorFlow model
# model = tf.keras.models.load_model('models\intent_detection_model.keras')

# # Convert the model to ONNX format
# spec = (tf.TensorSpec((None, *model.input_shape[1:]), tf.float32, name="input"),)
# output_path = "../models/intent_detection.onnx"
# model_proto, _ = tf2onnx.convert.from_keras(model, input_signature=spec, output_path=output_path)

# # Save the ONNX model
# with open(output_path, "wb") as f:
#     f.write(model_proto.SerializeToString())


In [19]:
# import tensorflow as tf
# import tf2onnx
# import numpy as np

# # Define or load your Sequential model
# model = tf.keras.models.Sequential([
#     tf.keras.layers.Dense(10, activation='relu', input_shape=(784,)),
#     tf.keras.layers.Dense(10, activation='softmax')
# ])

# # Provide a dummy input to the model for shape inference
# spec = (tf.TensorSpec((None, 784), tf.float32, name="input"),)

# # Convert the model to ONNX format
# output_path = "path/to/save/model.onnx"
# # model_proto, _ = tf2onnx.convert.from_keras(model, input_signature=spec, output_path=output_path)
# model_proto, _ = tf2onnx.convert.from_keras(model, input_signature=spec, output_path=output_path)

# # Save the ONNX model
# with open(output_path, "wb") as f:
#     f.write(model_proto.SerializeToString())


In [20]:
loaded_model = tf.keras.models.load_model('models/full_intent_detection_model.keras')

user_input = 'output the variable name to the user'

test_sequences = tokenizer.texts_to_sequences([user_input])

test_padded_sequences = keras.preprocessing.sequence.pad_sequences(test_sequences, padding='pre')

predictions = loaded_model.predict(test_padded_sequences)

predicted_intent_index = np.argmax(predictions)

predicted_intent = index_to_intent[predicted_intent_index]

print(f"Predicted Intent: {predicted_intent}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step
Predicted Intent: Output


In [21]:
loaded_model = tf.keras.models.load_model('models/full_intent_detection_model.keras')


In [22]:
sentences =[
    "bitwise x and y",
    "bitwise x or y",
    "perform bitwise and on number and 10",
    "perform bitwise or on number and 10",
    "shift left x by 2",
    "shift right x by 2",
    "shift x left by 2",
    "shift x right by 2",
    "xor x and y",
    "perform bitwise xor on x and y",
    "perform bitwise xor on x and 10"
]
for sentence in sentences:
    test_sequences = tokenizer.texts_to_sequences([sentence])

    test_padded_sequences = keras.preprocessing.sequence.pad_sequences(test_sequences, padding='pre')

    predictions = loaded_model.predict(test_padded_sequences)

    predicted_intent_index = np.argmax(predictions)

    predicted_intent = index_to_intent[predicted_intent_index]

    print(f"Predicted Intent: {predicted_intent}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step
Predicted Intent: Bitwise Operation
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Predicted Intent: Bitwise Operation
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 410ms/step
Predicted Intent: Bitwise Operation
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Predicted Intent: Bitwise Operation
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Predicted Intent: Bitwise Operation
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Predicted Intent: Bitwise Operation
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Predicted Intent: Bitwise Operation
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Predicted Intent: Bitwise Operation
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Predicted Intent: Bitwise Operation
[1m1/1[0m [32m━━━━━━━━━