In [55]:
# Imports 
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer # preprocess text data to tokenize it into sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences # to pad tokenized sequence to a fixed length
import numpy as np
import random
import json
from utils import * 
import warnings
# ignore warnings
warnings.filterwarnings('ignore')

In [56]:
# load the dataset
with open('intents.json', 'r') as f:
    data = json.load(f)

print(data.keys())
print(type(data['intents']))
print(len(data['intents']))
print(data['intents'][0].keys())
print(data['intents'][-1])

dict_keys(['intents'])
<class 'list'>
15
dict_keys(['intent', 'keywords', 'responses', 'extension', 'entities'])
{'intent': 'File System', 'keywords': ['create a file', 'open file'], 'responses': ['File System intent detected'], 'extension': {'function': '', 'entities': False, 'responses': []}, 'entities': []}


In [57]:
# Data preprocessing
# list of intents
intents = []
unique_intents = []

text_input = [] # all text data to create a corpus

response_for_intent = {} # dictionary mapping intent with appropriate response

for intent in data['intents']:
    # print("Intent:",intent)
    # list of unique intents
    if intent['intent'] not in unique_intents:
        unique_intents.append(intent['intent'])

    for keyword in intent['keywords']:
        # cleaning is done before adding text to corpus
        # print(keyword, clean(keyword))
        text_input.append(clean(keyword))
        intents.append(intent['intent'])
    
    if intent['intent'] not in response_for_intent:
        response_for_intent[intent['intent']] = []
    
    for response in intent['responses']:
        response_for_intent[intent['intent']].append(response)

print("Unique Intents:", unique_intents)
print("Unique Intents:", len(unique_intents))
print("Intents :", intents)
print("Number of Intents:", len(intents))
print("Text Input:", text_input)
print('Length of text_input:', len(text_input))
print("Sample Response: ", response_for_intent)

Unique Intents: ['Declaration', 'Assignment Operation', 'Conditional Statement', 'Iterative Operation', 'Insertion', 'Array Operation', 'Operators', 'OOP Operations', 'Casting', 'Exception Handling', 'I/O Operations', 'Assertion', 'Libraries', 'IDE Interactions', 'File System']
Unique Intents: 15
Intents : ['Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Declaration', 'Assignment Operation', 'Assignment Operation', 'Assignment Operation', 'Assignment Operation', 'Assignment Operation', 'Assignment Operation', 'Assignment Operation', 'Assignment Operation', 'Assignment Operation', 'Assignmen

In [58]:
# Tokenization and Embedding
tokenizer = Tokenizer(filters='', oov_token='<unk>') # creates a tokenizer object
tokenizer.fit_on_texts(text_input) # creates the vocabulary of the tokenizer
sequences = tokenizer.texts_to_sequences(text_input) # maps the text to sequences of integers
print(sequences[:10]) # each sequence is a list of integers where each integer represents a word in the sentence
padded_sequences = pad_sequences(sequences, padding='pre') # pad the sequences to a fixed length for the model input
print('Shape of Input Sequence:', padded_sequences.shape)   # the shape of the input sequence is (number of keyword sentences, length of the longest sequence)
print(padded_sequences[:10])

[[51], [29], [29, 6], [29, 39], [77], [77, 3, 51, 39, 95], [30], [30, 6], [30, 39], [42]]
Shape of Input Sequence: (278, 16)
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 51]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 29  6]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 39]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 77]
 [ 0  0  0  0  0  0  0  0  0  0  0 77  3 51 39 95]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 30]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 30  6]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 30 39]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 42]]


In [59]:
# Feature extraction
intent_to_index = {} # to map each intent to a unique index
index_to_intent = {} # to map each index to a unique intent
categorical_target = [] # similar to the intents list but with each intent replaced by its index
index = 0

for intent in intents: # intents is a list of intents for each keyword in the dataset (if we have 3 keywords for a single intent, the intent will be repeated 3 times in the list)
    if intent not in intent_to_index: # because we have multiple entries for the same intent
        print(intent)
        intent_to_index[intent] = index # map the intent to a unique index
        index_to_intent[index] = intent # map the index to the intent
        index += 1
    categorical_target.append(intent_to_index[intent]) # replace the intent with its index

print('Categorial Target:', categorical_target)
num_classes = len(intent_to_index) # number of possible classifications in our project (number of unique intents)
print('Number of Intents :', num_classes)
print(index_to_intent)

Declaration
Assignment Operation
Conditional Statement
Iterative Operation
Insertion
Array Operation
Operators
OOP Operations
Casting
Exception Handling
I/O Operations
Assertion
Libraries
IDE Interactions
File System
Categorial Target: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11

In [60]:
# One-Hot Encoding
categorical_vec = tf.keras.utils.to_categorical(categorical_target, num_classes=num_classes, dtype='int32') # this will convert the categorical target to one-hot encoded vector where each row is a vevtor (keyword sentence) and the 1 represents its classification (intent)
print('Shape of Categorial Vector', categorical_vec.shape) # the shape of the categorical vector is (number of keyword sentences, number of unique intents)
print(categorical_vec[:5])

Shape of Categorial Vector (278, 15)
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [63]:
epochs = 100
embed_dim = 200
lstm_num = 50
output_dim = categorical_vec.shape[1]
input_dim = len(unique_intents)
print(f"Input Dimension :{input_dim}, Output Dimension :{output_dim}")

Input Dimension :15, Output Dimension :15


In [67]:
print(len(tokenizer.word_index) + 1)
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(len(tokenizer.word_index) + 1, embed_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_num, dropout=0.1)),
    tf.keras.layers.Dense(lstm_num, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(output_dim, activation='softmax')
])

optimizer = tf.optimizers.legacy.Adam(lr=0.001)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

model.fit(padded_sequences, categorical_vec, epochs=epochs, verbose=0)

561
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 200)         112200    
                                                                 
 bidirectional_4 (Bidirecti  (None, 100)               100400    
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 50)                5050      
                                                                 
 dropout_4 (Dropout)         (None, 50)                0         
                                                                 
 dense_9 (Dense)             (None, 15)                765       
                                                                 
Total params: 218415 (853.18 KB)
Trainable params: 218415 (853.18 KB)
Non-trainable params: 0 (0.00 Byte)
__________

<keras.src.callbacks.History at 0x1731dbb3a60>

In [72]:
# Evaluation of the model
# this should be the output from the speech recognition module
# test_text_inputs = ["declare and new variable",
#                     "iterate over the array",
#                     "compare between x and y",
#                     "if the values are equal",
#                     "define a function",
#                     "set the value of x to 10",
#                     "make a loop",
#                     "make a function"]

test_intents = ["Iterative Operation"]

test_sequences = tokenizer.texts_to_sequences(["loop through the entire array called arr"])
test_padded_sequences = pad_sequences(test_sequences, padding='pre')
print(test_padded_sequences)
test_labels = np.array([unique_intents.index(intent)
                       for intent in test_intents])
print(test_labels)
test_labels = tf.keras.utils.to_categorical(test_labels, num_classes=num_classes)
print(test_labels)
print(index_to_intent[np.where(test_labels[0]==1)[0][0]])
loss, accuracy = model.evaluate(test_padded_sequences, test_labels)

[[ 24  67   2   1  17 285   1]]
[3]
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Iterative Operation


In [73]:
# Prediction 
def response(sentence):
	sent_tokens = []
	
	# Split the input sentence into words
	words = sentence.split()
	
	# Convert words to their corresponding word indices
	for word in words:										 
		if word in tokenizer.word_index:
			sent_tokens.append(tokenizer.word_index[word])
		else:
			# Handle unknown words
			sent_tokens.append(tokenizer.word_index['<unk>'])
			
	sent_tokens = tf.expand_dims(sent_tokens, 0)
	
	#predict numerical category
	pred = model(sent_tokens) 
	
	#category to intent
	pred_class = np.argmax(pred.numpy(), axis=1)
				 
	# random response to that intent
	return random.choice(
		response_for_intent[index_to_intent[pred_class[0]]]), index_to_intent[pred_class[0]]


In [76]:
# Intent Recognition
print("Note: Enter 'quit' to break the loop.")   
while True:                                                
    query = input('You: ')
    if query.lower() == 'quit':
        break
    bot_response, type = response(query)
    print(f'Geek: {bot_response} -- TYPE: {type}\n')


Note: Enter 'quit' to break the loop.
Geek: assignment intent detected -- TYPE: Assignment Operation

Geek: iterative intent detected -- TYPE: Iterative Operation

Geek: Array intent detected -- TYPE: Array Operation

