# Imports

In [86]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer

from Logistic_Regression import *
from collections import defaultdict, Counter
import math
# LabelBinarizer
from sklearn.preprocessing import LabelBinarizer



# Utilities

In [57]:

# Preprocessing function
    # Convert all to lowercase
    # Remove punctuation
    # Collapse repeated whitespace (stop-word removal is currently disabled)

# Hand-written English stop-word list (articles, prepositions, pronouns,
# question words, quantifiers). Kept as a mutable set for O(1) membership tests.
stop_words = {
    'a', 'the', 'is', 'it', 'to', 'of', 'in', 'for', 'on', 'with', 'as', 'at', 'by', 'from',
    'an', 'be', 'are', 'was', 'were', 'this', 'that', 'these', 'those',
    'i', 'you', 'he', 'she', 'we', 'they',
    'me', 'him', 'her', 'us', 'them',
    'my', 'your', 'his', 'its', 'our', 'their',
    'mine', 'yours', 'hers', 'ours', 'theirs',
    'what', 'which', 'who', 'whom', 'whose', 'where', 'when', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'too', 'very',
}
def preprocess_text(text):
    """Normalise a raw sentence before feature extraction.

    The text is lower-cased, every character listed in ``string.punctuation``
    is deleted, and runs of whitespace are collapsed to single spaces.
    Stop words are not removed.

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        The cleaned sentence as a single space-separated string.
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    lowered = text.lower().translate(punctuation_stripper)
    # split() without arguments drops leading/trailing whitespace and splits
    # on any whitespace run, so re-joining yields single-space separation.
    tokens = lowered.split()
    return ' '.join(tokens)



# Load data & Pre-process

In [58]:

def load_intent_data(intent_data_path):
	"""Load the intent dataset from JSON into a pre-processed DataFrame.

	The JSON file must hold an object mapping each intent name to a list of
	example sentences.

	Args:
		intent_data_path: Path to the JSON file.

	Returns:
		A DataFrame with one row per sentence and the columns ``intent``,
		``sentence`` and ``processed_sentence`` (the sentence after
		``preprocess_text``).
	"""
	# Decode explicitly as UTF-8 so the result does not depend on the
	# platform's default text encoding.
	with open(intent_data_path, 'r', encoding='utf-8') as file:
		data = json.load(file)

	# Flatten {intent: [sentence, ...]} into one record per sentence.
	rows = [
		{'intent': intent, 'sentence': sentence}
		for intent, sentences in data.items()
		for sentence in sentences
	]

	# Naming the columns keeps the frame well-formed even when there are no rows.
	df = pd.DataFrame(rows, columns=['intent', 'sentence'])

	# Apply preprocessing
	df['processed_sentence'] = df['sentence'].apply(preprocess_text)

	return df


In [81]:
# Load the dataset; the path is relative to the notebook's working directory.
df = load_intent_data('../intent_detection_dataset/final_intents_dataset.json')
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,intent,sentence,processed_sentence
0,Variable Declaration,make start time as double and initialize 0.000123,make start time as double and initialize 0000123
1,Variable Declaration,declare min value as integer and value 131313,declare min value as integer and value 131313
2,Variable Declaration,define settings as boolean and value false,define settings as boolean and value false
3,Variable Declaration,define y as integer and assign to 12345,define y as integer and assign to 12345
4,Variable Declaration,initialize k as string and initialize it with ...,initialize k as string and initialize it with ...
...,...,...,...
2910,Mouse Click,click right,click right
2911,Mouse Click,left click the mouse,left click the mouse
2912,Mouse Click,right click the mouse,right click the mouse
2913,Mouse Click,click the mouse button,click the mouse button


# Feature Extraction

In [82]:

# Extract N-grams
def extract_ngrams(text, n=3):
    """Return all word-level n-grams of ``text`` for sizes 1 through ``n``.

    Args:
        text: Whitespace-separated sentence.
        n: Largest n-gram size to produce (inclusive).

    Returns:
        A list of space-joined n-grams: all unigrams first, then all bigrams,
        and so on, each group in left-to-right order. Sizes longer than the
        sentence contribute nothing.
    """
    tokens = text.split()
    grams = []
    for size in range(1, n + 1):
        last_start = len(tokens) - size
        for start in range(last_start + 1):
            grams.append(' '.join(tokens[start:start + size]))
    return grams

# Compute term frequencies
def compute_term_frequencies(corpus):
    """Count the n-grams of every document in ``corpus``.

    Args:
        corpus: Iterable of pre-processed sentences.

    Returns:
        A ``defaultdict(Counter)`` mapping each document's position in the
        corpus to a ``Counter`` of its n-grams (as produced by
        ``extract_ngrams`` with its default size).
    """
    per_document = defaultdict(Counter)
    per_document.update(
        (position, Counter(extract_ngrams(sentence)))
        for position, sentence in enumerate(corpus)
    )
    return per_document

# Compute document frequencies
def compute_document_frequencies(term_frequencies):
    """Count, for every term, how many documents contain it.

    Args:
        term_frequencies: Mapping of document id to a mapping of term -> count.

    Returns:
        A ``Counter`` mapping each term to the number of documents whose
        term mapping has that term as a key.
    """
    containing_docs = Counter()
    for counts in term_frequencies.values():
        # Each key appears once per document, so this adds exactly 1 per term.
        containing_docs.update(counts.keys())
    return containing_docs

# Compute TF-IDF
def compute_tfidf(corpus):
    """Compute n-gram TF-IDF scores for every document in ``corpus``.

    The score of a term in a document is
    ``(count / distinct_terms_in_document) * log(N / (df + 1))`` where ``N``
    is the corpus size and ``df`` the number of documents containing the term.
    Note that the term-frequency part is normalised by the number of
    *distinct* n-grams in the document, and that the IDF part is zero or
    negative for terms that occur in (almost) every document.

    Args:
        corpus: Sequence of pre-processed sentences (must support ``len``).

    Returns:
        A tuple ``(tfidf, document_frequencies)``: ``tfidf`` is a
        ``defaultdict(dict)`` mapping every document index to its
        ``{term: score}`` dict, and ``document_frequencies`` is the
        ``Counter`` returned by ``compute_document_frequencies``.
    """
    term_frequencies = compute_term_frequencies(corpus)
    document_frequencies = compute_document_frequencies(term_frequencies)
    N = len(corpus)
    tfidf = defaultdict(dict)
    for idx, tf in term_frequencies.items():
        distinct_terms = len(tf)
        # Assign the whole dict so that a document without any n-grams still
        # gets an (empty) entry; otherwise len(tfidf) would be smaller than
        # the corpus and matrix rows would no longer line up with documents.
        tfidf[idx] = {
            term: (count / distinct_terms) * math.log(N / (document_frequencies[term] + 1))
            for term, count in tf.items()
        }
    return tfidf, document_frequencies

# Create TF-IDF matrix
def create_tfidf_matrix(tfidf, vocabulary, num_docs=None):
    """Turn per-document TF-IDF dicts into a dense document-by-term matrix.

    Args:
        tfidf: Mapping of integer document index -> ``{term: score}``.
        vocabulary: Mapping of term -> column index.
        num_docs: Number of rows to allocate. When omitted, the matrix has
            ``max(document index) + 1`` rows, so gaps in the indices (for
            example documents that produced no terms) become all-zero rows
            instead of causing an out-of-bounds write.

    Returns:
        A ``numpy.ndarray`` of shape ``(num_docs, len(vocabulary))`` holding
        the scores; terms missing from ``vocabulary`` are ignored.
    """
    if num_docs is None:
        # Equals len(tfidf) whenever the indices are the contiguous 0..n-1.
        num_docs = max(tfidf, default=-1) + 1
    num_terms = len(vocabulary)
    # Initialize the matrix with zeros
    tfidf_matrix = np.zeros((num_docs, num_terms))
    # Populate the matrix with TF-IDF scores
    for doc_idx, term_scores in tfidf.items():
        for term, score in term_scores.items():
            term_idx = vocabulary.get(term)
            if term_idx is not None:
                tfidf_matrix[doc_idx, term_idx] = score
    return tfidf_matrix

# Prepare corpus
# Prepare corpus: one pre-processed sentence per row of df.
corpus = df['processed_sentence'].tolist()

# Compute TF-IDF scores and per-term document frequencies.
tfidf, document_frequencies = compute_tfidf(corpus)

# Collect every n-gram that received a score in any document.
all_ngrams = set()
for tf in tfidf.values():
    all_ngrams.update(tf.keys())

# Number the terms in sorted order. Iterating the set directly would give a
# column order that changes between interpreter sessions (string hashing is
# randomised per process), so weights from a previously pickled model would
# no longer line up with the features built by predict_intent.
vocabulary = {term: idx for idx, term in enumerate(sorted(all_ngrams))}

# Create TF-IDF matrix (rows = documents, columns = vocabulary terms).
X = create_tfidf_matrix(tfidf, vocabulary)
y = df['intent']

print("TF-IDF Matrix:")
print(X.shape)
print("Shape of y:")
print(y.shape)


TF-IDF Matrix:
(2915, 15002)
Shape of y:
(2915,)


In [83]:
# Encode the intent names as integers (the original note here says 24 labels).
# Indices follow the order in which each intent first appears in df.
labels = df['intent'].unique()
label_to_index = {label: idx for idx, label in enumerate(labels)}
index_to_label = {idx: label for label, idx in label_to_index.items()}
# Encode from df['intent'] rather than from `y` itself: re-encoding `y` in
# place made this cell fail with a KeyError when it was run a second time,
# because `y` then already held integers instead of intent names.
y = np.array([label_to_index[label] for label in df['intent']])

# Split data into training and testing sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(2332, 15002)
(2332,)
(583, 15002)
(583,)


# Model and training

In [84]:
# Function to predict intent of custom input
def predict_intent(custom_sentence, model):
    """Predict the intent label of a free-form sentence.

    The sentence is pre-processed and converted into a TF-IDF row using the
    notebook globals ``vocabulary``, ``document_frequencies`` and ``corpus``,
    then classified by ``model``.

    Args:
        custom_sentence: Raw input sentence.
        model: Fitted classifier exposing ``predict`` on a 2-D array and
            returning integer class indices.

    Returns:
        The intent name looked up in ``index_to_label``.
    """
    # Preprocess the custom input
    processed_sentence = preprocess_text(custom_sentence)
    # Convert to N-gram TF-IDF features
    term_counts = Counter(extract_ngrams(processed_sentence))
    custom_tfidf = np.zeros(len(vocabulary))
    # Use exactly the weighting compute_tfidf applied to the training rows:
    # TF is normalised by the number of *distinct* n-grams and IDF uses the
    # training corpus size. The previous total-n-gram denominator and
    # len(corpus) + 1 produced features on a different scale from the ones
    # the model was fitted on.
    distinct_terms = len(term_counts)
    N = len(corpus)
    for term, count in term_counts.items():
        term_idx = vocabulary.get(term)
        if term_idx is None:
            # N-gram never seen in the training corpus: no column for it.
            continue
        tf = count / distinct_terms
        idf = math.log(N / (1 + document_frequencies.get(term, 0)))
        custom_tfidf[term_idx] = tf * idf
    # Predict the intent (the model expects a batch, hence the extra axis)
    predicted_intent_idx = model.predict(np.array([custom_tfidf]))[0]
    predicted_intent = index_to_label[predicted_intent_idx]
    return predicted_intent

In [99]:

# Train a baseline classifier (Logistic Regression) with default settings.
# NOTE(review): `LogisticRegression` is whichever binding won between the
# scikit-learn import and the later `from Logistic_Regression import *` -
# confirm this is the scikit-learn estimator.
lg_model = LogisticRegression()
lg_model.fit(X_train, y_train)

# Predict on test data
y_pred = lg_model.predict(X_test)

# Map integer class indices back to intent names
y_test_labels = [index_to_label[idx] for idx in y_test]
y_pred_labels = [index_to_label[idx] for idx in y_pred]

# Evaluate the model on the held-out split
print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))
# print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))


Accuracy: 0.9407665505226481


# Test

In [100]:


# Test the baseline model (lg_model) with a hand-written sentence.
custom_sentence = "i want a method named ADD "
predicted_intent = predict_intent(custom_sentence, lg_model)
print(f"Predicted intent for '{custom_sentence}':\n {predicted_intent}")



Predicted intent for 'i want a method named ADD ':
 Function Declaration


# Custom Logistic Regression

In [87]:

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        The element-wise sigmoid of ``z`` (NumPy scalar or array), in [0, 1].
    """
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))) == exp(-logaddexp(0, -z)).
    # logaddexp never materialises exp(-z), so large negative z no longer
    # overflows to inf (and no RuntimeWarning is emitted).
    return np.exp(-np.logaddexp(0.0, np.negative(z)))

class MulticlassLogisticRegression:
    """One-vs-rest logistic regression trained by full-batch gradient descent.

    ``fit`` trains one weight vector (bias weight first) per distinct label.
    ``predict`` returns the label whose binary classifier reports the highest
    probability. The class relies on the module-level ``sigmoid`` function.
    """

    def __init__(self, lr: float, epochs: int, probability_threshold: float = 0.5, random_state=None):
        """Store the hyper-parameters; no training happens here.

        Args:
            lr: Gradient-descent step size.
            epochs: Number of gradient steps taken for each class.
            probability_threshold: Stored on the instance but not read by
                any method of this class.
            random_state: Seed for the weight initialisation (``None`` for a
                non-reproducible initialisation).
        """
        self.lr = lr  # The learning rate
        self.epochs = epochs  # The number of training epochs
        self.probability_threshold = probability_threshold  # Threshold for classification (unused here)
        self.random_state = random_state  # Seed for reproducibility
        self.weights = []  # One weight vector per class, ordered like classes_
        self.classes_ = None  # Sorted distinct labels seen by fit()

    def _prepare_input(self, X):
        """Return ``X`` as an array with a leading column of ones (bias input)."""
        if not isinstance(X, np.ndarray):
            X = np.array(X)
        # Add a new input with value 1 to each example (bias term)
        ones = np.ones((X.shape[0], 1), dtype=X.dtype)
        return np.concatenate((ones, X), axis=1)

    def _initialize(self, num_weights: int, stdev: float = 0.01):
        """Draw ``num_weights`` initial weights from N(0, stdev**2).

        A private ``RandomState`` seeded with ``random_state`` yields the same
        numbers as seeding the global NumPy generator would, without
        resetting global random state as a side effect.
        """
        rng = np.random.RandomState(self.random_state)
        return rng.randn(num_weights) * stdev

    def _gradient(self, X, y, weights):
        """Gradient of the summed log-loss with respect to ``weights``."""
        # X.T @ (p - y) equals (-X.T) @ (y - p) but does not build a negated
        # copy of the whole design matrix on every step.
        residual = sigmoid(np.dot(X, weights)) - y
        return np.dot(X.T, residual)

    def _update(self, X, y, weights):
        """Apply one gradient-descent step to ``weights`` in place and return it."""
        gradient = self._gradient(X, y, weights)
        weights -= self.lr * gradient
        return weights

    def fit(self, X, y):
        """Train one binary classifier per distinct label in ``y``.

        Any weights from an earlier ``fit`` call are discarded, so refitting
        replaces the model instead of stacking extra classifiers onto it.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of labels, one per sample.

        Returns:
            ``self``.
        """
        X = self._prepare_input(X)
        labels = np.asarray(y).ravel()
        # classes_ is sorted; class_positions[i] is the index into classes_
        # of sample i. Every class gets its own column, including when there
        # are only two classes.
        self.classes_, class_positions = np.unique(labels, return_inverse=True)

        trained = []
        for class_idx in range(len(self.classes_)):
            target = (class_positions == class_idx).astype(float)
            weights = self._initialize(X.shape[1])
            for _ in range(self.epochs):
                weights = self._update(X, target, weights)
            trained.append(weights)
        self.weights = trained
        return self

    def predict_proba(self, X):
        """Return per-class sigmoid scores, shape (n_samples, n_classes).

        Each column is an independent one-vs-rest probability; rows are not
        normalised to sum to one.

        Raises:
            ValueError: If the model has no trained weights.
        """
        if len(self.weights) == 0:
            raise ValueError("MulticlassLogisticRegression is not fitted yet; call fit() first.")
        X = self._prepare_input(X)
        return np.column_stack([sigmoid(np.dot(X, weights)) for weights in self.weights])

    def predict(self, X):
        """Return the most probable label for every row of ``X``.

        Labels are the values seen during ``fit``. For labels 0..K-1 this is
        identical to the arg-max position. Instances restored from a pickle
        that predates ``classes_`` fall back to returning positions.
        """
        best = np.argmax(self.predict_proba(X), axis=1)
        classes = getattr(self, "classes_", None)
        if classes is None:
            return best
        return classes[best]
    

# Train the custom one-vs-rest model on the same split and score it on the held-out rows.
model = MulticlassLogisticRegression(lr=0.1, epochs=500, probability_threshold=0.5, random_state=42)
model.fit(X_train, y_train)
# Rebinds y_pred, replacing the predictions produced by the earlier lg_model cell.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9742710120068611


In [91]:
# Save the model
# Only the classifier object is written here. predict_intent also reads the
# notebook globals vocabulary, document_frequencies, corpus and index_to_label,
# none of which this cell saves.
import pickle
with open('logisticRegressionModel2.pkl', 'wb') as file:
	pickle.dump(model, file)

# Load the model
# with open('logisticRegressionModel.pkl', 'rb') as file:
# 	model1 = pickle.load(file)
# Reload the file written above into model2, which later cells pass to predict_intent.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('logisticRegressionModel2.pkl', 'rb') as file:
	model2 = pickle.load(file)

# function save the model
def save_model(model, model_path):
	"""Pickle ``model`` to ``model_path``, overwriting any existing file.

	The object is serialised in memory first, so if it cannot be pickled the
	exception is raised before the destination is opened and an existing
	file at ``model_path`` is left intact rather than truncated. The whole
	pickle is therefore held in memory once before it is written.

	Args:
		model: Any picklable object.
		model_path: Destination file path.
	"""
	payload = pickle.dumps(model)
	with open(model_path, 'wb') as file:
		file.write(payload)

# function load the model
def load_model(model_path):
	"""Read and return the object pickled at ``model_path``.

	Unpickling can execute arbitrary code, so only use this on files from a
	trusted source.
	"""
	with open(model_path, 'rb') as source:
		return pickle.load(source)

In [128]:
# Per-class precision/recall/F1 on the test split. Rows are keyed by integer
# class index; use index_to_label to translate them back to intent names.
# y_pred is whatever the most recently executed prediction cell assigned.
classRep = classification_report(y_test,y_pred)
print(classRep)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
           1       1.00      1.00      1.00        46
           2       1.00      1.00      1.00        23
           3       1.00      1.00      1.00         8
           4       1.00      0.83      0.91        12
           5       0.95      1.00      0.97        36
           6       0.95      1.00      0.98        21
           7       0.92      1.00      0.96        12
           8       1.00      0.95      0.98        22
           9       1.00      0.95      0.97        20
          10       0.95      1.00      0.97        77
          11       1.00      1.00      1.00        33
          12       0.94      1.00      0.97        15
          13       1.00      1.00      1.00        11
          14       1.00      1.00      1.00        17
          15       1.00      0.50      0.67         6
          16       1.00      0.75      0.86         4
          17       0.96    

In [97]:
# Hand-written spot-check sentences, presumably all meant to be recognised as
# assignment commands (going by the variable name).
assignment = ["assign x to y",
    "set the value of name to yasmine",
    "assign yasmine to name",
    "x equals y",
    "number is equal to 50",
    "max is equal 323.88",
    "assign max value to current value",
    "update max value with value"]
# Test the reloaded model (model2) with each sentence and print its prediction.
# custom_sentence = "create a file named 'test.txt'"
for sent in assignment:
	predicted_intent = predict_intent(sent, model2)
	print(f"Predicted intent: {predicted_intent}\n")

# Editor/terminal style commands; the loop that would classify them is
# commented out below, so this list is currently only defined, not used.
ide = ["select lines 1 through 5",
    "select line 9",
    "highlight from line 3 to line 7",
    "go to line 10",
    "delete line 6",
    "delete lines 2 through 4",
    "focus terminal",
    "kill terminal",
    "open terminal",
    "new terminal",
    "undo",
    "redo",
    "copy",
    "paste"]
# for sent in ide:
# 	predicted_intent = predict_intent(sent, model2)
# 	print(f"Predicted intent for ide:\n {predicted_intent}")
# predicted_intent = predict_intent(custom_sentence, model2)
# print(f"Predicted intent for '{custom_sentence}':\n {predicted_intent}")


Predicted intent: Mathematical Operation

Predicted intent: Mathematical Operation

Predicted intent: Mathematical Operation

Predicted intent: Assignment Operation

Predicted intent: Assignment Operation

Predicted intent: Conditional Statement

Predicted intent: Assignment Operation

Predicted intent: Output



In [101]:
# Spot-check declaration-style sentences against the reloaded model (model2).
# Mixed-case input is fine: predict_intent lower-cases text via preprocess_text.
test =[
    "i want a method named ADD ",
	"Declare function with name SUB ",
	"Declare variable and assign it to 3"
]
for sent in test:
	predicted_intent = predict_intent(sent, model2)
	print(f"Predicted intent: {predicted_intent}\n")

Predicted intent: Function Declaration

Predicted intent: Function Declaration

Predicted intent: Variable Declaration

