# Imports

In [171]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer

from Logistic_Regression import *

# Download stopwords
# nltk.download('stopwords')


# Utilities

In [172]:

# Preprocessing function
    # Convert all to lowercase
    # Remove punctuations
    # Remove stopwords

stop_words = {'a', 'the', 'is', 'it', 'to', 'of', 'in', 'for', 'on', 'with', 'as', 'at', 'by', 'from',
                 'an', 'be', 'are', 'was', 'were', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she',
                   'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their',
                     'mine', 'yours', 'his', 'hers', 'ours', 'theirs', 'what', 'which', 'who', 'whom', 'whose',
                       'where', 'when', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
                         'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'too', 'very'}
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # text = ' '.join(word for word in text.split() if word not in stop_words)
    text = ' '.join(word for word in text.split())
    return text

In [173]:
# Function to predict intent of custom input
def predict_intent(custom_sentence, model):
    processed_sentence = preprocess_text(custom_sentence)
    X_custom = vectorizer.transform([processed_sentence])
    predicted_intent = model.predict(X_custom)
    return predicted_intent[0]

# Load data & Pre-process

In [176]:

# Load JSON file
with open('../intent_detection_dataset/final_intents_dataset.json', 'r') as file:
    data = json.load(file)

# Convert to DataFrame
rows = []
for intent, sentences in data.items():
    for sentence in sentences:
        rows.append({'intent': intent, 'sentence': sentence})

df = pd.DataFrame(rows)

# Apply preprocessing
df['processed_sentence'] = df['sentence'].apply(preprocess_text)

#Display the DataFrame
df

Unnamed: 0,intent,sentence,processed_sentence
0,Variable Declaration,make start time as double and initialize 0.000123,make start time as double and initialize 0000123
1,Variable Declaration,declare min value as integer and value 131313,declare min value as integer and value 131313
2,Variable Declaration,define settings as boolean and value false,define settings as boolean and value false
3,Variable Declaration,define y as integer and assign to 12345,define y as integer and assign to 12345
4,Variable Declaration,initialize k as string and initialize it with ...,initialize k as string and initialize it with ...
...,...,...,...
2865,Git Operation,push changes,push changes
2866,Git Operation,stage changes,stage changes
2867,Git Operation,push changes,push changes
2868,Git Operation,stage changes,stage changes


# Feature Extraction

In [120]:
########### TF-IDF ###########

# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df['processed_sentence'])
# y = df['intent']

########### N-Gram ###########
# vectorizer = CountVectorizer(ngram_range=(1, 2))  # Using unigrams and bigrams

# vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Using unigrams and bigrams

# X = vectorizer.fit_transform(df['processed_sentence'])
# y = df['intent']



In [217]:
from collections import defaultdict, Counter
import math

# Extract N-grams
def extract_ngrams(text, n=3):
    words = text.split()
    ngrams = []
    for i in range(1, n + 1):
        ngrams += [' '.join(words[j:j+i]) for j in range(len(words) - i + 1)]
    return ngrams

# Compute term frequencies
def compute_term_frequencies(corpus):
    term_frequencies = defaultdict(Counter)
    for idx, document in enumerate(corpus):
        ngrams = extract_ngrams(document)
        term_frequencies[idx] = Counter(ngrams)
    return term_frequencies

# Compute document frequencies
def compute_document_frequencies(term_frequencies):
    document_frequencies = Counter()
    for tf in term_frequencies.values():
        for term in tf.keys():
            document_frequencies[term] += 1
    return document_frequencies

# Compute TF-IDF
def compute_tfidf(corpus):
    term_frequencies = compute_term_frequencies(corpus)
    document_frequencies = compute_document_frequencies(term_frequencies)
    N = len(corpus)
    tfidf = defaultdict(dict)
    for idx, tf in term_frequencies.items():
        for term, count in tf.items():
            tfidf[idx][term] = (count / len(tf)) * math.log(N / (document_frequencies[term] + 1))
    return tfidf, document_frequencies

# Create TF-IDF matrix
def create_tfidf_matrix(tfidf, vocabulary):
    tfidf_matrix = np.zeros((len(tfidf), len(vocabulary)))
    for doc_idx, term_scores in tfidf.items():
        for term, score in term_scores.items():
            term_idx = vocabulary.get(term)
            if term_idx is not None:
                tfidf_matrix[doc_idx, term_idx] = score
    return tfidf_matrix

# Prepare corpus
corpus = df['processed_sentence'].tolist()

# Compute TF-IDF scores
tfidf, document_frequencies  = compute_tfidf(corpus)

# Create vocabulary from all N-grams
all_ngrams = set()
for tf in tfidf.values():
    all_ngrams.update(tf.keys())

vocabulary = {term: idx for idx, term in enumerate(all_ngrams)}

# Create TF-IDF matrix
X = create_tfidf_matrix(tfidf, vocabulary)
y = df['intent']

# Find non-zero values in x
# for i in range(len(X)):
#     for j in range(len(X[i])):
#         if X[i][j] != 0:
#              print(X[i][j])
#     break

# print(y)

#shape of y
print(y.shape)


(2870,)


In [218]:
# Encode labels = 24
labels = df['intent'].unique()
# print(labels.shape)
label_to_index = {label: idx for idx, label in enumerate(labels)}
index_to_label = {idx: label for label, idx in label_to_index.items()}
y = np.array([label_to_index[label] for label in y])
# for i in range(len(y)):
# 	print(y[i])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

(24,)


In [213]:
# print(X_train.shape)
# print(X_test.shape)
# print(y_train.shape)
# print(y_test.shape)

# for i in range(len(y)):
#     print(y[i])

In [226]:
# make y_new that is a one hot encoding vector for labels of each example
y_new = np.zeros((len(y), len(labels)))
for i in range(len(y)):
	y_new[i][y[i]] = 1


# get the one index in y_new[0]
for i in range(len(y_new[239])):
	if y_new[239][i] == 1:
		print(index_to_label[i])
print (y_new.shape)


Variable Declaration
(2870, 24)


In [227]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_new, test_size=0.2, random_state=42)

# Model and training

In [228]:

# Train a classifier (Logistic Regression)
lg_model = LogisticRegression()
lg_model.fit(X_train, y_train)

# Predict on test data
y_pred = lg_model.predict(X_test)

# Map index to label
y_test_labels = [index_to_label[idx] for idx in y_test]
y_pred_labels = [index_to_label[idx] for idx in y_pred]

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))
# print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))


ValueError: y should be a 1d array, got an array of shape (2296, 24) instead.

# Test

In [155]:
# Function to predict intent of custom input
def predict_intent(custom_sentence):
    # Preprocess the custom input
    processed_sentence = preprocess_text(custom_sentence)
    # Convert to N-gram TF-IDF features
    ngrams = extract_ngrams(processed_sentence)
    custom_tfidf = np.zeros(len(vocabulary))
    term_counts = Counter(ngrams)
    N = len(corpus) + 1
    for term, count in term_counts.items():
        if term in vocabulary:
            term_idx = vocabulary[term]
            tf = count / len(ngrams)
            idf = math.log(N / (1 + document_frequencies.get(term, 0)))
            custom_tfidf[term_idx] = tf * idf
    # Predict the intent
    predicted_intent_idx = lg_model.predict([custom_tfidf])[0]
    predicted_intent = index_to_label[predicted_intent_idx]
    return predicted_intent

# Test the model with custom input
custom_sentence = "i want a method named ADD "
predicted_intent = predict_intent(custom_sentence)
print(f"Predicted intent for '{custom_sentence}':\n {predicted_intent}")

Predicted intent for 'i want a method named ADD ':
 Function Declaration


In [129]:
# # Function to predict intent of custom input
# def predict_intent(custom_sentence):
#     # Preprocess the custom input
#     processed_sentence = preprocess_text(custom_sentence)
#     # Convert to N-gram features
#     X_custom = vectorizer.transform([processed_sentence])
#     # Predict the intent
#     predicted_intent_idx = lg_model.predict(X_custom)[0]
#     predicted_intent = index_to_label[predicted_intent_idx]
#     return predicted_intent

# # Test the model with custom input
# custom_sentence = "assign x equals 5"
# predicted_intent = predict_intent(custom_sentence)
# print(f"Predicted intent for '{custom_sentence}':\n {predicted_intent}")

ValueError: X has 5936 features, but LogisticRegression is expecting 6090 features as input.

In [114]:
# Test the model with custom input
custom_sentence = "i want a method called predict entity"
predicted_intent = predict_intent(custom_sentence, lg_model)
print(f"Predicted intent for '{custom_sentence}': \n{predicted_intent}")

Predicted intent for 'i want a method called predict entity': 
IDE Operation


# SVM

In [83]:
from sklearn.svm import SVC
# Train the model
model = SVC()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.980836236933798
Classification Report:
                         precision    recall  f1-score   support

  Activate Interactive       1.00      1.00      1.00         6
        Activate Mouse       1.00      1.00      1.00         7
       Array Operation       1.00      0.86      0.93        22
             Assertion       1.00      0.83      0.91         6
  Assignment Operation       0.90      0.75      0.82        12
     Bitwise Operation       1.00      1.00      1.00        20
               Casting       0.94      1.00      0.97        15
     Class Declaration       1.00      1.00      1.00         8
               Comment       1.00      0.82      0.90        11
 Conditional Operation       0.97      0.97      0.97        36
  Constant Declaration       1.00      1.00      1.00        46
           File System       1.00      0.96      0.98        25
              For Loop       0.95      1.00      0.98        21
  Function Declaration       1.00      1.00      1.

In [87]:
# Test the model with custom input
#declare constant x as string
#i want to declare a function
custom_sentence = "i want a function called predict"
predicted_intent = predict_intent(custom_sentence, model)
print(f"Predicted intent for '{custom_sentence}': \n{predicted_intent}")

Predicted intent for 'i want a function called predict': 
Mathematical Operation


In [170]:
classRep = classification_report(y,y_pred)
print(classRep)

ValueError: Found input variables with inconsistent numbers of samples: [2870, 574]

# Custom Logistic Regression

In [169]:
# Convert JSON to DataFrame
rows = []
for intent, sentences in data.items():
    for sentence in sentences:
        rows.append({'intent': intent, 'sentence': sentence})

df = pd.DataFrame(rows)

# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply preprocessing
df['processed_sentence'] = df['sentence'].apply(preprocess_text)

# Extract N-grams
def extract_ngrams(text, n=2):
    words = text.split()
    ngrams = []
    for i in range(1, n + 1):
        ngrams += [' '.join(words[j:j+i]) for j in range(len(words) - i + 1)]
    return ngrams

# Compute term frequencies
def compute_term_frequencies(corpus):
    term_frequencies = defaultdict(Counter)
    for idx, document in enumerate(corpus):
        ngrams = extract_ngrams(document)
        term_frequencies[idx] = Counter(ngrams)
    return term_frequencies

# Compute document frequencies
def compute_document_frequencies(term_frequencies):
    document_frequencies = Counter()
    for tf in term_frequencies.values():
        for term in tf.keys():
            document_frequencies[term] += 1
    return document_frequencies

# Compute TF-IDF
def compute_tfidf(corpus):
    term_frequencies = compute_term_frequencies(corpus)
    document_frequencies = compute_document_frequencies(term_frequencies)
    N = len(corpus)
    tfidf = defaultdict(dict)
    for idx, tf in term_frequencies.items():
        for term, count in tf.items():
            tfidf[idx][term] = (count / len(tf)) * math.log(N / (document_frequencies[term] + 1))
    return tfidf, document_frequencies

# Create TF-IDF matrix
def create_tfidf_matrix(tfidf, vocabulary):
    tfidf_matrix = np.zeros((len(tfidf), len(vocabulary)))
    for doc_idx, term_scores in tfidf.items():
        for term, score in term_scores.items():
            term_idx = vocabulary.get(term)
            if term_idx is not None:
                tfidf_matrix[doc_idx, term_idx] = score
    return tfidf_matrix

# Prepare corpus
corpus = df['processed_sentence'].tolist()

# Compute TF-IDF scores
tfidf, document_frequencies = compute_tfidf(corpus)

# Create vocabulary from all N-grams
all_ngrams = set()
for tf in tfidf.values():
    all_ngrams.update(tf.keys())

vocabulary = {term: idx for idx, term in enumerate(all_ngrams)}

# Create TF-IDF matrix
X = create_tfidf_matrix(tfidf, vocabulary)
y = df['intent']

# Encode labels
labels = df['intent'].unique()
label_to_index = {label: idx for idx, label in enumerate(labels)}
index_to_label = {idx: label for label, idx in label_to_index.items()}
y = np.array([label_to_index[label] for label in y])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Implementing Logistic Regression
class CustomLogisticRegression:
    def __init__(self, lr=0.01, num_iter=1000, fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose
    
    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        self.theta = np.zeros((X.shape[1], len(np.unique(y))))
        
        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - self.__one_hot_encode(y))) / y.size
            self.theta -= self.lr * gradient
            
            if(self.verbose == True and i % 100 == 0):
                z = np.dot(X, self.theta)
                h = self.__sigmoid(z)
                print(f'Loss: {self.__loss(h, y)} \t')
    
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X, self.theta))
    
    def predict(self, X):
        X = self.__add_intercept(X)
        return np.argmax(self.predict_prob(X), axis=1)
    
    def __one_hot_encode(self, y):
        n_values = np.max(y) + 1
        return np.eye(n_values)[y]

# Train the logistic regression model
model = CustomLogisticRegression(lr=0.1, num_iter=3000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_pred_labels = [index_to_label[idx] for idx in y_pred]
y_test_labels = [index_to_label[idx] for idx in y_test]

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))

ValueError: shapes (574,6148) and (6147,24) not aligned: 6148 (dim 1) != 6147 (dim 0)

In [157]:
# Implementing Logistic Regression
class LogisticRegressionModel:
    def __init__(self, lr=0.01, num_iter=1000, fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose
    
    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        # weights initialization
        self.theta = np.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient
            
            if(self.verbose == True and i % 100 == 0):
                z = np.dot(X, self.theta)
                h = self.__sigmoid(z)
                print(f'Loss: {self.__loss(h, y)} \t')
    
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X, self.theta))
    
    def predict(self, X, threshold=0.5):
        return self.predict_prob(X) >= threshold

# Train the logistic regression model
model = LogisticRegressionModel(lr=0.1, num_iter=3000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_pred_labels = [index_to_label[idx] for idx in y_pred]
y_test_labels = [index_to_label[idx] for idx in y_test]

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))



Accuracy: 0.08013937282229965
Classification Report:
                         precision    recall  f1-score   support

  Activate Interactive       0.00      0.00      0.00         6
        Activate Mouse       0.00      0.00      0.00         7
       Array Operation       0.00      0.00      0.00        22
             Assertion       0.00      0.00      0.00         6
  Assignment Operation       0.00      0.00      0.00        12
     Bitwise Operation       0.00      0.00      0.00        20
               Casting       0.00      0.00      0.00        15
     Class Declaration       0.00      0.00      0.00         8
               Comment       0.00      0.00      0.00        11
 Conditional Operation       0.00      0.00      0.00        36
  Constant Declaration       0.08      1.00      0.15        46
           File System       0.00      0.00      0.00        25
              For Loop       0.00      0.00      0.00        21
  Function Declaration       0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AttributeError: 'list' object has no attribute 'shape'

In [166]:
def predict_intent(custom_sentence, model, document_frequencies, vocabulary, N):
    # Preprocess the custom input
    processed_sentence = preprocess_text(custom_sentence)
    # Convert to N-gram TF-IDF features
    ngrams = extract_ngrams(processed_sentence)
    custom_tfidf = np.zeros(len(vocabulary))
    term_counts = Counter(ngrams)
    for term, count in term_counts.items():
        if term in vocabulary:
            term_idx = vocabulary[term]
            tf = count / len(ngrams)
            idf = math.log(N / (1 + document_frequencies.get(term, 0)))
            custom_tfidf[term_idx] = tf * idf
    # Reshape custom_tfidf to be a 2D array
    custom_tfidf = custom_tfidf.reshape(1, -1)
    # Predict the intent
    predicted_intent_idx = model.predict(custom_tfidf)[0]
    predicted_intent = index_to_label[predicted_intent_idx]
    return predicted_intent

# Test the model with custom input
custom_sentence = "import np as numpy"
predicted_intent = predict_intent(custom_sentence, model, document_frequencies, vocabulary, len(corpus) + 1)
print(f"Predicted intent for '{custom_sentence}':\n {predicted_intent}")


Predicted intent for 'import np as numpy':
 Constant Declaration
