In [1]:
!pip install pandas --quiet
!pip install matplotlib --quiet
!pip install seaborn --quiet
!pip install tensorflow --quiet
!pip install scikit-learn --quiet


In [2]:
!pip install numpy --quiet

In [4]:
import numpy as np
import pandas as pd
import warnings
import os

In [8]:
warnings.filterwarnings('ignore')

In [10]:
ROOT_PATH = r"C:\Talk Back AI Assistant\Intent Classification"
DATA_PATH = os.path.join(ROOT_PATH, 'Datasets')

In [11]:
dataset_path = os.path.join(DATA_PATH, 'toy_set.csv')
df = pd.read_csv(dataset_path)
df.sample(5)

Unnamed: 0,intent,text
9,open_app,Open PowerPoint
51,open_website,Open GitHub
55,exit,Stop
48,open_folder,Go to Music folder
38,make_note,Add note about project


In [12]:
df.head()

Unnamed: 0,intent,text
0,open_app,Open Chrome
1,open_app,Launch Spotify
2,open_app,Start Notepad
3,open_app,Open Calculator
4,open_app,Launch Word


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   intent  58 non-null     object
 1   text    58 non-null     object
dtypes: object(2)
memory usage: 1.0+ KB


On text data logistic Regressor Linear SVM and Naive byes i.e. Multinomial NB works better and since this is a model which will be working on a smaller datatset it will be that much more effective

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = df['text'].copy()
intents = df['intent'].copy()

In [17]:
vectorizer = TfidfVectorizer(
    lowercase = True,   # convert all to lowercase
    stop_words = "english", # remove common stopwords
    ngram_range = (1,2)  # unigrams + bigrams
)

In [18]:
X = vectorizer.fit_transform(texts)

print("Feature shape:", X.shape)
print("Sample features:\n", X.toarray()[0])
print("Vocabulary size:", len(vectorizer.vocabulary_))

Feature shape: (58, 147)
Sample features:
 [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.65534459 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.37556215
 0.         0.65534459 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.  

In [25]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(
    X,
    intents,
    test_size = 0.2,
    stratify = intents
) 

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
print('Classification Report: \n', classification_report(y_test, lr_pred))

Classification Report: 
               precision    recall  f1-score   support

        exit       0.00      0.00      0.00         1
   make_note       0.00      0.00      0.00         2
    open_app       0.67      1.00      0.80         2
 open_folder       1.00      1.00      1.00         1
open_website       0.00      0.00      0.00         1
  search_web       0.40      1.00      0.57         2
   tell_date       1.00      1.00      1.00         1
   tell_time       1.00      1.00      1.00         2

    accuracy                           0.67        12
   macro avg       0.51      0.62      0.55        12
weighted avg       0.51      0.67      0.56        12



In [30]:
# Data Augmentation

from nltk.corpus import wordnet
import random

def synonym_replace(text, n=1):
    words = text.split()
    new_words = words.copy()
    for _ in range(n):
        idx = random.choice(range(len(words)))
        synonyms = wordnet.synsets(words[idx])
        if synonyms:
            lemmas = [l.name() for s in synonyms for l in s.lemmas()]
            lemmas = [l.replace('_', ' ') for l in lemmas if l != words[idx]]
            if lemmas:
                new_words[idx] = random.choice(lemmas)
    return ' '.join(new_words)


In [29]:
synonyms = texts.apply(synon)