In [1]:
import pandas as pd
import json
import plotly.graph_objects as go
from scipy.stats import entropy
from sklearn.model_selection import train_test_split
import spacy
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import random

In [13]:
# Read the raw intents definition into `data`.
# The encoding is pinned to UTF-8 so that non-ASCII characters in the file
# decode the same way on every platform instead of depending on the locale default.
with open('intents.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [14]:
# Flatten the nested intents structure into one row per pattern:
# (tag, pattern, response).
#
# A dedicated, seeded generator is used for the random pairing so that
# Restart & Run All always produces the same DataFrame, without touching the
# state of the global `random` module.
response_rng = random.Random(42)

intents = data['intents']
rows = []
for intent in intents:
    tag = intent['tag']
    patterns = intent['patterns']
    responses = intent['responses']

    if len(responses) == 1:
        # A single response is shared by every pattern of the intent.
        paired_responses = [responses[0]] * len(patterns)
    elif len(patterns) > len(responses):
        # More patterns than responses: sample (with replacement) one response per pattern.
        paired_responses = response_rng.choices(responses, k=len(patterns))
    else:
        # At least as many responses as patterns: pair them positionally;
        # zip() below ignores any surplus responses.
        paired_responses = responses

    for pattern, response in zip(patterns, paired_responses):
        rows.append({'tag': tag, 'pattern': pattern, 'response': response})

df = pd.DataFrame(rows)

In [15]:
# Preview the first five flattened rows.
df.head(n=5)

Unnamed: 0,tag,pattern,response
0,greeting,Hi,Hello there. Tell me how are you feeling today?
1,greeting,Hey,Hi there. What brings you here today?
2,greeting,Is anyone there?,Hello there. Tell me how are you feeling today?
3,greeting,Hi there,Hi there. What brings you here today?
4,greeting,Hello,Hi there. What brings you here today?


In [16]:
# Print the column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 808 entries, 0 to 807
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tag       808 non-null    object
 1   pattern   808 non-null    object
 2   response  808 non-null    object
dtypes: object(3)
memory usage: 19.1+ KB


In [17]:
# Basic data-quality checks: missing values per column and fully duplicated rows.
null_counts = df.isnull().sum()
duplicate_count = df.duplicated().sum()

print("Null Values:", null_counts)
print("Duplicate Values:", duplicate_count)

Null Values: tag         0
pattern     0
response    0
dtype: int64
Duplicate Values: 2


In [18]:
# Take labels and bar heights from the SAME value_counts() Series.
# Previously the labels came from unique() (order of first appearance) while the
# heights came from value_counts() (sorted by count), so bars were paired with
# the wrong tags.
intent_counts = df['tag'].value_counts()
labels = intent_counts.index.tolist()
values = intent_counts.tolist()

fig = go.Figure(data=[go.Bar(x=labels, y=values)])
fig.update_layout(
    title_text='Intent Distribution',
    xaxis_title='Intent tag',
    yaxis_title='Number of patterns',
)
fig.show()

In [19]:
# Summarise the character lengths of the user patterns and of the bot responses.
pattern_lengths = df['pattern'].str.len()
response_lengths = df['response'].str.len()

print("Pattern Length Statistics:")
print(pattern_lengths.describe())

print("\nResponse Length Statistics:")
print(response_lengths.describe())

Pattern Length Statistics:
count    808.000000
mean      32.315594
std       17.723210
min        0.000000
25%       18.000000
50%       30.000000
75%       45.250000
max      111.000000
Name: pattern, dtype: float64

Response Length Statistics:
count    808.000000
mean     181.813119
std      179.338730
min        9.000000
25%       50.750000
50%      105.000000
75%      272.750000
max      830.000000
Name: response, dtype: float64


In [20]:
# Measure how balanced the intent labels are: compare the entropy of the
# empirical tag distribution with the entropy of a uniform distribution over
# the same number of tags (log of the number of unique tags).
tag_column = df['tag']
total_tags = len(tag_column)
unique_tags = tag_column.nunique()

tag_probabilities = tag_column.value_counts() / total_tags
tag_entropy = entropy(tag_probabilities)

print(f"Total number of tag entries: {total_tags}")
print(f"Number of unique tags: {unique_tags}")

print(f"\nIntent Distribution Entropy: {tag_entropy}")
print(f"Maximum Possible Entropy: {np.log(unique_tags)}")

Total number of tag entries: 808
Number of unique tags: 78

Intent Distribution Entropy: 4.353074628092497
Maximum Possible Entropy: 4.356708826689592


In [21]:
# Show most frequent tags
tag_counts = df['tag'].value_counts()
print("\nMost frequent tags:")
print(tag_counts.head(10))


Most frequent tags:
tag
greeting           15
default            14
casual             13
user-meditation    12
ask                12
friends            12
stupid             12
wrong              12
sad                11
suicide            11
Name: count, dtype: int64


In [22]:
# Show the least frequent tags
tag_counts = df['tag'].value_counts()
print("\nLeast frequent tags:")
print(tag_counts.tail(10))


Least frequent tags:
tag
location          10
something-else    10
night             10
evening           10
morning           10
no-approach       10
learn-more        10
user-agree        10
meditation        10
fact-32           10
Name: count, dtype: int64


### Preprocessing

In [23]:
# Load the spaCy "en_core_web_sm" pipeline once; spacy_tokenize() below reuses
# this module-level `nlp` object for every call.
nlp = spacy.load("en_core_web_sm")

def spacy_tokenize(text):
    """Normalise one text into a space-separated string of lemmas.

    The input is converted to ``str`` and lowercased, parsed with the
    module-level spaCy pipeline ``nlp``, and every token whose text is purely
    alphanumeric is replaced by its lemma. Punctuation and other
    non-alphanumeric tokens are dropped.

    Args:
        text: The raw text. Falsy values (``None``, ``""``, ``0``) and
            whitespace-only strings are treated as empty.

    Returns:
        str: The lemmas joined by single spaces; ``""`` for empty input; the
        lowercased input string when no token survives the filter, so the
        result is always a string.
    """
    if not text or len(str(text).strip()) < 1:
        return ""

    normalized = str(text).lower()
    doc = nlp(normalized)

    # str.isalnum() already implies a non-empty, non-punctuation token and
    # covers the purely alphabetic case, so it is the only check needed.
    lemmas = [token.lemma_ for token in doc if token.text.isalnum()]

    if lemmas:
        return " ".join(lemmas)

    # Nothing survived the filter (e.g. the text is only punctuation): fall back
    # to the normalised string rather than the raw, possibly non-string, input.
    return normalized

# Replace each raw pattern with its processed form.
# NOTE: this overwrites df['pattern'] in place, so re-running the cell feeds
# already-processed text through spacy_tokenize again.
df['pattern'] = df['pattern'].apply(spacy_tokenize)

In [24]:
# Fit the encoder on the tag column, then store the integer class id of every
# row in a new `encoded_tag` column. The fitted encoder stays available as
# `label_encoder`.
label_encoder = LabelEncoder().fit(df['tag'])
df['encoded_tag'] = label_encoder.transform(df['tag'])

In [25]:
# Show the frame after preprocessing and label encoding.
df

Unnamed: 0,tag,pattern,response,encoded_tag
0,greeting,hi,Hello there. Tell me how are you feeling today?,44
1,greeting,hey,Hi there. What brings you here today?,44
2,greeting,be anyone there,Hello there. Tell me how are you feeling today?,44
3,greeting,hi there,Hi there. What brings you here today?,44
4,greeting,hello,Hi there. What brings you here today?,44
...,...,...,...,...
803,fact-32,what be some common symptom of sadness,Sadness is a normal emotional response to loss...,36
804,fact-32,what be some common symptom of depression,Sadness is a normal emotional response to loss...,36
805,fact-32,how can I tell if I experience sadness or depr...,Sadness is a normal emotional response to loss...,36
806,fact-32,how can I differentiate between sadness and de...,Sadness is a normal emotional response to loss...,36


In [26]:
# Features are the preprocessed patterns; targets are the integer-encoded tags.
X = df['pattern']
y = df['encoded_tag']

# Stratify on the label so every intent keeps the same train/test proportion.
# With many intents that each have only a handful of patterns, a plain random
# split can leave some intents badly under-represented in one of the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [27]:
# Report how many samples ended up in each split.
train_shape = X_train.shape
test_shape = X_test.shape

print(f"Training size: {train_shape}")
print(f"Testing size: {test_shape}")

Training size: (565,)
Testing size: (243,)


In [29]:
# Text-classification pipeline: TF-IDF features (vocabulary capped at 5000
# terms) followed by a random forest.
# random_state is fixed so that the bootstrap samples and feature choices of the
# forest, and therefore the reported metrics, are reproducible between runs.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [35]:
# Number of rows per intent tag, most frequent first.
df['tag'].value_counts()

tag
greeting           15
default            14
casual             13
user-meditation    12
ask                12
                   ..
no-approach        10
learn-more         10
user-agree         10
meditation         10
fact-32            10
Name: count, Length: 78, dtype: int64