In [3]:
import pandas as pd

# Location of the Bitext customer-support CSV, addressed through an `hf://` dataset path.
# The two adjacent literals are concatenated at compile time into a single path string.
dataset_url = (
    "hf://datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset/"
    "Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"
)
df = pd.read_csv(dataset_url)

In [4]:
# Preview the first rows of the raw frame to confirm the columns loaded as expected
df.head()

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [5]:
# Structural overview of the raw frame: size and column labels
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {df.columns.tolist()}")

# Per-column dtypes and null counts, each printed under its own heading
print("\nData types:", df.dtypes, sep="\n")
print("\nMissing values:", df.isnull().sum(), sep="\n")

# Label cardinality followed by the ten most frequent intents
intent_column = df['intent']
print(f"\nNumber of unique intents: {intent_column.nunique()}")
print("\nIntent distribution:", intent_column.value_counts().head(10), sep="\n")

Dataset shape: (26872, 5)

Column names: ['flags', 'instruction', 'category', 'intent', 'response']

Data types:
flags          str
instruction    str
category       str
intent         str
response       str
dtype: object

Missing values:
flags          0
instruction    0
category       0
intent         0
response       0
dtype: int64

Number of unique intents: 27

Intent distribution:
intent
check_invoice               1000
complaint                   1000
contact_customer_service    1000
edit_account                1000
switch_account              1000
check_payment_methods        999
contact_human_agent          999
delivery_period              999
get_invoice                  999
newsletter_subscription      999
Name: count, dtype: int64


## Data Cleaning

In [6]:
import re

# Patterns shared by the cleaning steps below
placeholder_pattern = re.compile(r'\{\{.*?\}\}')  # template slots such as {{Order Number}}
whitespace_pattern = re.compile(r'\s+')           # any run of whitespace

# Derive the cleaned text from the raw `instruction` column, one step at a time:
# 1) drop placeholders, 2) lowercase, 3) trim the ends, 4) collapse inner whitespace
without_placeholders = df['instruction'].map(
    lambda raw_text: placeholder_pattern.sub('', raw_text)
)
lowered_and_trimmed = without_placeholders.str.lower().str.strip()
collapsed = lowered_and_trimmed.map(
    lambda trimmed_text: whitespace_pattern.sub(' ', trimmed_text)
)

# Attach the result to a copy so the raw frame `df` is left untouched
df_clean = df.copy()
df_clean['instruction_clean'] = collapsed

# Keep only the rows whose cleaned instruction is non-empty
has_text = df_clean['instruction_clean'].str.len() > 0
df_clean = df_clean[has_text]

preview_columns = ['instruction', 'instruction_clean', 'intent']
print(f"Dataset shape after cleaning: {df_clean.shape}")
print("\nSample cleaned instructions:", df_clean[preview_columns].head(10), sep="\n")

Dataset shape after cleaning: (26872, 6)

Sample cleaned instructions:
                                         instruction  \
0   question about cancelling order {{Order Number}}   
1  i have a question about cancelling oorder {{Or...   
2    i need help cancelling puchase {{Order Number}}   
3         I need to cancel purchase {{Order Number}}   
4  I cannot afford this order, cancel purchase {{...   
5     can you help me cancel order {{Order Number}}?   
6  I can no longer afford order {{Order Number}},...   
7    I am trying to cancel purchase {{Order Number}}   
8     I have got to cancel purchase {{Order Number}}   
9    i need help canceling purchase {{Order Number}}   

                             instruction_clean        intent  
0              question about cancelling order  cancel_order  
1    i have a question about cancelling oorder  cancel_order  
2               i need help cancelling puchase  cancel_order  
3                    i need to cancel purchase  cancel_order

## Intent Classification Model using NLP + Logistic Regression

In [7]:
# Install required packages if not already installed
import importlib.util
import subprocess
import sys

# pip distribution name -> importable module name (the two differ for scikit-learn)
packages = {'scikit-learn': 'sklearn', 'nltk': 'nltk'}
for package, module_name in packages.items():
    # Only shell out to pip when the module cannot be found in the kernel's
    # environment; this keeps re-runs fast and avoids needless pip invocations.
    if importlib.util.find_spec(module_name) is None:
        # sys.executable targets the interpreter backing this kernel
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', package])

In [8]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources quietly, in the same order as listed
for resource_name in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource_name, quiet=True)

print("Libraries imported successfully!")

Libraries imported successfully!


In [9]:
# Features are the cleaned utterances; labels are the intent names
X, y = df_clean['instruction_clean'], df_clean['intent']

# 80/20 split, stratified on the label and seeded so the partition is reproducible
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report partition sizes and the per-intent counts in the training portion
print(f"Training set size: {len(X_train)}", f"Testing set size: {len(X_test)}", sep="\n")
print("\nClass distribution in training set:", y_train.value_counts(), sep="\n")

Training set size: 21497
Testing set size: 5375

Class distribution in training set:
intent
check_invoice               800
contact_customer_service    800
switch_account              800
complaint                   800
edit_account                800
payment_issue               799
delivery_period             799
contact_human_agent         799
get_invoice                 799
check_payment_methods       799
registration_problems       799
newsletter_subscription     799
track_refund                798
cancel_order                798
place_order                 798
check_refund_policy         798
get_refund                  798
review                      798
set_up_shipping_address     798
change_order                798
create_account              798
track_order                 796
delivery_options            796
delete_account              796
recover_password            796
change_shipping_address     778
check_cancellation_fee      760
Name: count, dtype: int64


In [10]:
# Turn the cleaned text into numerical TF-IDF features.
# Vectorizer settings:
#   max_features - cap on the vocabulary size (5000)
#   ngram_range  - unigrams and bigrams (1 and 2 word phrases)
#   stop_words   - built-in english stopword list ('the', 'is', 'and', ...)
#   min_df       - lower document-frequency cut-off
#   max_df       - upper document-frequency cut-off
tfidf_settings = {
    'max_features': 5000,
    'ngram_range': (1, 2),
    'stop_words': 'english',
    'min_df': 2,
    'max_df': 0.9,
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the training text only; the test text is transformed with the fitted vectorizer
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

feature_names = tfidf_vectorizer.get_feature_names_out()
print("TF-IDF Vectorization completed!")
print(
    f"Training data shape: {X_train_tfidf.shape}",
    f"Testing data shape: {X_test_tfidf.shape}",
    f"Number of features: {len(feature_names)}",
    sep="\n",
)

TF-IDF Vectorization completed!
Training data shape: (21497, 5000)
Testing data shape: (5375, 5000)
Number of features: 5000


In [12]:
# Fit the intent classifier on the TF-IDF training matrix
print("Training Logistic Regression model...")

# Hyper-parameters collected in one place; the seed keeps the fit reproducible
model_settings = {'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
logistic_model = LogisticRegression(**model_settings)
logistic_model.fit(X_train_tfidf, y_train)

print("Model training completed!")

Training Logistic Regression model...
Model training completed!


## Model Evaluation

In [13]:
def _report_accuracy(label, y_true, y_hat):
    """Compute the accuracy of `y_hat` against `y_true`, print it under `label`, and return it."""
    score = accuracy_score(y_true, y_hat)
    print(f"{label}: {score:.4f} ({score*100:.2f}%)")
    return score


# Held-out performance first...
y_pred = logistic_model.predict(X_test_tfidf)
accuracy = _report_accuracy("Model Accuracy", y_test, y_pred)

# ...then the fit on the training data, for comparison against the held-out score
y_train_pred = logistic_model.predict(X_train_tfidf)
train_accuracy = _report_accuracy("Training Accuracy", y_train, y_train_pred)

Model Accuracy: 0.9769 (97.69%)
Training Accuracy: 0.9860 (98.60%)


In [17]:
# Full per-intent example counts in the raw dataset (every intent, not only the top 10)
df['intent'].value_counts()

intent
check_invoice               1000
complaint                   1000
contact_customer_service    1000
edit_account                1000
switch_account              1000
check_payment_methods        999
contact_human_agent          999
delivery_period              999
get_invoice                  999
newsletter_subscription      999
payment_issue                999
registration_problems        999
cancel_order                 998
place_order                  998
track_refund                 998
change_order                 997
check_refund_policy          997
create_account               997
get_refund                   997
review                       997
set_up_shipping_address      997
delete_account               995
delivery_options             995
recover_password             995
track_order                  995
change_shipping_address      973
check_cancellation_fee       950
Name: count, dtype: int64

In [14]:
# Detailed classification report (per-intent precision / recall / F1 / support).
# The classes are passed via `labels=` rather than `target_names=`: the labels are
# already human-readable strings, and `labels=` pins every row to a specific class.
# `target_names` is matched to rows purely by position, so it can mislabel rows or
# raise if some class happens to be absent from y_test / y_pred.
print("Classification Report:")
print("="*80)
print(classification_report(y_test, y_pred, labels=logistic_model.classes_))

Classification Report:
                          precision    recall  f1-score   support

            cancel_order       0.99      0.98      0.99       200
            change_order       0.96      0.94      0.95       199
 change_shipping_address       0.99      1.00      0.99       195
  check_cancellation_fee       1.00      1.00      1.00       190
           check_invoice       0.82      0.86      0.84       200
   check_payment_methods       1.00      1.00      1.00       200
     check_refund_policy       1.00      0.99      1.00       199
               complaint       1.00      1.00      1.00       200
contact_customer_service       1.00      0.98      0.99       200
     contact_human_agent       0.99      0.99      0.99       200
          create_account       0.99      0.97      0.98       199
          delete_account       0.94      0.99      0.97       199
        delivery_options       0.92      1.00      0.96       199
         delivery_period       1.00      0.99      1

## Test the Model with Sample Predictions

In [15]:
def predict_intent(text):
    """
    Predict the intent of a given text, print a short report, and return the result.

    The text is cleaned with the same steps applied to the training data
    (placeholder removal, lowercasing, trimming, whitespace collapsing),
    vectorised with the fitted `tfidf_vectorizer`, and scored with
    `logistic_model`.

    Args:
        text: Input text to classify

    Returns:
        A tuple ``(prediction, top_intents)`` where ``prediction`` is the
        predicted intent label and ``top_intents`` is a list of up to three
        ``(intent, probability)`` pairs sorted from most to least likely.
    """
    # Clean the text (same as training data)
    text_clean = re.sub(r'\{\{.*?\}\}', '', text)
    text_clean = text_clean.lower().strip()
    text_clean = re.sub(r'\s+', ' ', text_clean)

    # Transform using TF-IDF (the vectorizer expects an iterable of documents)
    text_tfidf = tfidf_vectorizer.transform([text_clean])

    # Predict the label and the per-class probabilities for the single document
    prediction = logistic_model.predict(text_tfidf)[0]
    probabilities = logistic_model.predict_proba(text_tfidf)[0]

    # Get top 3 predictions with probabilities:
    # argsort is ascending, so take the last three indices and reverse them
    top_indices = probabilities.argsort()[-3:][::-1]
    top_intents = [(logistic_model.classes_[i], probabilities[i]) for i in top_indices]

    print(f"Input: {text}")
    print(f"Cleaned: {text_clean}")
    print(f"\nPredicted Intent: {prediction}")
    print(f"Confidence: {max(probabilities):.4f}")
    print(f"\nTop 3 predictions:")
    for intent, prob in top_intents:
        print(f"  - {intent}: {prob:.4f}")
    print("-" * 80)

    # Hand the result back so callers can use it programmatically, as the
    # docstring has always advertised (previously the function returned None).
    return prediction, top_intents

In [16]:
# Hand-written customer queries used as a quick check of the classifier
test_queries = [
    "I want to cancel my order",
    "How can I track my package?",
    "I need help setting up my account",
    "What are the delivery options available?",
    "I forgot my password, please help",
    "Can you send me my invoice?",
    "I want to change my shipping address",
    "How do I get a refund?",
]

# Banner, then one printed report per query followed by a blank separator line
print("Testing Intent Classifier with Sample Queries", "=" * 80, sep="\n")
for sample_query in test_queries:
    predict_intent(sample_query)
    print()

Testing Intent Classifier with Sample Queries
Input: I want to cancel my order
Cleaned: i want to cancel my order

Predicted Intent: cancel_order
Confidence: 0.9439

Top 3 predictions:
  - cancel_order: 0.9439
  - delivery_options: 0.0091
  - delete_account: 0.0077
--------------------------------------------------------------------------------

Input: How can I track my package?
Cleaned: how can i track my package?

Predicted Intent: track_refund
Confidence: 0.3129

Top 3 predictions:
  - track_refund: 0.3129
  - track_order: 0.1774
  - delivery_period: 0.0959
--------------------------------------------------------------------------------

Input: I need help setting up my account
Cleaned: i need help setting up my account

Predicted Intent: delete_account
Confidence: 0.1686

Top 3 predictions:
  - delete_account: 0.1686
  - switch_account: 0.1349
  - create_account: 0.1220
--------------------------------------------------------------------------------

Input: What are the delivery o

## Model Summary

### Key Achievements:
1. **Data Cleaning**: Successfully cleaned 26,872 customer support queries by:
   - Removing placeholders ({{Order Number}}, etc.)
   - Converting to lowercase
   - Collapsing extra whitespace

2. **Feature Engineering**: Used TF-IDF Vectorization with:
   - 5000 features
   - Unigrams and bigrams (1-2 word phrases)
   - English stopwords removed

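3. **Model Performance**:
   - **Test Accuracy: 97.69%**
   - **Training Accuracy: 98.60%**
   - Successfully classifies 27 different customer intents
   - Caveat: both scores are measured on a stratified split of the same dataset. Free-form queries are harder: in the sample predictions above, "How can I track my package?" was labelled `track_refund` with only 0.31 confidence, and "I need help setting up my account" was labelled `delete_account` with 0.17 confidence.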

4. **Model Components**:
   - **NLP Technique**: TF-IDF (Term Frequency-Inverse Document Frequency)
   - **Classifier**: Logistic Regression
   - **Dataset**: Bitext Customer Support Dataset (26,872 examples)

### Use Case:
This model can automatically classify customer support queries into 27 different intents, enabling:
- Automated routing to appropriate support teams
- Quick response suggestions
- Better customer service analytics

## Save Model and Vectorizer

In [18]:
import pickle
import os

# Target folder for the serialized artifacts; created on demand
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# Destination files inside the models folder
vectorizer_path = os.path.join(models_dir, 'tfidf_vectorizer.pkl')
model_path = os.path.join(models_dir, 'logistic_regression_model.pkl')

# (display label, fitted object, destination) for each artifact, written in this order
artifacts = [
    ("TF-IDF Vectorizer", tfidf_vectorizer, vectorizer_path),
    ("Logistic Regression Model", logistic_model, model_path),
]
for label, fitted_object, destination in artifacts:
    with open(destination, 'wb') as f:
        pickle.dump(fitted_object, f)
    print(f"✓ {label} saved to: {destination}")

print("\nBoth models saved successfully!")

✓ TF-IDF Vectorizer saved to: ../models\tfidf_vectorizer.pkl
✓ Logistic Regression Model saved to: ../models\logistic_regression_model.pkl

Both models saved successfully!


### Load Saved Models (for future use)

In [None]:
# Example: How to load the saved models later
# Uncomment and run this code when you want to load the models
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.

# import pickle
# import re
# 
# # Load TF-IDF Vectorizer
# with open('../models/tfidf_vectorizer.pkl', 'rb') as f:
#     loaded_vectorizer = pickle.load(f)
# 
# # Load Logistic Regression Model
# with open('../models/logistic_regression_model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)
# 
# print("Models loaded successfully!")
# 
# # Test with a sample query
# # (apply the same cleaning steps that were used on the training data:
# #  placeholder removal, lowercasing, trimming, whitespace collapsing)
# test_text = "I want to cancel my order"
# test_text_clean = re.sub(r'\{\{.*?\}\}', '', test_text)
# test_text_clean = test_text_clean.lower().strip()
# test_text_clean = re.sub(r'\s+', ' ', test_text_clean)
# test_tfidf = loaded_vectorizer.transform([test_text_clean])
# prediction = loaded_model.predict(test_tfidf)[0]
# print(f"Prediction: {prediction}")