In [12]:
pip install faker


Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd
import random
import json
from faker import Faker

# Initialize Faker for realistic data generation
fake = Faker()

# Define our intents and sample queries
INTENT_TEMPLATES = {
    'order_status': [
        ("What's the status of order #{order_id}?", {'order_id': '{order_id}'}),
        ("Can you check where order {order_id} is?", {'order_id': '{order_id}'}),
        ("Has order #{order_id} shipped yet?", {'order_id': '{order_id}'})
    ],
    'product_inquiry': [
        ("When will {product} be back in stock?", {'product': '{product}'}),
        ("Do you have {product} in {color}?", {'product': '{product}', 'color': '{color}'}),
        ("What are the specs for the {product}?", {'product': '{product}'})
    ],
    'refund_request': [
        ("I want to return my {product}", {'product': '{product}'}),
        ("The {product} arrived damaged, I need a refund", {'product': '{product}', 'issue': 'damaged'}),
        ("How do I return {product}?", {'product': '{product}'})
    ],
    'account_help': [
        ("I can't login to my account", {}),
        ("My password isn't working", {}),
        ("How do I reset my password?", {})
    ],
    'technical_support': [
        ("The app crashes when I {action}", {'action': '{action}'}),
        ("I'm getting error {error_code} when trying to {action}", 
         {'error_code': '{error_code}', 'action': '{action}'}),
        ("The website won't let me {action}", {'action': '{action}'})
    ]
}

# Define possible values for placeholders
PRODUCTS = ['iPhone', 'Samsung Galaxy', 'MacBook Pro', 'AirPods', 'PlayStation 5', 
            'Nike Air Force', 'Dyson vacuum', 'Instant Pot', 'Kindle', 'Fitbit']
COLORS = ['black', 'white', 'blue', 'red', 'silver', 'space gray']
ACTIONS = ['checkout', 'login', 'add to cart', 'view my orders', 'update payment info']
ERROR_CODES = ['404', '500', 'ERR_CONNECTION_REFUSED', 'ERR_TIMEOUT']

def generate_synthetic_data(num_samples=1000):
    samples = []
    
    for _ in range(num_samples):
        # Select a random intent
        intent = random.choice(list(INTENT_TEMPLATES.keys()))
        # Select a random template for this intent
        template, entities = random.choice(INTENT_TEMPLATES[intent])
        
        # Replace placeholders with realistic values
        query = template
        entity_values = {}
        
        if '{order_id}' in template:
            order_id = fake.bothify(text='??#####').upper()
            query = query.replace('{order_id}', order_id)
            if entities.get('order_id') == '{order_id}':
                entity_values['order_id'] = order_id
        
        if '{product}' in template:
            product = random.choice(PRODUCTS)
            query = query.replace('{product}', product)
            if entities.get('product') == '{product}':
                entity_values['product'] = product
                
        if '{color}' in template:
            color = random.choice(COLORS)
            query = query.replace('{color}', color)
            if entities.get('color') == '{color}':
                entity_values['color'] = color
                
        if '{action}' in template:
            action = random.choice(ACTIONS)
            query = query.replace('{action}', action)
            if entities.get('action') == '{action}':
                entity_values['action'] = action
                
        if '{error_code}' in template:
            error_code = random.choice(ERROR_CODES)
            query = query.replace('{error_code}', error_code)
            if entities.get('error_code') == '{error_code}':
                entity_values['error_code'] = error_code
        
        # Add some natural language variations
        query = add_natural_variations(query)
        
        samples.append({
            'text': query,
            'intent': intent,
            'entities': entity_values
        })
    
    return pd.DataFrame(samples)

def add_natural_variations(text):
    """Add natural language variations to make queries more realistic"""
    variations = [
        ("Can you tell me ", ""),
        ("I was wondering ", ""),
        ("", " please"),
        ("", " thanks"),
        ("", " thank you"),
        ("Hi, ", ""),
        ("Hello, ", ""),
        ("Hey, ", ""),
        ("", "?"),
        ("", "..."),
    ]
    prefix, suffix = random.choice(variations)
    return prefix + text + suffix

# Generate our dataset
df = generate_synthetic_data(1500)

# Save to CSV
df.to_csv('synthetic_customer_support.csv', index=False)
print(f"Generated dataset with {len(df)} samples")

Generated dataset with 1500 samples


In [14]:
# Load the dataset
df = pd.read_csv('synthetic_customer_support.csv')

# Convert string representation of entities to dictionary
import ast
df['entities'] = df['entities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Show distribution of intents
print("\nIntent Distribution:")
print(df['intent'].value_counts())

# Show some examples
print("\nSample Queries:")
for _, row in df.sample(5).iterrows():
    print(f"\nQuery: {row['text']}")
    print(f"Intent: {row['intent']}")
    print(f"Entities: {row['entities']}")


Intent Distribution:
intent
order_status         307
technical_support    306
account_help         305
product_inquiry      299
refund_request       283
Name: count, dtype: int64

Sample Queries:

Query: What's the status of order #GC19605? please
Intent: order_status
Entities: {'order_id': 'GC19605'}

Query: Hey, How do I return Dyson vacuum?
Intent: refund_request
Entities: {'product': 'Dyson vacuum'}

Query: Do you have Nike Air Force in silver? thanks
Intent: product_inquiry
Entities: {'product': 'Nike Air Force', 'color': 'silver'}

Query: How do I return Nike Air Force? please
Intent: refund_request
Entities: {'product': 'Nike Air Force'}

Query: Hi, How do I reset my password?
Intent: account_help
Entities: {}


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['intent'], test_size=0.2, random_state=42
)

# Vectorize text
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train classifier
clf = LogisticRegression(max_iter=1000, multi_class='ovr')
clf.fit(X_train_vec, y_train)

# Evaluate
y_pred = clf.predict(X_test_vec)
print("\nIntent Classification Report:")
print(classification_report(y_test, y_pred))

# Save the model
import joblib
joblib.dump(clf, 'intent_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')


Intent Classification Report:
                   precision    recall  f1-score   support

     account_help       1.00      1.00      1.00        54
     order_status       1.00      1.00      1.00        67
  product_inquiry       1.00      1.00      1.00        56
   refund_request       1.00      1.00      1.00        65
technical_support       1.00      1.00      1.00        58

         accuracy                           1.00       300
        macro avg       1.00      1.00      1.00       300
     weighted avg       1.00      1.00      1.00       300



['tfidf_vectorizer.joblib']

In [30]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# 1. Load and prepare your dataset
df = pd.read_csv('synthetic_customer_support.csv')
texts = df['text'].values
intents = df['intent'].values

# Convert labels to numerical values
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(intents)
num_classes = len(label_encoder.classes_)

# 2. Create a PyTorch Dataset class
class IntentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 3. Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_classes
)

# 4. Create train dataset and dataloader
train_dataset = IntentDataset(texts, encoded_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# 5. Training setup
optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# 6. Training loop
model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_loader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1} - Average Loss: {total_loss / len(train_loader):.4f}")

# Save the trained model
model.save_pretrained('intent_classifier_distilbert')
tokenizer.save_pretrained('intent_classifier_distilbert')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Average Loss: 0.3758
Epoch 2 - Average Loss: 0.0089
Epoch 3 - Average Loss: 0.0037


('intent_classifier_distilbert/tokenizer_config.json',
 'intent_classifier_distilbert/special_tokens_map.json',
 'intent_classifier_distilbert/vocab.txt',
 'intent_classifier_distilbert/added_tokens.json')

In [32]:
!pip install --upgrade spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [33]:
import spacy
from spacy.matcher import Matcher
import pandas as pd

# First install the model if needed
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Downloading language model...")
    from spacy.cli import download
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

# Define your product and color lists (example values)
PRODUCTS = ['iPhone', 'Samsung', 'MacBook', 'AirPods', 'PlayStation']
COLORS = ['black', 'white', 'blue', 'red', 'silver']

# Initialize matcher
matcher = Matcher(nlp.vocab)

# Define patterns for our entities
patterns = {
    'ORDER_ID': [
        [{'TEXT': {'REGEX': r'[A-Z]{2}\d{5}'}}],
        [{'TEXT': {'REGEX': r'#?\d{5,8}'}}]
    ],
    'PRODUCT': [
        [{'LOWER': {'IN': [p.lower() for p in PRODUCTS]}}]
    ],
    'COLOR': [
        [{'LOWER': {'IN': COLORS}}]
    ],
    'ERROR_CODE': [
        [{'TEXT': {'REGEX': r'\d{3}'}}],
        [{'TEXT': {'REGEX': r'ERR_[A-Z_]+'}}]
    ]
}

# Add patterns to matcher
for label, pattern in patterns.items():
    matcher.add(label, pattern)

def extract_entities(text):
    doc = nlp(text)
    matches = matcher(doc)
    entities = {}
    
    for match_id, start, end in matches:
        label = nlp.vocab.strings[match_id]
        span = doc[start:end]
        entities[label] = span.text
    
    return entities

# Test entity extraction
sample_text = "I'm getting error 404 when trying to checkout my order #AB1234 for the black iPhone"
print("\nEntity Extraction Example:")
print(f"Text: {sample_text}")
print(f"Entities: {extract_entities(sample_text)}")

# Example dataframe (replace with your actual dataframe)
df = pd.DataFrame({
    'text': [
        "Status of order #AB1234",
        "I want to return my black iPhone",
        "Error 500 when checking out"
    ]
})

# Add entity extraction to dataframe
df['predicted_entities'] = df['text'].apply(extract_entities)
print("\nDataFrame with extracted entities:")
print(df)


Entity Extraction Example:
Text: I'm getting error 404 when trying to checkout my order #AB1234 for the black iPhone
Entities: {'ERROR_CODE': 'AB1234', 'COLOR': 'black', 'PRODUCT': 'iPhone'}

DataFrame with extracted entities:
                               text                       predicted_entities
0           Status of order #AB1234                 {'ERROR_CODE': 'AB1234'}
1  I want to return my black iPhone  {'COLOR': 'black', 'PRODUCT': 'iPhone'}
2       Error 500 when checking out                    {'ERROR_CODE': '500'}


In [34]:
class CustomerSupportChatbot:
    def __init__(self):
        # Load intent classifier
        self.vectorizer = joblib.load('tfidf_vectorizer.joblib')
        self.intent_clf = joblib.load('intent_classifier.joblib')
        
        # Load entity recognizer (spaCy)
        self.nlp = spacy.load('en_core_web_sm')
        self.matcher = Matcher(self.nlp.vocab)
        for label, pattern in patterns.items():
            self.matcher.add(label, pattern)
    
    def predict_intent(self, text):
        # Vectorize text
        X = self.vectorizer.transform([text])
        # Predict intent
        return self.intent_clf.predict(X)[0]
    
    def extract_entities(self, text):
        doc = self.nlp(text)
        matches = self.matcher(doc)
        entities = {}
        
        for match_id, start, end in matches:
            label = self.nlp.vocab.strings[match_id]
            span = doc[start:end]
            entities[label] = span.text
        
        return entities
    
    def generate_response(self, intent, entities):
        # Simple response generation
        responses = {
            'order_status': [
                f"I've located your order {entities.get('ORDER_ID', '')}. It's currently being processed.",
                f"Your order {entities.get('ORDER_ID', '')} is out for delivery.",
                f"Order {entities.get('ORDER_ID', '')} was delivered yesterday."
            ],
            'product_inquiry': [
                f"The {entities.get('PRODUCT', 'product')} is currently in stock.",
                f"We expect more {entities.get('PRODUCT', 'product')} inventory next week.",
                f"The {entities.get('PRODUCT', 'product')} comes in {entities.get('COLOR', 'multiple')} colors."
            ],
            'refund_request': [
                f"I can help you return the {entities.get('PRODUCT', 'item')}.",
                f"Please package the {entities.get('PRODUCT', 'item')} for return shipping.",
                f"We'll process a refund for your {entities.get('PRODUCT', 'item')}."
            ],
            'account_help': [
                "I can help reset your password.",
                "Please check your email for a password reset link.",
                "Let me transfer you to account support."
            ],
            'technical_support': [
                f"I've noted the {entities.get('ERROR_CODE', 'error')} you're experiencing.",
                "Our technical team is working on this issue.",
                "Please try clearing your cache and cookies."
            ]
        }
        
        return random.choice(responses.get(intent, ["I'm sorry, I didn't understand that."]))
    
    def process_query(self, text):
        intent = self.predict_intent(text)
        entities = self.extract_entities(text)
        response = self.generate_response(intent, entities)
        
        return {
            'intent': intent,
            'entities': entities,
            'response': response
        }

# Initialize and test the chatbot
chatbot = CustomerSupportChatbot()

test_queries = [
    "What's the status of order #AB1234?",
    "Do you have the iPhone in black?",
    "I need to return my defective MacBook Pro",
    "I'm getting error 500 when trying to login"
]

print("\nChatbot Test:")
for query in test_queries:
    result = chatbot.process_query(query)
    print(f"\nQuery: {query}")
    print(f"Intent: {result['intent']}")
    print(f"Entities: {result['entities']}")
    print(f"Response: {result['response']}")


Chatbot Test:

Query: What's the status of order #AB1234?
Intent: order_status
Entities: {'ERROR_CODE': 'AB1234'}
Response: Order  was delivered yesterday.

Query: Do you have the iPhone in black?
Intent: product_inquiry
Entities: {'PRODUCT': 'iPhone', 'COLOR': 'black'}
Response: The iPhone is currently in stock.

Query: I need to return my defective MacBook Pro
Intent: refund_request
Entities: {'PRODUCT': 'MacBook'}
Response: I can help you return the MacBook.

Query: I'm getting error 500 when trying to login
Intent: technical_support
Entities: {'ERROR_CODE': '500'}
Response: I've noted the 500 you're experiencing.


In [40]:
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Load your data
df = pd.read_csv('synthetic_customer_support.csv')

# Check dataset size and adjust sample size accordingly
sample_size = min(50, len(df))  # Take 50 or whatever is available if less than 50
test_samples = df.sample(sample_size, random_state=42)  # Fixed random state for reproducibility

true_intents = []
pred_intents = []
correct_responses = 0

for _, row in test_samples.iterrows():
    try:
        result = chatbot.process_query(row['text'])
        true_intents.append(row['intent'])
        pred_intents.append(result['intent'])
        
        # Simple check if response seems appropriate
        if result['intent'] == row['intent']:
            correct_responses += 1
    except Exception as e:
        print(f"Error processing query: {row['text']}")
        print(f"Error: {str(e)}")
        continue

# Only calculate metrics if we have samples
if len(true_intents) > 0:
    # Calculate accuracy
    intent_accuracy = accuracy_score(true_intents, pred_intents)
    response_accuracy = correct_responses / len(test_samples)

    print(f"\nEvaluated on {len(true_intents)} samples")
    print("System Evaluation:")
    print(f"Intent Accuracy: {intent_accuracy:.2f}")
    print(f"Appropriate Response Rate: {response_accuracy:.2f}")
    print("\nIntent Classification Report:")
    print(classification_report(true_intents, pred_intents))
else:
    print("No samples were successfully processed for evaluation.")


Evaluated on 50 samples
System Evaluation:
Intent Accuracy: 1.00
Appropriate Response Rate: 1.00

Intent Classification Report:
                   precision    recall  f1-score   support

     account_help       1.00      1.00      1.00        10
     order_status       1.00      1.00      1.00        11
  product_inquiry       1.00      1.00      1.00         8
   refund_request       1.00      1.00      1.00         7
technical_support       1.00      1.00      1.00        14

         accuracy                           1.00        50
        macro avg       1.00      1.00      1.00        50
     weighted avg       1.00      1.00      1.00        50

