In [7]:
# Install required packages (for Colab or clean environment)
!pip install pandas scikit-learn torch transformers joblib numpy --quiet


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import joblib
import numpy as np

# Load dataset
df = pd.read_csv("/content/customer_support_tickets.csv")  # replace with Kaggle CSV

# For demo, sample 5000 rows
df = df.sample(5000, random_state=42)

# Combine subject + description as input text
df['TicketText'] = df['Ticket Subject'].fillna('') + ' ' + df['Ticket Description'].fillna('')

# Encode target
le = LabelEncoder()
df['PriorityLabel'] = le.fit_transform(df['Ticket Priority'])  # High/Medium/Low -> 0/1/2

# Split
X_train, X_test, y_train, y_test = train_test_split(df['TicketText'], df['PriorityLabel'], test_size=0.2, random_state=42)

# Load DistilBERT model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def embed_texts(texts, tokenizer, model, max_length=128):
    """Convert list of texts to embeddings using DistilBERT CLS token."""
    embeddings = []
    batch_size = 16
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        with torch.no_grad():
            outputs = model(**encoded)
            cls_embeddings = outputs.last_hidden_state[:,0,:]  # CLS token
            embeddings.append(cls_embeddings)
    return torch.cat(embeddings).numpy()

# Embed tickets
print("Embedding training tickets with DistilBERT...")
X_train_emb = embed_texts(X_train.tolist(), tokenizer, model)
print("Embedding test tickets with DistilBERT...")
X_test_emb = embed_texts(X_test.tolist(), tokenizer, model)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_emb, y_train)

# Save models and tokenizer
joblib.dump(clf, "ticket_priority_clf.joblib")
joblib.dump(le, "label_encoder.joblib")
joblib.dump(tokenizer, "distilbert_tokenizer.joblib")
joblib.dump(model, "distilbert_model.joblib")

print("✅ Models saved successfully!")


Embedding training tickets with DistilBERT...
Embedding test tickets with DistilBERT...
✅ Models saved successfully!


In [1]:
import pandas as pd

# Create sample ticket data
data = [
    ["Login Issue", "Customer cannot log into the account since yesterday. Tried resetting password but it didn't work."],
    ["Payment Failed", "Payment failed multiple times during checkout. Needs urgent resolution."],
    ["Product Defect", "The delivered product is defective and not working as expected. Wants replacement."],
    ["Account Suspension", "Customer account was suspended without notification. Wants clarification."],
    ["Shipping Delay", "Order not received yet. Expected delivery was 5 days ago. Needs update."],
    ["Refund Request", "Customer requested a refund for the last order but hasn't received confirmation."],
    ["Feature Request", "Customer wants a dark mode feature in the mobile app for better usability."],
    ["App Crash", "App crashes when opening the profile page. Happens on both iOS and Android."],
    ["Wrong Billing", "Customer was charged twice for the same order. Needs correction immediately."],
    ["Password Reset", "Customer forgot password and password reset email not received."]
]

# Build DataFrame
df = pd.DataFrame(data, columns=["Ticket Subject", "Ticket Description"])
df['TicketText'] = df['Ticket Subject'] + ' ' + df['Ticket Description']

# Save CSV
df.to_csv("sample_test.csv", index=False)
print("✅ sample_test.csv created successfully!")

# Optional: for Colab, provide download link
from google.colab import files
files.download("sample_test.csv")


✅ sample_test.csv created successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import pandas as pd

df = pd.read_csv("/content/customer_support_tickets.csv")
print(df.columns)


Index(['Ticket ID', 'Customer Name', 'Customer Email', 'Customer Age',
       'Customer Gender', 'Product Purchased', 'Date of Purchase',
       'Ticket Type', 'Ticket Subject', 'Ticket Description', 'Ticket Status',
       'Resolution', 'Ticket Priority', 'Ticket Channel',
       'First Response Time', 'Time to Resolution',
       'Customer Satisfaction Rating'],
      dtype='object')
