In [1]:
import pandas as pd

# Load the support-ticket CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("customer_support_tickets.csv")

# Further explore Ticket Type, Ticket Subject, Description 
After exploring the dataset to gain an initial understanding, this section takes a closer look at the ticket type, subject, and description.

In [11]:
# Count tickets per subject to see how balanced the categories are.
df["Ticket Subject"].value_counts()

Ticket Subject
Refund request              576
Software bug                574
Product compatibility       567
Delivery problem            561
Hardware issue              547
Battery life                542
Network problem             539
Installation support        530
Product setup               529
Payment issue               526
Product recommendation      517
Account access              509
Peripheral compatibility    496
Data loss                   491
Cancellation request        487
Display issue               478
Name: count, dtype: int64

There are 16 different ticket subjects, and they are roughly evenly distributed (478 to 576 tickets each). The content of the ticket description can be used to infer the customer's intent, which helps in categorizing tickets by subject.

In [2]:
# Preview the first five raw descriptions of every ticket subject,
# in the order the subjects first appear in the data.
subject_column = df["Ticket Subject"]
for subject_name in subject_column.unique():
    belongs_to_subject = subject_column == subject_name
    sample_texts = df.loc[belongs_to_subject, "Ticket Description"].head(5)
    print(f"Some example for: {subject_name}")
    for sample_text in sample_texts:
        print("----------------------------------------------------------------------------------------")
        print(sample_text)
    print("============================================================================================")


Some example for: Product setup
----------------------------------------------------------------------------------------
I'm having an issue with the {product_purchased}. Please assist.

Your billing zip code is: 71701.

We appreciate that you have requested a website address.

Please double check your email address. I've tried troubleshooting steps mentioned in the user manual, but the issue persists.
----------------------------------------------------------------------------------------
I've recently set up my {product_purchased}, but it fails to connect to any available networks. What steps should I take to troubleshoot this issue?

I can't find the 'Product_IP' field of the I'm concerned about the security of my {product_purchased} and would like to ensure that my data is safe.
----------------------------------------------------------------------------------------
I'm having an issue with the {product_purchased}. Please assist.

Product Name: TPUBASK3E3KQ0


Join Date: Oct 2007 P

- The phrases "I'm having an issue with the {product_purchased}." and "Please assist." appear a lot, but carry little unique information.
- The ticket description contains non-text elements and potential code injections, which should be removed.

The process below lowercases the text, removes special characters, lemmatizes words, and filters out non-informative phrases, yielding clean tokens for vocabulary construction and frequency counting. Stop words are retained to support a GenAI-based classifier.

In [12]:
import re
import nltk
import spacy
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from transformers import AutoTokenizer
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter

In [13]:
# Download the NLTK stop-word list and load the small English spaCy pipeline
# with its parser and NER components disabled.
# NOTE(review): no visible cell reads the NLTK stop-word list — confirm the
# download is still needed.
nltk.download("stopwords")
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Boilerplate phrases to strip from every ticket. Each entry is a regex
# fragment; they are OR-ed together into one case-insensitive pattern.
phrases_to_remove = [
    r"i'?m having an issue with (the )?\{?product_purchased\}?\s*\.?",
    r"please assist\.?",
    r"let me know\.?",
    r"thank you\.?",
]
phrase_pattern = re.compile(r"|".join(phrases_to_remove), re.IGNORECASE)

# Clean text function
def clean_text(text):
    """Normalise one ticket description into a space-separated string of lemmas.

    Steps: lowercase the text, blank out the boilerplate phrases matched by
    the module-level ``phrase_pattern``, replace every character other than
    ``a-z`` and whitespace with a space, then lemmatise with the module-level
    spaCy pipeline ``nlp``. Stop words are kept; only lemmas that are not
    purely alphabetic are dropped.

    Args:
        text: Raw ticket description. Non-string values (for example ``None``
            or the float ``NaN`` pandas uses for a missing cell) are treated
            as empty text.

    Returns:
        The surviving lemmas joined by single spaces, or ``""`` when the
        input is not a string.
    """
    # Missing cells are not strings and have no .lower(); map them to "".
    if not isinstance(text, str):
        return ""

    # Lowercase first so the a-z filter below keeps every letter.
    text = text.lower()
    # Substitute a space rather than nothing so the words on either side of
    # a removed phrase cannot fuse into one token.
    text = phrase_pattern.sub(" ", text)

    # Replace digits, punctuation and any other non-letter with a space.
    text = re.sub(r"[^a-z\s]", " ", text)

    # Lemmatise; stop words are intentionally retained, whitespace-only and
    # other non-alphabetic lemmas are discarded.
    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
        if token.lemma_.isalpha()
    ]

    return " ".join(tokens)
    
# Store the cleaned, lemmatised version of each description in a new column.
df["Ticket Content"] = df["Ticket Description"].map(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/trunght/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Preview the first three cleaned texts ("Ticket Content") of every ticket subject.
for current_subject in df["Ticket Subject"].unique():
    cleaned_samples = df.loc[df["Ticket Subject"] == current_subject, "Ticket Content"].head(3)
    print(f"Some example for: {current_subject}")
    for cleaned_text in cleaned_samples:
        print("----------------------------------------------------------------------------------------")
        print(cleaned_text)
    print("============================================================================================")


Some example for: Product setup
----------------------------------------------------------------------------------------
your billing zip code be we appreciate that you have request a website address please double check your email address I ve try troubleshoot step mention in the user manual but the issue persist
----------------------------------------------------------------------------------------
I ve recently set up my product purchase but it fail to connect to any available network what step should I take to troubleshoot this issue I can t find the product ip field of the I m concerned about the security of my product purchase and would like to ensure that my data be safe
----------------------------------------------------------------------------------------
product name tpubask e kq join date oct post quote I ve recently update the firmware of my product purchase and the issue start happen afterward could it be relate to the update
Some example for: Peripheral compatibility
---

In [15]:
# Length of the longest cleaned ticket ("Ticket Content"), counted in whitespace-separated tokens.
df["Ticket Content"].str.split().str.len().max()

66

In [23]:
# Rank 2- to 4-word n-grams by TF-IDF to find the most important n-grams for each subject
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# One "document" per subject: all of its cleaned tickets joined with spaces.
grouped = df.groupby("Ticket Subject")["Ticket Content"].apply(" ".join)

vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(2, 4))
X = vectorizer.fit_transform(grouped)

# Dense subject-by-n-gram score table (one row per subject).
tfidf_df = pd.DataFrame(
    X.toarray(),
    index=grouped.index,
    columns=vectorizer.get_feature_names_out(),
)

# For every subject keep its ten highest-scoring n-grams as (phrase, score) pairs.
top_ngrams = {
    subject_name: list(scores.sort_values(ascending=False).head(10).items())
    for subject_name, scores in tfidf_df.iterrows()
}

for subject_name, ranked_ngrams in top_ngrams.items():
    print(f"\n-Ticket Subject: {subject_name}")
    for ngram, tfidf_score in ranked_ngrams:
        print(f"{ngram}: {tfidf_score:.4f}")


-Ticket Subject: Account access
product purchase: 0.5989
ve notice: 0.0825
issue persist: 0.0765
software update: 0.0765
ve check: 0.0657
work fine: 0.0645
ve try: 0.0622
purchase issue: 0.0574
product purchase issue: 0.0574
error message: 0.0562

-Ticket Subject: Battery life
product purchase: 0.5947
ve notice: 0.0880
error message: 0.0835
issue persist: 0.0813
product purchase issue: 0.0724
purchase issue: 0.0724
ve try: 0.0702
resolve problem: 0.0590
purchase screen: 0.0535
product purchase screen: 0.0535

-Ticket Subject: Cancellation request
product purchase: 0.6138
issue persist: 0.0872
ve notice: 0.0763
ve try: 0.0702
product purchase issue: 0.0690
purchase issue: 0.0690
software update: 0.0605
work fine: 0.0569
ve check: 0.0521
resolve problem: 0.0508

-Ticket Subject: Data loss
product purchase: 0.5894
ve notice: 0.0837
issue persist: 0.0776
error message: 0.0728
software update: 0.0679
resolve problem: 0.0643
ve check: 0.0643
ve try: 0.0643
product purchase issue: 0.0619
pur

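The top n-grams are almost identical across subjects (all are dominated by the `product purchase` placeholder), so they are not enough on their own to determine the ticket subject.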

In [30]:
#Use sentence embedding to determine typical tickets for each subject. 
#We first embed each sentence, get the centroid embedding by the mean of all sentence in each subject, 
#then choose top k similar sentence to the centroid

from transformers import AutoTokenizer, T5EncoderModel
from sklearn.metrics.pairwise import cosine_similarity

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer and encoder are loaded from the same checkpoint.
MODEL_NAME = "sentence-transformers/sentence-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = T5EncoderModel.from_pretrained(MODEL_NAME).to(device)

def get_sentence_t5_embeddings(texts, device=None, batch_size=32):
    """Embed texts with the module-level T5 encoder using masked mean pooling.

    The texts are tokenised with the module-level ``tokenizer`` and encoded
    by the module-level ``model`` in mini-batches, so a large list is never
    pushed through the encoder in one go. Each text's embedding is the mean
    of its token states, with padding positions excluded via the attention
    mask so the result does not depend on what else is in the batch.

    Args:
        texts: A single string or a sequence of strings.
        device: Device the tokenised inputs are moved to. ``None`` (the
            default) uses the device of the model's parameters, which avoids
            a crash when no GPU is available.
        batch_size: Number of texts encoded per forward pass (at least 1).

    Returns:
        A ``torch.Tensor`` on the CPU with one row per input text. For empty
        input the tensor has zero rows and ``model.config.hidden_size``
        columns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Accept a bare string as a one-element batch, as the tokenizer does.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    if device is None:
        # Follow the model instead of assuming a GPU exists.
        device = next(model.parameters()).device

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            output = model(**inputs)
        hidden = output.last_hidden_state
        # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1)
        # Move each batch to the CPU right away so GPU memory is released
        # and callers can convert the result with .numpy().
        pooled_batches.append((summed / token_counts).cpu())

    return torch.cat(pooled_batches, dim=0)

def top_n_typical_descriptions(df, n=3):
    """Pick the most "typical" cleaned descriptions for every ticket subject.

    For each group of rows sharing a "Ticket Subject", the "Ticket Content"
    texts are embedded with ``get_sentence_t5_embeddings``, the group
    centroid is taken as the mean embedding, and the ``n`` texts with the
    highest cosine similarity to that centroid are kept.

    Args:
        df: DataFrame with "Ticket Subject" and "Ticket Content" columns.
        n: Number of descriptions to keep per subject. Values of 0 or below
            yield an empty list for every subject.

    Returns:
        Dict mapping each subject to a list of at most ``n`` descriptions,
        ordered from most to least similar to the centroid.
    """
    keep = max(n, 0)
    result = {}
    for subject, group in df.groupby("Ticket Subject"):
        descriptions = group["Ticket Content"].tolist()
        # Use the module-level `device` the model was moved to rather than the
        # helper's default, and bring the tensor to the CPU before .numpy(),
        # which raises on CUDA tensors.
        embeddings = (
            get_sentence_t5_embeddings(descriptions, device=device)
            .detach()
            .cpu()
            .numpy()
        )
        centroid = embeddings.mean(axis=0).reshape(1, -1)
        sims = cosine_similarity(embeddings, centroid).flatten()
        # Reverse first, then slice: `argsort()[-n:]` would return every
        # index when n == 0 because `-0` is just `0`.
        top_indices = sims.argsort()[::-1][:keep]
        result[subject] = [descriptions[i] for i in top_indices]
    return result

In [31]:
# Select the typical descriptions for every subject using the default n=3.
# NOTE(review): the saved output shows this call ended in a KeyboardInterrupt.
top_n_typical_descriptions(df)

KeyboardInterrupt: 

In [21]:
# class TicketDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_length=512):
#         self.data = dataframe
#         self.tokenizer = tokenizer
        
#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         row = self.data.iloc[idx]
#         ticket_subject = str(row['Ticket Subject'])
#         ticket_description = str(row['Ticket Description'])
#         ticket_type = str(row['Ticket Type'])

#         return ticket_type, ticket_subject, ticket_description        

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# dataset = TicketDataset(df, tokenizer)


In [None]:
# token_freq = Counter()

# # Token generator for vocab
# def yield_tokens(data_iter):
#     for _, _, text in data_iter:
#         tokens = clean_and_tokenize(text)
#         token_freq.update(tokens)
#         yield tokens

# # Build vocab
# vocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["<unk>", "<pad>"])
# vocab.set_default_index(vocab["<unk>"])


In [None]:
# The cells that defined `vocab` and `token_freq` are commented out, so this
# cell would raise NameError on a fresh kernel. Rebuild both here from the
# cleaned "Ticket Content" column, splitting on whitespace.
token_freq = Counter(
    token
    for content in df["Ticket Content"]
    for token in content.split()
)

# Plain-list stand-in for the vocabulary: the two special tokens first, then
# every observed token from most to least frequent.
vocab = ["<unk>", "<pad>"] + [token for token, _ in token_freq.most_common()]

print(f"Vocabulary size: {len(vocab)}")
print("Top 10 frequent tokens:")
for token, freq in token_freq.most_common(10):
    print(f"{token}: {freq}")
