In [1]:
import pandas as pd

df = pd.read_csv("customer_support_tickets.csv")

# Further explore Ticket Type, Ticket Subject, Description 
After exploring the dataset to gain an initial understanding, this code delves into the ticket type, subject, and description.

In [11]:
df["Ticket Subject"].value_counts()

Ticket Subject
Refund request              576
Software bug                574
Product compatibility       567
Delivery problem            561
Hardware issue              547
Battery life                542
Network problem             539
Installation support        530
Product setup               529
Payment issue               526
Product recommendation      517
Account access              509
Peripheral compatibility    496
Data loss                   491
Cancellation request        487
Display issue               478
Name: count, dtype: int64

There are 16 different ticket subjects, evenly distributed. The content of the ticket description can be used to infer the customer's intent, which helps in categorizing the ticket subjects.

In [2]:
#Print out some example
for subject in df["Ticket Subject"].unique():
    descriptions = df[df["Ticket Subject"] == subject].head(5)["Ticket Description"]
    print(f"Some example for: {subject}")
    for d in descriptions:
        print("----------------------------------------------------------------------------------------")
        print(d)                
    print("============================================================================================")


Some example for: Product setup
----------------------------------------------------------------------------------------
I'm having an issue with the {product_purchased}. Please assist.

Your billing zip code is: 71701.

We appreciate that you have requested a website address.

Please double check your email address. I've tried troubleshooting steps mentioned in the user manual, but the issue persists.
----------------------------------------------------------------------------------------
I've recently set up my {product_purchased}, but it fails to connect to any available networks. What steps should I take to troubleshoot this issue?

I can't find the 'Product_IP' field of the I'm concerned about the security of my {product_purchased} and would like to ensure that my data is safe.
----------------------------------------------------------------------------------------
I'm having an issue with the {product_purchased}. Please assist.

Product Name: TPUBASK3E3KQ0


Join Date: Oct 2007 P

- The phrases "I'm having an issue with the {product_purchased}." and "Please assist." appear a lot, but carry little unique information.
- The ticket description contains non-text elements and potential code injections, which should be removed.

The process below lowers the text, removes special characters, lemmatizes words, and filters out non-informative phrases, yielding clean tokens for vocabulary construction and frequency counting. Stop words are retained to support a GenAI-based classifier.

In [12]:
import re
import nltk
import spacy
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from transformers import AutoTokenizer
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter

In [43]:
# Custom phrases to remove (case-insensitive, regex-safe)
phrases_to_remove = [
    r"i'?m having an issue with (the )?\{?product_purchased\}?\s*\.?",
    r"please assist\.?",
    r"let me know\.?",
    r"thank you\.?",
]
phrase_pattern = re.compile(r"|".join(phrases_to_remove), re.IGNORECASE)

def clean_text(text):
    text = text.lower()
    text = phrase_pattern.sub("", text)
    text = re.sub(r"\s+", " ", text)

    return text
    
df["Ticket Content"] = df["Ticket Description"].apply(clean_text)

In [44]:
#Print out some example
for subject in df["Ticket Subject"].unique():
    descriptions = df[df["Ticket Subject"] == subject].head(3)["Ticket Content"]
    print(f"Some example for: {subject}")
    for d in descriptions:
        print("----------------------------------------------------------------------------------------")
        print(d)                
    print("============================================================================================")


Some example for: Product setup
----------------------------------------------------------------------------------------
 your billing zip code is: 71701. we appreciate that you have requested a website address. please double check your email address. i've tried troubleshooting steps mentioned in the user manual, but the issue persists.
----------------------------------------------------------------------------------------
i've recently set up my {product_purchased}, but it fails to connect to any available networks. what steps should i take to troubleshoot this issue? i can't find the 'product_ip' field of the i'm concerned about the security of my {product_purchased} and would like to ensure that my data is safe.
----------------------------------------------------------------------------------------
 product name: tpubask3e3kq0 join date: oct 2007 posts: 11,532 quote: i've recently updated the firmware of my {product_purchased}, and the issue started happening afterward. could it b

In [45]:
#Get max length of Ticket Description
df["Ticket Content"].str.split().str.len().max()

63

In [46]:
# Use TF-IDF for n-grams (2 to 4) to determine the most import n-grams for each subject
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

grouped = df.groupby("Ticket Subject")["Ticket Content"].apply(lambda texts: " ".join(texts))

vectorizer = TfidfVectorizer(ngram_range=(2, 4), stop_words="english")
X = vectorizer.fit_transform(grouped)

tfidf_df = pd.DataFrame(X.toarray(), index=grouped.index, columns=vectorizer.get_feature_names_out())

top_ngrams = {}

for subject in tfidf_df.index:
    row = tfidf_df.loc[subject]
    top = row.sort_values(ascending=False).head(10)
    top_ngrams[subject] = list(top.items())

for subject, phrases in top_ngrams.items():
    print(f"\n-Ticket Subject: {subject}")
    for phrase, score in phrases:
        print(f"{phrase}: {score:.4f}")


-Ticket Subject: Account access
ve noticed: 0.1057
issue persists: 0.0980
ve checked: 0.0842
ve tried: 0.0796
product_purchased issue: 0.0720
resolve problem: 0.0628
product_purchased account: 0.0612
works fine: 0.0612
acts unexpectedly: 0.0597
facing intermittent works: 0.0597

-Ticket Subject: Battery life
ve noticed: 0.1121
issue persists: 0.1036
product_purchased issue: 0.0923
ve tried: 0.0894
resolve problem: 0.0752
product_purchased screen: 0.0681
ve recently: 0.0653
troubleshooting steps: 0.0639
customer support: 0.0582
error message: 0.0554

-Ticket Subject: Cancellation request
issue persists: 0.1134
ve noticed: 0.0993
ve tried: 0.0914
product_purchased issue: 0.0867
ve checked: 0.0677
resolve problem: 0.0662
troubleshooting steps: 0.0583
ve recently: 0.0551
times acts unexpectedly: 0.0536
fine times: 0.0536

-Ticket Subject: Data loss
ve noticed: 0.1057
issue persists: 0.0980
resolve problem: 0.0812
ve tried: 0.0812
ve checked: 0.0812
product_purchased issue: 0.0766
product_

It seems hard to just based on top n-gram to determine the ticket subjects

In [47]:
#Use sentence embedding to determine typical tickets for each subject. 
#We first embed each sentence, get the centroid embedding by the mean of all sentence in each subject, 
#then choose top k similar sentence to the centroid

from transformers import AutoTokenizer, T5EncoderModel
from sklearn.metrics.pairwise import cosine_similarity

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/sentence-t5-base")
model = T5EncoderModel.from_pretrained("sentence-transformers/sentence-t5-base").to(device)

def get_sentence_t5_embeddings(texts, device = "cuda"):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(**inputs)
    embeddings = output.last_hidden_state.mean(dim=1)
    return embeddings

def top_n_typical_descriptions(df, n=3, device = "cuda"):
    result = {}
    for subject, group in df.groupby("Ticket Subject"):
        print(f"Processing for: {subject}")
        descriptions = group["Ticket Content"].tolist()
        embeddings = get_sentence_t5_embeddings(descriptions, device = device).detach().cpu().numpy()
        centroid = embeddings.mean(axis=0).reshape(1, -1)
        sims = cosine_similarity(embeddings, centroid).flatten()
        top_indices = sims.argsort()[-n:][::-1]
        result[subject] = [descriptions[i] for i in top_indices]
    return result

In [48]:
result = top_n_typical_descriptions(df)

Processing for: Account access
Processing for: Battery life
Processing for: Cancellation request
Processing for: Data loss
Processing for: Delivery problem
Processing for: Display issue
Processing for: Hardware issue
Processing for: Installation support
Processing for: Network problem
Processing for: Payment issue
Processing for: Peripheral compatibility
Processing for: Product compatibility
Processing for: Product recommendation
Processing for: Product setup
Processing for: Refund request
Processing for: Software bug


In [56]:
for key, values in result.items():
    print(f"Typical tickets for {key}")
    for value in values:
        print(f"- {value}")
    print("-" * 40)


Typical tickets for Account access
-  i'm trying to find a way to reset it to the {product_purchased}. it may be possible for another vendor to issue an error that might cause i'm not sure if this issue is specific to my device or if others have reported similar problems.
-  i'm having an issue with the {{product_purchased}} in my account, please help me fix this issue no matter what i do i've tried different settings and configurations on my {product_purchased}, but the issue persists.
-  i'm also having an issue with the {product_purchased}. please help us fix this. this is a big issue since we i'm concerned about the security of my {product_purchased} and would like to ensure that my data is safe.
----------------------------------------
Typical tickets for Battery life
-  i have noticed a problem with the {product_purchased}. i have been buying through the {product_featured_ i'm not sure if this issue is specific to my device or if others have reported similar problems.
- my {produ

In [21]:
# class TicketDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_length=512):
#         self.data = dataframe
#         self.tokenizer = tokenizer
        
#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         row = self.data.iloc[idx]
#         ticket_subject = str(row['Ticket Subject'])
#         ticket_description = str(row['Ticket Description'])
#         ticket_type = str(row['Ticket Type'])

#         return ticket_type, ticket_subject, ticket_description        

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# dataset = TicketDataset(df, tokenizer)


In [None]:
# token_freq = Counter()

# # Token generator for vocab
# def yield_tokens(data_iter):
#     for _, _, text in data_iter:
#         tokens = clean_and_tokenize(text)
#         token_freq.update(tokens)
#         yield tokens

# # Build vocab
# vocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["<unk>", "<pad>"])
# vocab.set_default_index(vocab["<unk>"])


In [None]:
print(f"Vocabulary size: {len(vocab)}")
print("Top 10 frequent tokens:")
for token, freq in token_freq.most_common(10):
    print(f"{token}: {freq}")
