In [1]:
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [4]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tharani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tharani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Base model for subject 
stop_words = set(stopwords.words("english"))

file_path = os.path.abspath("Data/aa_dataset-tickets-multi-lang-5-2-50-version.csv")

filtered_df = pd.read_csv(file_path)

filtered_df = filtered_df[filtered_df["language"] == "en"]

filtered_df = filtered_df.dropna(subset=["subject"])


def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

filtered_df["processed_subject"] = filtered_df["subject"].apply(preprocess)

keyword_mapping = {
    "Technical Support": ["error", "issue", "bug", "troubleshoot"],
    "Product Support": ["feature", "update", "compatibility"],
    "Returns and Exchanges": ["return", "exchange", "refund"],
    "Billing and Payments": ["invoice", "payment", "billing", "charge"],
    "Sales and Pre-Sales": ["pricing", "quote", "purchase", "buy"],
    "General Inquiry": ["question", "information", "ask"],
    "IT Support": ["server", "network", "login", "access"],
    "Service Outages and Maintenance": ["down", "maintenance", "outage"],
    "Human Resources": ["job", "application", "vacancy"],
    "Customer Service": ["help", "support", "contact"]
}

def categorize_ticket(tokens):
    for category, keywords in keyword_mapping.items():
        if any(word in tokens for word in keywords):
            return category
    return " "


filtered_df["subcategory"] = filtered_df["processed_subject"].apply(categorize_ticket)

category_mapping = {
    "Customer Support": ["Technical Support", "Product Support", "Returns and Exchanges", 
                         "Billing and Payments", "Sales and Pre-Sales", "General Inquiry"],
    "IT & Infrastructure": ["IT Support", "Service Outages and Maintenance"],
    "Internal Operations": ["Human Resources", "Customer Service"]
}

def get_category(subcategory):
    for category, subcategories in category_mapping.items():
        if subcategory in subcategories:
            return category
    return " "

filtered_df["category"] = filtered_df["subcategory"].apply(get_category)
print(filtered_df[["subject", "category", "subcategory"]])


                                                 subject          category  \
1                                     Account Disruption                     
2      Query About Smart Home System Integration Feat...                     
3                      Inquiry Regarding Invoice Details  Customer Support   
4      Question About Marketing Agency Software Compa...  Customer Support   
5                                          Feature Query  Customer Support   
...                                                  ...               ...   
28578                    Problem with Billing Adjustment  Customer Support   
28580  Urgent: Incident Involving Data Breach in Medi...                     
28582       Performance Problem with Data Analytics Tool                     
28585  Update Request for SaaS Platform Integration F...  Customer Support   
28586          Inquiry About Project Management Features                     

                subcategory  
1                            
2  

In [8]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(filtered_df["queue"], filtered_df["category"])
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 0.00%


In [None]:
# Base model for subject 
stop_words = set(stopwords.words("english"))

file_path = os.path.abspath("Data/aa_dataset-tickets-multi-lang-5-2-50-version.csv")

filtered_df = pd.read_csv(file_path)

filtered_df = filtered_df[filtered_df["language"] == "en"]

filtered_df = filtered_df.dropna(subset=["body"])

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

filtered_df["processed_body"] = filtered_df["body"].apply(preprocess)

keyword_mapping = {
    "Technical Support": ["error", "issue", "bug", "troubleshoot"],
    "Product Support": ["feature", "update", "compatibility"],
    "Returns and Exchanges": ["return", "exchange", "refund"],
    "Billing and Payments": ["invoice", "payment", "billing", "charge"],
    "Sales and Pre-Sales": ["pricing", "quote", "purchase", "buy"],
    "General Inquiry": ["question", "information", "ask"],
    "IT Support": ["server", "network", "login", "access"],
    "Service Outages and Maintenance": ["down", "maintenance", "outage"],
    "Human Resources": ["job", "application", "vacancy"],
    "Customer Service": ["help", "support", "contact"]
}

def categorize_ticket(tokens):
    for category, keywords in keyword_mapping.items():
        if any(word in tokens for word in keywords):
            return category
    return " "

filtered_df["subcategory"] = filtered_df["processed_body"].apply(categorize_ticket)

category_mapping = {
    "Customer Support": ["Technical Support", "Product Support", "Returns and Exchanges", 
                         "Billing and Payments", "Sales and Pre-Sales", "General Inquiry"],
    "IT & Infrastructure": ["IT Support", "Service Outages and Maintenance"],
    "Internal Operations": ["Human Resources", "Customer Service"]
}

def get_category(subcategory):
    for category, subcategories in category_mapping.items():
        if subcategory in subcategories:
            return category
    return " "

filtered_df["category"] = filtered_df["subcategory"].apply(get_category)

print(filtered_df[["body", "category", "subcategory"]])



Ticket Routing with Categories & Subcategories:
                                                    body             category  \
1      Dear Customer Support Team,\n\nI am writing to...     Customer Support   
2      Dear Customer Support Team,\n\nI hope this mes...     Customer Support   
3      Dear Customer Support Team,\n\nI hope this mes...     Customer Support   
4      Dear Support Team,\n\nI hope this message reac...     Customer Support   
5      Dear Customer Support,\n\nI hope this message ...  Internal Operations   
...                                                  ...                  ...   
28578  An unexpected billing discrepancy has been not...     Customer Support   
28580  A data breach has occurred, which might be rel...                        
28582  The data analytics tool experiences sluggish p...                        
28585  Requesting an update on the integration featur...     Customer Support   
28586  Looking for detailed information on the projec...    

In [None]:

# Sample DataFrame with subjects
filtered_df = pd.DataFrame({'subject': ["Technical issue with my laptop", 
                                        "Billing problem for last month's subscription", 
                                        "Question about product warranty"]})

# Preprocessing function
def preprocess(text):
    tokens = word_tokenize(text.lower())  
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]  
    return tokens

# Apply preprocessing
filtered_df['processed_subject'] = filtered_df['subject'].apply(preprocess)

# Ensure processed_subject contains lists
dictionary = corpora.Dictionary(filtered_df['processed_subject'])
corpus = [dictionary.doc2bow(text) for text in filtered_df['processed_subject']]

# Train LDA Model
lda_model = LdaModel(corpus, num_topics=3, id2word=dictionary, passes=10)

# Display Topics
print("\nIdentified Topics:")
for idx, topic in lda_model.show_topics(formatted=True):
    print(f"Topic {idx}: {topic}")
