In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
# Install a blanket "ignore" filter: no message/category/module is given, so
# every warning category is silenced from this point on.
# NOTE(review): this would also hide any deprecation or solver warnings raised
# by the later cells — consider narrowing it to the specific warning that
# motivated it.
warnings.filterwarnings("ignore")

In [2]:
# Read the ticket export, decoding the file as Latin-1, and preview the
# first rows as the cell output.
TICKETS_CSV = "data/tickets.csv"
df = pd.read_csv(TICKETS_CSV, encoding="latin1")
df.head()

Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3.0
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1.0


In [3]:
def clean_text(text):
    """Normalise a raw ticket description before vectorisation.

    The value is lower-cased, runs of digits are removed, ASCII punctuation
    (``string.punctuation``) is removed, and whitespace is collapsed to single
    spaces with no leading/trailing blanks.

    Args:
        text: The raw value. Non-string values are converted with ``str()``.
            Missing values (``None`` or a float NaN) are treated as empty.

    Returns:
        str: The cleaned text; ``''`` for missing or fully stripped input.
    """
    # A missing cell would otherwise be stringified into the literal token
    # "nan" / "none" and end up as a spurious vocabulary term.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Dropping digits and punctuation can leave runs of spaces behind
    # ("have 3 items" -> "have  items"); split/join collapses them and also
    # strips both ends.
    return ' '.join(text.split())

In [4]:
# Store a cleaned copy of every description in a new column, then show the
# raw and cleaned text side by side to sanity-check the cleaning.
# NOTE(review): the preview shows an unfilled "{product_purchase..." template
# placeholder that survives cleaning as the token "productpurchased" —
# confirm whether it should be substituted before modelling.
raw_descriptions = df['Ticket Description']
df['clean_text'] = raw_descriptions.map(clean_text)
df.loc[:, ['Ticket Description', 'clean_text']].head()

Unnamed: 0,Ticket Description,clean_text
0,I'm having an issue with the {product_purchase...,im having an issue with the productpurchased p...
1,I'm having an issue with the {product_purchase...,im having an issue with the productpurchased p...
2,I'm facing a problem with my {product_purchase...,im facing a problem with my productpurchased t...
3,I'm having an issue with the {product_purchase...,im having an issue with the productpurchased p...
4,I'm having an issue with the {product_purchase...,im having an issue with the productpurchased p...


In [5]:
# Model input is the cleaned description text; the first target is the
# ticket type column.
X, y_category = df['clean_text'], df['Ticket Type']

In [6]:
# Second target: the ticket priority column, taken from the same frame so it
# shares its index with X.
y_priority = df.loc[:, 'Ticket Priority']

In [7]:
# Split the features and BOTH label columns in one call. train_test_split
# applies the same shuffled row partition to every array it is given, so the
# category and priority labels are aligned with X_train / X_test by
# construction rather than by two separate calls happening to share a seed.
# Hold out 20% of rows; random_state fixes the shuffle for reproducibility.
(
    X_train, X_test,
    y_cat_train, y_cat_test,
    y_pri_train, y_pri_test,
) = train_test_split(
    X, y_category, y_priority, test_size=0.2, random_state=42
)

In [8]:
# TF-IDF features with English stop words removed and the vocabulary capped
# at 5000 terms. The vectoriser is fitted on the training split only and then
# reused, unchanged, to transform the test split.
tfidf_options = {'stop_words': 'english', 'max_features': 5000}
vectorizer = TfidfVectorizer(**tfidf_options)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [9]:
# Train the ticket-type classifier on the TF-IDF training matrix (fit()
# returns the fitted estimator, so construction and fitting are chained),
# then predict a type for each held-out ticket.
category_model = LogisticRegression(max_iter=1000).fit(
    X_train_tfidf, y_cat_train
)
y_cat_pred = category_model.predict(X_test_tfidf)

In [10]:
# Score the ticket-type predictions against the held-out labels.
# NOTE(review): the recorded run scored ~0.20 accuracy over five classes,
# which is about what uniform guessing would give — the description text
# appears to carry little signal for this target.
cat_accuracy = accuracy_score(y_cat_test, y_cat_pred)
cat_report = classification_report(y_cat_test, y_cat_pred)

print("Category Classification Accuracy:", cat_accuracy)
print("\nClassification Report:\n", cat_report)

Category Classification Accuracy: 0.20070838252656434

Classification Report:
                       precision    recall  f1-score   support

     Billing inquiry       0.19      0.13      0.15       357
Cancellation request       0.19      0.19      0.19       327
     Product inquiry       0.20      0.20      0.20       316
      Refund request       0.21      0.24      0.22       345
     Technical issue       0.22      0.24      0.23       349

            accuracy                           0.20      1694
           macro avg       0.20      0.20      0.20      1694
        weighted avg       0.20      0.20      0.20      1694



In [11]:
# Train the priority classifier on the same TF-IDF training matrix used for
# the ticket-type model (fit() returns the fitted estimator), then predict a
# priority for each held-out ticket.
priority_model = LogisticRegression(max_iter=1000).fit(
    X_train_tfidf, y_pri_train
)
y_pri_pred = priority_model.predict(X_test_tfidf)

In [12]:
# Score the priority predictions against the held-out labels.
# NOTE(review): the recorded run scored ~0.27 accuracy over four classes,
# close to the 0.25 expected from uniform guessing.
pri_accuracy = accuracy_score(y_pri_test, y_pri_pred)
pri_report = classification_report(y_pri_test, y_pri_pred)

print("Priority Prediction Accuracy:", pri_accuracy)
print("\nPriority Classification Report:\n", pri_report)

Priority Prediction Accuracy: 0.2680047225501771

Priority Classification Report:
               precision    recall  f1-score   support

    Critical       0.26      0.27      0.26       411
        High       0.27      0.29      0.28       409
         Low       0.25      0.24      0.25       415
      Medium       0.29      0.27      0.28       459

    accuracy                           0.27      1694
   macro avg       0.27      0.27      0.27      1694
weighted avg       0.27      0.27      0.27      1694

