In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [13]:
# Load the pre-cleaned ticket dataset (path is relative to the notebook's
# working directory) and preview the first rows.
data = pd.read_csv(filepath_or_buffer="cleaned_tickets_ready.csv")
data.head()

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product,clean_text
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2,pay issue my smartwatch v2 i underbill order 2...
1,2,Can you tell me more about the UltraClean Vacu...,General Inquiry,,UltraClean Vacuum,can you tell me more about ultraclean vacuum w...
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300,i order soundwave 300 but got ecobreeze ac ins...
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam,fac installation issue photosnap cam setup fai...
4,5,Order #30903 for Vision LED TV is 13 days late...,Late Delivery,,Vision LED TV,order 30903 vision led tv 13 day late order 03...


In [14]:
# Preview the last rows; index 997 shows a ticket whose text fields are empty.
data.tail()

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product,clean_text
995,996,I ordered EcoBreeze AC but got FitRun Treadmil...,Wrong Item,High,EcoBreeze AC,i order ecobreeze ac but got fitrun treadmill ...
996,997,I ordered SoundWave 300 but got PowerMax Batte...,Wrong Item,Low,SoundWave 300,i order soundwave 300 but got powermax battery...
997,998,,Installation Issue,Medium,EcoBreeze AC,
998,999,Payment issue fr mi SoundWave 300. I was debit...,Billing Problem,Low,SoundWave 300,pay issue fr mi soundwave 300 i debit incorrec...
999,1000,PowerMax Battery is missing. It stopped workin...,Product Defect,Low,PowerMax Battery,powermax battery miss stopp work after just 1 day


In [15]:
# List the column names (the "Unnamed: 0" label in the rendered previews is not among them).
print(data.columns)

Index(['ticket_id', 'ticket_text', 'issue_type', 'urgency_level', 'product',
       'clean_text'],
      dtype='object')


In [16]:
# Keep only tickets that have cleaned text and both target labels.
# `.copy()` makes the result an independent frame, so columns can be added to
# it later without pandas raising SettingWithCopyWarning.
data_filtered = data.dropna(subset=['clean_text', 'issue_type', 'urgency_level']).copy()

In [18]:
# FEATURE ENGINEERING (TF-IDF + ticket length)
# NOTE(review): the vectorizer is fitted on every filtered row, i.e. before the
# train/test split performed later in the notebook, so test-set vocabulary and
# IDF statistics leak into the features — consider fitting on training rows only.
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(data_filtered['clean_text'])

# Ticket length = number of whitespace-separated tokens in the cleaned text.
# `assign` builds a new frame instead of writing into `data_filtered` in place:
# the frame is the result of `dropna`, so an in-place column write triggers
# SettingWithCopyWarning, and rebinding keeps this cell idempotent on re-run.
data_filtered = data_filtered.assign(
    ticket_length=data_filtered['clean_text'].str.split().str.len()
)
X_length = data_filtered['ticket_length'].to_numpy().reshape(-1, 1)

# Combine features: the TF-IDF columns followed by one raw length column.
# CSR is requested explicitly because `hstack` returns COO by default, which
# cannot be row-indexed directly; CSR supports the row selection a split needs.
# NOTE(review): the length column is unscaled while TF-IDF values lie in [0, 1];
# a scaler may help the linear models — it would have to be reused at inference.
X_combined = hstack([X_tfidf, X_length], format='csr')

# Labels for the two classification tasks
y_issue_type = data_filtered['issue_type']
y_urgency_level = data_filtered['urgency_level']

print("TF-IDF shape:", X_tfidf.shape)
print("Combined shape:", X_combined.shape)
print("Labels:", y_issue_type.shape, y_urgency_level.shape)

TF-IDF shape: (826, 464)
Combined shape: (826, 465)
Labels: (826,) (826,)


In [55]:
# MULTI-TASK CLASSIFICATION
# A single split call over the shared feature matrix and both label vectors.
# It produces the same row partition as two separate calls made with the same
# seed and test size, so both tasks are trained and evaluated on identical rows.
(X_train_1, X_test_1,
 y_train_1, y_test_1,
 y_train_2, y_test_2) = train_test_split(
    X_combined, y_issue_type, y_urgency_level, test_size=0.2, random_state=42
)
X_train_2, X_test_2 = X_train_1, X_test_1


def _print_classifier_report(header, y_true, y_pred):
    """Print the accuracy, confusion matrix and per-class report for one task."""
    print(header)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))


# One logistic-regression model per task (`fit` returns the fitted estimator).
model_issue = LogisticRegression(max_iter=1000).fit(X_train_1, y_train_1)
model_urgency = LogisticRegression(max_iter=1000).fit(X_train_2, y_train_2)

# Issue type: predict on the held-out rows and report
pred_issue = model_issue.predict(X_test_1)
_print_classifier_report("\nIssue Type Classifier Results--", y_test_1, pred_issue)

# Urgency level: predict on the held-out rows and report
pred_urgency = model_urgency.predict(X_test_2)
_print_classifier_report("\nUrgency Level Classifier Results", y_test_2, pred_urgency)



Issue Type Classifier Results--
Accuracy: 1.0
Confusion Matrix:
 [[23  0  0  0  0  0  0]
 [ 0 19  0  0  0  0  0]
 [ 0  0 25  0  0  0  0]
 [ 0  0  0 29  0  0  0]
 [ 0  0  0  0 17  0  0]
 [ 0  0  0  0  0 30  0]
 [ 0  0  0  0  0  0 23]]

Classification Report:
                     precision    recall  f1-score   support

    Account Access       1.00      1.00      1.00        23
   Billing Problem       1.00      1.00      1.00        19
   General Inquiry       1.00      1.00      1.00        25
Installation Issue       1.00      1.00      1.00        29
     Late Delivery       1.00      1.00      1.00        17
    Product Defect       1.00      1.00      1.00        30
        Wrong Item       1.00      1.00      1.00        23

          accuracy                           1.00       166
         macro avg       1.00      1.00      1.00       166
      weighted avg       1.00      1.00      1.00       166


Urgency Level Classifier Results
Accuracy: 0.3493975903614458
Confusion Matr

In [57]:
# ENTITY EXTRACTION
import re

# Complaint keywords looked up in the ticket text (matched as whole words).
complaint_keywords = ['broken', 'error', 'late', 'not working', 'damaged', 'failed', 'missing', 'defective']

# Month names and their usual abbreviations, spelled out explicitly. A loose
# "prefix + any letters" pattern would also accept ordinary words such as
# "maybe", "market" or "decide" when they happen to be followed by a number.
_MONTH_NAME = (
    r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|'
    r'aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
)
# Day of month with an optional ordinal suffix ("5", "05", "5th").
_DAY_NUMBER = r'\d{1,2}(?:st|nd|rd|th)?'

# All groups are non-capturing so that `re.findall` returns the full match.
date_patterns = [
    r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',                          # 12/01/2024
    r'\b\d{1,2}-\d{1,2}-\d{2,4}\b',                          # 12-01-2024
    r'\b' + _MONTH_NAME + r'\.?\s+' + _DAY_NUMBER + r'\b',   # march 5, mar. 5th
    r'\b' + _DAY_NUMBER + r'\s+' + _MONTH_NAME + r'\b',      # 03 march
]


def extract_entities(text, product_list):
    """Extract product names, dates and complaint keywords from a ticket.

    Matching is case-insensitive (the text is lower-cased first).

    Parameters
    ----------
    text : str
        Raw ticket text. Any non-string value (e.g. ``None`` or the ``NaN``
        pandas uses for a missing cell) is treated as "no text".
    product_list : iterable
        Candidate product names; non-string entries are skipped.

    Returns
    -------
    dict
        ``{"products": [...], "dates": [...], "complaints": [...]}``.
        Products keep their original spelling from ``product_list``, dates are
        the lower-cased matched substrings in ``date_patterns`` order, and
        complaints are the entries of ``complaint_keywords`` that were found.
    """
    entities = {
        "products": [],
        "dates": [],
        "complaints": []
    }

    # A missing ticket text carries no entities; return the empty structure
    # instead of failing on `.lower()`.
    if not isinstance(text, str):
        return entities

    text_lower = text.lower()

    # Products: plain substring match of each known product name.
    for prod in product_list:
        if isinstance(prod, str) and prod.lower() in text_lower:
            entities["products"].append(prod)

    # Dates: collect every match of every supported format.
    for pattern in date_patterns:
        entities["dates"].extend(re.findall(pattern, text_lower))

    # Complaints: whole-word match (optional plural "s") so that e.g. "late"
    # is not reported for "related", nor "damaged" for "undamaged".
    for word in complaint_keywords:
        if re.search(r'\b' + re.escape(word) + r's?\b', text_lower):
            entities["complaints"].append(word)

    return entities


# Demo: run the extractor on the first filtered ticket, using every distinct
# non-null product name in the dataset as the product catalogue.
product_list = data_filtered['product'].dropna().unique()

sample_row = data_filtered.iloc[0]
ticket_text = sample_row['ticket_text']
entities = extract_entities(ticket_text, product_list)

print("Ticket Text:", ticket_text, sep="\n")
print("\nExtracted Entities:", entities, sep="\n")


Ticket Text:
Payment issue for my SmartWatch V2. I was underbilled for order #29224.

Extracted Entities:
{'products': ['SmartWatch V2'], 'dates': [], 'complaints': []}


In [62]:
# INTEGRATION FUNCTION
# Basic stopword list for manual cleaning of incoming ticket text.
# 'i', 'my', 'you' and 'but' are deliberately NOT listed: the `clean_text`
# rows the TF-IDF vectorizer was fitted on still contain them (e.g.
# "pay issue my smartwatch v2 i underbill", "can you tell me more about",
# "i order soundwave 300 but got"), so stripping them at inference time would
# drop known vocabulary and shrink the ticket-length feature relative to training.
# NOTE(review): the remaining entries could not be checked against the cleaner
# that produced `clean_text` (not visible here) — confirm the two lists agree.
basic_stopwords = {
    'am', 'is', 'the', 'a', 'an', 'to', 'in', 'on', 'for', 'with', 'it', 'this', 'that',
    'and', 'or', 'of', 'was', 'were', 'are', 'we', 'not', 'so', 'be', 'have'
}

# Distinct non-null product names, used as the catalogue for entity extraction.
product_list = data_filtered['product'].dropna().unique()

def process_ticket(text):
    """Classify one raw ticket and extract its entities.

    The text is lower-cased, stripped of every character that is not a letter,
    digit or whitespace, and filtered through ``basic_stopwords``. The cleaned
    text is vectorised with the fitted ``tfidf`` and extended with its token
    count, then passed to ``model_issue`` and ``model_urgency``. Entities are
    extracted from the original, uncleaned text.

    NOTE(review): the `clean_text` column used for training looks stemmed
    (e.g. "pay", "underbill", "fac"), whereas no stemming happens here —
    confirm that inference-time cleaning matches the training-time cleaning.

    Parameters
    ----------
    text : str
        Raw ticket text.

    Returns
    -------
    dict
        Keys ``predicted_issue_type``, ``predicted_urgency_level`` and
        ``extracted_entities``.
    """
    # Normalise the text and drop stopwords.
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    kept_tokens = [token for token in stripped.split() if token not in basic_stopwords]
    text_clean = ' '.join(kept_tokens)

    # Same feature layout as in training: TF-IDF columns, then one length column.
    length_column = np.array([[len(kept_tokens)]])
    features = hstack([tfidf.transform([text_clean]), length_column])

    return {
        "predicted_issue_type": model_issue.predict(features)[0],
        "predicted_urgency_level": model_urgency.predict(features)[0],
        "extracted_entities": extract_entities(text, product_list),
    }


# Smoke-test the full pipeline on a hand-written ticket.
ticket_input = "The customer received a defective AirCooler Pro on March 5. Please replace it immediately."
result = process_ticket(ticket_input)

print("Final Output:", result, sep="\n")


Final Output:
{'predicted_issue_type': 'Late Delivery', 'predicted_urgency_level': 'Medium', 'extracted_entities': {'products': [], 'dates': ['march 5'], 'complaints': ['defective']}}
