# Task 1: Customer Support Ticket Classifier

✅ Step 1: Data Preparation

In [2]:
# Install Required Libraries
# !pip install pandas openpyxl nltk scikit-learn
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt_tab')

In [3]:
# !pip install xlrd


In [7]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:

# Load the file (adjust filename if needed)
import pandas as pd

df = pd.read_excel(r'C:\Users\Saikiran\python1\company tasks\Vijayi Technologies\customerticket\ai_dev_assignment_tickets_complex_1000.xls', engine="xlrd")
df.head()


Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
1,2,Can you tell me more about the UltraClean Vacu...,General Inquiry,,UltraClean Vacuum
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
4,5,Order #30903 for Vision LED TV is 13 days late...,Late Delivery,,Vision LED TV


In [9]:
df.info()
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ticket_id      1000 non-null   int64 
 1   ticket_text    945 non-null    object
 2   issue_type     924 non-null    object
 3   urgency_level  948 non-null    object
 4   product        1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


ticket_id         0
ticket_text      55
issue_type       76
urgency_level    52
product           0
dtype: int64

In [10]:
# Preprocessing Function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords & lemmatize
    cleaned = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(cleaned)


In [11]:
# Apply Preprocessing
df['clean_text'] = df['ticket_text'].astype(str).apply(preprocess_text)
df[['ticket_text', 'clean_text']].head()

Unnamed: 0,ticket_text,clean_text
0,Payment issue for my SmartWatch V2. I was unde...,payment issue smartwatch v2 underbilled order ...
1,Can you tell me more about the UltraClean Vacu...,tell ultraclean vacuum warranty also available...
2,I ordered SoundWave 300 but got EcoBreeze AC i...,ordered soundwave 300 got ecobreeze ac instead...
3,Facing installation issue with PhotoSnap Cam. ...,facing installation issue photosnap cam setup ...
4,Order #30903 for Vision LED TV is 13 days late...,order 30903 vision led tv 13 day late ordered ...


In [12]:
df.isnull().sum()
df['clean_text'].str.split().apply(len).describe()


count    1000.000000
mean        8.711000
std         3.748537
min         1.000000
25%         7.000000
50%         8.000000
75%        10.000000
max        21.000000
Name: clean_text, dtype: float64

🚀 Step 2: Feature Engineering

In [13]:
# !pip install textblob
from textblob import TextBlob

In [14]:
# Drop rows where issue_type or urgency_level is NaN
df = df.dropna(subset=['issue_type', 'urgency_level'])


In [15]:
# Ticket Length Feature
df['ticket_length'] = df['clean_text'].apply(lambda x: len(x.split()))

In [16]:
# Sentiment Score Feature
df['sentiment'] = df['clean_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [17]:
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['clean_text'])


In [18]:
# Combine TF-IDF with Extra Features
from scipy.sparse import hstack

ticket_len_feat = df['ticket_length'].values.reshape(-1, 1)
sentiment_feat = df['sentiment'].values.reshape(-1, 1)

# Stack all features together
X_final = hstack([X_tfidf, ticket_len_feat, sentiment_feat])


✅ Step 3: Multi-Task Learning (Model Training)

In [19]:
# Encode categorical labels into numbers
from sklearn.preprocessing import LabelEncoder

le_issue = LabelEncoder()
le_urgency = LabelEncoder()

df['issue_label'] = le_issue.fit_transform(df['issue_type'])
df['urgency_label'] = le_urgency.fit_transform(df['urgency_level'])

In [20]:
from sklearn.model_selection import train_test_split

X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    X_final, df['issue_label'], test_size=0.2, random_state=42, stratify=df['issue_label']
)

X_train_u, X_test_u, y_train_u, y_test_u = train_test_split(
    X_final, df['urgency_label'], test_size=0.2, random_state=42, stratify=df['urgency_label']
)



In [21]:
from sklearn.linear_model import LogisticRegression

# Model for issue type
model_issue = LogisticRegression(max_iter=1000)
model_issue.fit(X_train_i, y_train_i)

# Model for urgency level
from sklearn.ensemble import RandomForestClassifier

model_urgency = RandomForestClassifier(n_estimators=100, random_state=42)
model_urgency.fit(X_train_u, y_train_u)



Evaluation

In [22]:
from sklearn.metrics import classification_report

# Issue type report
print("Issue Type Classification Report:")
print(classification_report(y_test_i, model_issue.predict(X_test_i), target_names=[str(cls) for cls in le_issue.classes_]))

# Urgency level report
print("Urgency Level Classification Report:")
print(classification_report(y_test_u, model_urgency.predict(X_test_u), target_names=[str(cls) for cls in le_urgency.classes_]))



Issue Type Classification Report:
                    precision    recall  f1-score   support

    Account Access       1.00      1.00      1.00        27
   Billing Problem       1.00      0.93      0.96        28
   General Inquiry       0.85      1.00      0.92        28
Installation Issue       1.00      0.96      0.98        27
     Late Delivery       1.00      0.90      0.95        21
    Product Defect       1.00      1.00      1.00        23
        Wrong Item       1.00      1.00      1.00        22

          accuracy                           0.97       176
         macro avg       0.98      0.97      0.97       176
      weighted avg       0.98      0.97      0.97       176

Urgency Level Classification Report:
              precision    recall  f1-score   support

        High       0.31      0.36      0.33        61
         Low       0.30      0.25      0.28        55
      Medium       0.42      0.42      0.42        60

    accuracy                           0.35     

Prediction function

In [23]:
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import nltk
from scipy.sparse import hstack

# Download NLTK data if not already
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

# Main function
def predict_and_extract(text, issue_model, urgency_model, tfidf_vectorizer, label_enc_issue, label_enc_urgency):
    # Step 1: Preprocess
    cleaned_text = preprocess_text(text)

    # Step 2: Feature extraction
    X_tfidf = tfidf_vectorizer.transform([cleaned_text])  # (1, n_features)

    ticket_length = np.array([[len(cleaned_text.split())]])  # (1, 1)
    sentiment_score = np.array([[TextBlob(cleaned_text).sentiment.polarity]])  # (1, 1)

    # Combine into X_final-like structure
    X_input = hstack([X_tfidf, ticket_length, sentiment_score])

    # Step 3: Predict
    issue_pred = issue_model.predict(X_input)
    urgency_pred = urgency_model.predict(X_input)

    issue_decoded = label_enc_issue.inverse_transform(issue_pred)[0]
    urgency_decoded = label_enc_urgency.inverse_transform(urgency_pred)[0]

    # Step 4: Entity extraction
    product_list = ['WidgetX', 'GadgetPro', 'SuperWidget']
    found_products = [p for p in product_list if p.lower() in text.lower()]

    # Detect dates like "May 12, 2024" or "12/05/2024"
    dates = re.findall(r'\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\w+ \d{1,2}, \d{4})\b', text)

    # Keywords for complaints
    complaint_keywords = ['broken', 'late', 'error', 'damaged', 'not working']
    found_keywords = [w for w in complaint_keywords if w in text.lower()]

    # Return structured result
    return {
        "issue_type": issue_decoded,
        "urgency_level": urgency_decoded,
        "entities": {
            "products": found_products,
            "dates": dates,
            "complaint_keywords": found_keywords
        }
    }


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Saikiran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saikiran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Saikiran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
result = predict_and_extract(
    text="My WidgetX was delivered late on May 12, 2024 and is broken.",
    issue_model=model_issue,
    urgency_model=model_urgency,
    tfidf_vectorizer=tfidf,
    label_enc_issue=le_issue,
    label_enc_urgency=le_urgency
)

print(result)


{'issue_type': 'Late Delivery', 'urgency_level': 'Low', 'entities': {'products': ['WidgetX'], 'dates': ['May 12, 2024'], 'complaint_keywords': ['broken', 'late']}}


To save models

In [25]:
import joblib

#Save models
joblib.dump(model_issue, 'model_issue.pkl')
joblib.dump(model_urgency, 'model_urgency.pkl')

#Save TF-IDF vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

#Save label encoders
joblib.dump(le_issue, 'label_encoder_issue.pkl')
joblib.dump(le_urgency, 'label_encoder_urgency.pkl')


['label_encoder_urgency.pkl']

In [26]:
!pip install gradio



Gradio UI

In [27]:
import gradio as gr

def gradio_predict(text):
    try:
        result = predict_and_extract(
            text,
            model_issue,
            model_urgency,
            tfidf,
            le_issue,
            le_urgency
        )
        return (
            result['issue_type'],
            result['urgency_level'],
            result['entities']
        )
    except Exception as e:
        return "Error", "Error", {"error": str(e)}

gr.Interface(
    fn=gradio_predict,
    inputs=gr.Textbox(lines=5, placeholder="Enter your support ticket here..."),
    outputs=[
        gr.Textbox(label="Predicted Issue Type"),
        gr.Textbox(label="Predicted Urgency Level"),
        gr.JSON(label="Extracted Entities")
    ],
    title="Customer Support Ticket Classifier",
    description="Enter a support ticket text. The model will classify the issue type and urgency level, and extract relevant entities."
).launch()


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Using existing dataset file at: .gradio\flagged\dataset1.csv
