In [35]:
import pandas as pd
import re
from datetime import datetime, timedelta

# Load the ticket CSV (path is relative to the notebook's working directory).
# Its 'product' column seeds the product vocabulary built in the next cell.
df = pd.read_csv("ai_dev_assignment_tickets_complex_1000.csv")


In [36]:
# Build the product vocabulary used by extract_entities.
# Normalise BEFORE de-duplicating: calling unique() first would keep case
# variants of the same product as separate entries once they are lowercased.
# astype(str) guards against non-string cells, which have no .lower().
product_list = (
    df['product']
    .dropna()
    .astype(str)
    .str.lower()
    .unique()
    .tolist()
)
print("Sample product list:", product_list[:10])


Sample product list: ['smartwatch v2', 'ultraclean vacuum', 'soundwave 300', 'photosnap cam', 'vision led tv', 'ecobreeze ac', 'robochef blender', 'fitrun treadmill', 'powermax battery', 'protab x1']


In [37]:
#NLP function to extract entities from text


# Month prefix -> month number. Every spelling accepted by _TEXTUAL_DATE_RE
# (including "sept") is identified by its first three letters.
_MONTH_NUMBERS = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}

# Full names first so the alternation reads longest-to-shortest.
_MONTH_ALTERNATION = (
    'january|february|march|april|june|july|august|september|october|'
    'november|december|sept|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec'
)

# "3rd march", "5th oct", "12 june" ... (applied to lower-cased text).
_TEXTUAL_DATE_RE = re.compile(
    rf'\b(\d{{1,2}})(?:st|nd|rd|th)?\s+({_MONTH_ALTERNATION})\b'
)

_ABSOLUTE_DATE_PATTERNS = [
    r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',   # YYYY/MM/DD or YYYY-MM-DD
    r'\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b',   # DD/MM/YYYY or DD-MM-YYYY
]
_ABSOLUTE_DATE_FORMATS = ['%Y/%m/%d', '%Y-%m-%d', '%d/%m/%Y', '%d-%m-%Y']

_ISSUE_KEYWORDS = [
    'broken', 'not working', 'doesn’t work', 'stopped working', 'damaged',
    'cracked', 'won’t start', 'flickering', 'slow', 'heating', 'noisy',
    'defective', 'malfunction', 'problem', 'error', 'issue', 'freeze'
]


def extract_entities(text, products=None, today=None):
    """Extract product names, dates and issue keywords from a ticket text.

    Parameters
    ----------
    text : str
        Raw ticket text. ``None`` is treated as an empty string; any other
        non-string value is converted with ``str()``.
    products : iterable of str, optional
        Known product names to look for. Defaults to the notebook-level
        ``product_list``.
    today : datetime.datetime or datetime.date, optional
        Reference date used to resolve "yesterday" / "today" / "tomorrow"
        and to supply the year for day-month mentions such as "5th october".
        Defaults to ``datetime.today()``; pass a fixed value for
        reproducible output.

    Returns
    -------
    dict
        ``{'products': [...], 'dates': [...], 'issues': [...]}`` where
        ``products`` is an alphabetically sorted list of matched product
        names, ``dates`` is a list of unique ``YYYY-MM-DD`` strings in
        order of discovery (relative words, then numeric dates, then
        day-month mentions) and ``issues`` lists the matched keywords in
        the order of the keyword table.
    """
    text = '' if text is None else str(text)
    text_lower = text.lower()
    if products is None:
        products = product_list
    if today is None:
        today = datetime.today()

    # --- Extract Product ---
    # A product matches as soon as it shares ANY word with the text, so a
    # partial mention such as "v2" is enough to return "smartwatch v2".
    text_words = set(re.findall(r'\w+', text_lower))
    extracted_products = {
        product for product in products
        if text_words.intersection(product.lower().split())
    }

    # --- Extract Dates ---
    date_entities = []

    def add_date(value):
        # Keep first-seen order but never report the same day twice.
        stamp = value.strftime('%Y-%m-%d')
        if stamp not in date_entities:
            date_entities.append(stamp)

    for word, offset in (('yesterday', -1), ('tomorrow', 1), ('today', 0)):
        if word in text_lower:
            add_date(today + timedelta(days=offset))

    for pattern in _ABSOLUTE_DATE_PATTERNS:
        for match in re.findall(pattern, text):
            for fmt in _ABSOLUTE_DATE_FORMATS:
                try:
                    parsed = datetime.strptime(match, fmt)
                except ValueError:
                    continue  # wrong layout for this candidate; try the next
                add_date(parsed)
                break

    # Day-month mentions: one pass, day and month captured together, so a
    # month that appears twice no longer yields duplicated results.
    for mention in _TEXTUAL_DATE_RE.finditer(text_lower):
        day = int(mention.group(1))
        month = _MONTH_NUMBERS[mention.group(2)[:3]]
        try:
            # Build the date directly in the reference year so that
            # "29 feb" is validated against that year's calendar.
            parsed = datetime(today.year, month, day)
        except ValueError:
            continue  # impossible day for that month (e.g. "31 june")
        add_date(parsed)

    # --- Extract Issue keywords ---
    # The keyword table uses the typographic apostrophe (U+2019); map the
    # ASCII apostrophe onto it so "doesn't work" typed on a keyboard matches.
    text_for_issues = text_lower.replace("'", '\u2019')
    found_issues = [kw for kw in _ISSUE_KEYWORDS if kw in text_for_issues]

    return {
        'products': sorted(extracted_products),
        'dates': date_entities,
        'issues': found_issues
    }


In [38]:
# Quick check: no full product name appears in the text, but the shared token
# "v2" is enough to return 'smartwatch v2' because products match on any common word.
extract_entities("Product  v2 is broken and I need it fixed by 5th october. It's been slow lately.")

{'products': ['smartwatch v2'],
 'dates': ['2025-10-05'],
 'issues': ['broken', 'slow']}

In [39]:
from sentence_transformers import SentenceTransformer
import numpy as np
import joblib
import string
import nltk
import pandas as pd
from textblob import TextBlob
from scipy.sparse import hstack

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler

# === Download NLTK Resources ===
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on
# NLTK >= 3.8.2; without it preprocess_text raises LookupError on a fresh
# environment. 'punkt' is kept for older NLTK releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yasha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yasha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yasha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [40]:
# NOTE: joblib.load unpickles its input, which can execute arbitrary code;
# only load artefacts from ./models that you produced or otherwise trust.

# --- Urgency pipeline: sentence encoder + classifier (per the file names) ---
sbert_model = joblib.load('./models/sbert_model.pkl')
clf = joblib.load('./models/xgboost_urgency_classifier.pkl')

# --- Issue-type pipeline: vectorizer, numeric scaler, model, label encoder ---
tfidf = joblib.load("./models/issue_vectorizer.pkl")
scaler = joblib.load("./models/issue_scaler.pkl")
model = joblib.load("./models/issue_model.pkl")
encoder = joblib.load("./models/issue_type_encoder.pkl")

In [41]:

## urgency prediction function

def urgency_score(text):
    """Count how many urgency cue phrases occur in ``text``.

    Matching is a case-insensitive substring test, and each phrase counts
    at most once, so the result ranges from 0 to the number of phrases.
    """
    cue_phrases = (
        'urgent', 'asap', 'immediately', 'not working', 'waiting',
        'help', 'escalated', 'delay', 'issue', 'problem',
    )
    lowered = text.lower()
    hits = 0
    for phrase in cue_phrases:
        if phrase in lowered:
            hits += 1
    return hits

# run the prediction on a new text
def predict_urgency(text):
    """Classify a ticket as ``"High"`` or ``"Not High"`` urgency.

    Three or more urgency keyword hits short-circuit to ``"High"``.
    Otherwise the text embedding from ``sbert_model`` is joined with the
    keyword count and passed to ``clf``; a predicted class of 1 means
    ``"High"``.
    """
    keyword_hits = urgency_score(text)

    # Rule-based shortcut: enough explicit cues, no model call needed.
    if keyword_hits >= 3:
        return "High"

    # presumably encode() returns a (1, dim) array for a one-item list,
    # so the count is appended as an extra column -- TODO confirm
    embedding = sbert_model.encode([text])
    hit_column = np.array([[keyword_hits]])
    model_input = np.hstack([embedding, hit_column])

    predicted_class = clf.predict(model_input)[0]
    if predicted_class == 1:
        return "High"
    return "Not High"

In [42]:
# issue_type_predictor



# === Define Text Preprocessing Functions ===
def clean_text(text):
    """Normalise raw text for the issue-type vectorizer.

    The input is converted to ``str`` and lower-cased, digit runs and
    ASCII punctuation are removed, and whitespace is collapsed to single
    spaces with the ends trimmed.
    """
    lowered = str(text).lower()
    without_digits = re.sub(r'\d+', '', lowered)
    # string.punctuation is ASCII-only, so typographic quotes survive.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_digits.translate(punctuation_table)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# Shared NLTK helpers, built once and reused by every preprocess_text call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean, tokenize, drop English stop words and lemmatize ``text``.

    Returns the surviving lemmas joined by single spaces.
    """
    lemmas = []
    for token in nltk.word_tokenize(clean_text(text)):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# === Prediction Function ===
def predict_issue_type(ticket_text):
    """Predict the issue-type label for a single ticket.

    The cleaned text is vectorized with ``tfidf``. The raw text's
    character length and sentiment polarity are scaled with ``scaler``.
    Both feature groups are stacked side by side, scored by ``model``,
    and the prediction is decoded to a label through ``encoder``.
    """
    # Text features come from the preprocessed ticket ...
    text_features = tfidf.transform([preprocess_text(ticket_text)])

    # ... while the numeric side features are computed on the raw text.
    raw_numeric = [[len(ticket_text), get_sentiment(ticket_text)]]
    numeric_features = scaler.transform(raw_numeric)

    combined = hstack([text_features, numeric_features])
    encoded_prediction = model.predict(combined)
    return encoder.inverse_transform(encoded_prediction)[0]




In [43]:
def analyze_ticket(text):
    """Run the full ticket pipeline on ``text``.

    Returns a dict with the predicted ``issue_type``, the
    ``urgency_level`` and the extracted ``entities``.
    """
    # Dict entries are evaluated top to bottom: issue type, urgency, entities.
    return {
        "issue_type": predict_issue_type(text),
        "urgency_level": predict_urgency(text),
        "entities": extract_entities(text),
    }


In [46]:
# End-to-end demo. The text contains three urgency keywords ('not working',
# 'help', 'issue'), so predict_urgency returns "High" from its keyword
# shortcut without calling the classifier.
analyze_ticket("Today The smartwatch is not working and we need immediate help with this issue.")



{'issue_type': 'Product Defect',
 'urgency_level': 'High',
 'entities': {'products': ['smartwatch v2'],
  'dates': ['2025-06-16'],
  'issues': ['not working', 'issue']}}