# Issue-Type Classification Model Using Logistic Regression

#### We start by loading the CSV data file and inspecting the data

In [33]:
import pandas as pd

# Location of the raw support-ticket export
csv_path = "ai_dev_assignment_tickets_complex_1000.csv"
df = pd.read_csv(csv_path)

# Report the (rows, columns) tuple, then preview the first records
print("Data shape:", df.shape)
df.head()


Data shape: (1000, 5)


Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
1,2,Can you tell me more about the UltraClean Vacu...,General Inquiry,,UltraClean Vacuum
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
4,5,Order #30903 for Vision LED TV is 13 days late...,Late Delivery,,Vision LED TV


In [34]:
# Count the null entries in each column
df.isna().sum()


ticket_id         0
ticket_text      55
issue_type       76
urgency_level    52
product           0
dtype: int64

In [35]:
# Count ticket_text values that repeat an earlier row (the first occurrence is not counted)
duplicates = df['ticket_text'].duplicated().sum()
print(f"Duplicate rows based on ticket_text: {duplicates}")


Duplicate rows based on ticket_text: 290


#### Starting Data Cleaning Process!

In [36]:
# One pass over the frame:
#   1. drop rows that lack the text or either label,
#   2. keep only the first row for each distinct ticket_text,
#   3. renumber the index from 0.
df = (
    df.dropna(subset=['ticket_text', 'issue_type', 'urgency_level'])
      .drop_duplicates(subset='ticket_text')
      .reset_index(drop=True)
)

# Sanity report: remaining shape and per-column null counts
print("After cleaning:")
print("Shape:", df.shape)
print("Missing:\n", df.isnull().sum())


After cleaning:
Shape: (629, 5)
Missing:
 ticket_id        0
ticket_text      0
issue_type       0
urgency_level    0
product          0
dtype: int64


#### Starting Data pre-processing!

In [37]:
import re
import string

# Function to clean the text
def clean_text(text):
    """Normalise a raw ticket string for downstream tokenisation.

    The value is lower-cased, every character listed in
    ``string.punctuation`` is removed, and runs of whitespace are collapsed
    to a single space with leading/trailing whitespace stripped.

    Args:
        text: Raw ticket text. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text, or ``""`` for a missing value.
    """
    # A missing value would otherwise be stringified to "none"/"nan" and end
    # up in the features as if it were a real word. NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub(r'\s+', ' ', text).strip()

# Derive the normalised column element-wise from the raw ticket text
df['clean_text'] = df['ticket_text'].map(clean_text)

# Show raw and cleaned text side by side for the first rows
df.loc[:, ['ticket_text', 'clean_text']].head()


Unnamed: 0,ticket_text,clean_text
0,Payment issue for my SmartWatch V2. I was unde...,payment issue for my smartwatch v2 i was under...
1,I ordered SoundWave 300 but got EcoBreeze AC i...,i ordered soundwave 300 but got ecobreeze ac i...
2,Facing installation issue with PhotoSnap Cam. ...,facing installation issue with photosnap cam s...
3,Can you tell me more about the PhotoSnap Cam w...,can you tell me more about the photosnap cam w...
4,is malfunction. It stopped working after just...,is malfunction it stopped working after just 7...


In [38]:
import nltk
# Fetch the NLTK resources used by the preprocessing step.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases load the tokenizer data used by word_tokenize from that package;
# without it word_tokenize raises LookupError on those versions.
# quiet=True suppresses the download log, which otherwise writes the local
# nltk_data path into the saved notebook output.
NLTK_RESOURCES = ['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4']
for resource in NLTK_RESOURCES:
    nltk.download(resource, quiet=True)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yasha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yasha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yasha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yasha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [39]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Shared helpers, built once and reused by preprocess_text for every ticket
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # a set gives constant-time membership tests

# Preprocessing function
def preprocess_text(text):
    """Tokenise *text* and return its lemmatised content words.

    A token is kept only if it is purely alphabetic and is not in
    ``stop_words``; each surviving token is passed through
    ``lemmatizer.lemmatize``.

    Args:
        text: Input text; converted with ``str`` before tokenising.

    Returns:
        str: The kept lemmas joined by single spaces.
    """
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(str(text))
        if token.isalpha() and token not in stop_words
    )
    return ' '.join(lemmas)

# Tokenise, filter and lemmatise every cleaned ticket
df['processed_text'] = df['clean_text'].map(preprocess_text)

# Compare the raw ticket with its processed form for the first rows
df.loc[:, ['ticket_text', 'processed_text']].head()


Unnamed: 0,ticket_text,processed_text
0,Payment issue for my SmartWatch V2. I was unde...,payment issue smartwatch underbilled order
1,I ordered SoundWave 300 but got EcoBreeze AC i...,ordered soundwave got ecobreeze ac instead ord...
2,Facing installation issue with PhotoSnap Cam. ...,facing installation issue photosnap cam setup ...
3,Can you tell me more about the PhotoSnap Cam w...,tell photosnap cam warranty also available red
4,is malfunction. It stopped working after just...,malfunction stopped working day


In [40]:
# Count rows whose processed_text repeats an earlier row
print("Duplicates in processed_text:", df.duplicated(subset='processed_text').sum())


Duplicates in processed_text: 172


In [41]:
# Keep only the first row for each distinct processed_text
is_repeat = df['processed_text'].duplicated()
df = df.loc[~is_repeat]


In [42]:
# Class distribution of the target label before balancing
issue_counts = df['issue_type'].value_counts()
print(issue_counts)


issue_type
Product Defect        98
Billing Problem       87
Wrong Item            76
General Inquiry       72
Late Delivery         53
Installation Issue    36
Account Access        35
Name: count, dtype: int64


### The classes are imbalanced, so we balance the data by downsampling every class to the size of the smallest one

In [None]:
from sklearn.utils import resample

# Size of the rarest class; every class is cut down to this many rows
min_samples = df['issue_type'].value_counts().min()

# One without-replacement sample of min_samples rows per label
df_list = [
    resample(
        df[df['issue_type'] == label],
        replace=False,
        n_samples=min_samples,
        random_state=42,
    )
    for label in df['issue_type'].unique()
]

# Stack the per-class samples and shuffle the row order
df = pd.concat(df_list).sample(frac=1, random_state=42)


In [44]:
# Class distribution of the target label after balancing
balanced_counts = df['issue_type'].value_counts()
print(balanced_counts)


issue_type
Billing Problem       35
Product Defect        35
Account Access        35
Late Delivery         35
General Inquiry       35
Installation Issue    35
Wrong Item            35
Name: count, dtype: int64


### Starting training of the model

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler


In [46]:
# Learn the label -> integer mapping, then attach the encoded target column
issue_encoder = LabelEncoder().fit(df['issue_type'])
df['issue_type_encoded'] = issue_encoder.transform(df['issue_type'])


In [47]:
# Hold out 20% of the rows for testing; stratifying on the encoded label
# keeps the class proportions the same in both splits
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['issue_type_encoded'],
    random_state=42,
)


In [48]:
# Learn the vocabulary and IDF weights from the training text only,
# then reuse them unchanged on the test text
tfidf = TfidfVectorizer(max_features=1000)
train_text = df_train['processed_text'].astype(str)
test_text = df_test['processed_text'].astype(str)
X_train_tfidf = tfidf.fit_transform(train_text)
X_test_tfidf = tfidf.transform(test_text)


In [49]:
# Numerical feature: character count of the raw ticket (0 when it is missing)
def _char_count(value):
    """Return the length of str(value), or 0 when value is null."""
    return len(str(value)) if pd.notnull(value) else 0

df_train['ticket_length'] = df_train['ticket_text'].apply(_char_count)
df_test['ticket_length'] = df_test['ticket_text'].apply(_char_count)


In [50]:
# Sentiment analysis

from textblob import TextBlob
def get_sentiment(text):
    """Return the TextBlob polarity of *text*, or 0 for blank input.

    Args:
        text: Input value; converted with ``str`` first.

    Returns:
        ``TextBlob(text).sentiment.polarity`` when the text contains any
        non-whitespace character, otherwise ``0``.
    """
    text = str(text)
    # Blank text short-circuits so TextBlob is never invoked on it
    if not text.strip():
        return 0
    return TextBlob(text).sentiment.polarity


In [51]:
# Attach a polarity score to every ticket in both splits
for split in (df_train, df_test):
    split['sentiment_score'] = split['ticket_text'].apply(get_sentiment)

# Standardise the numeric features using statistics learned from the
# training split only, then apply the same transform to the test split
numeric_cols = ['ticket_length', 'sentiment_score']
scaler = StandardScaler()
X_train_num = scaler.fit_transform(df_train[numeric_cols])
X_test_num = scaler.transform(df_test[numeric_cols])


In [52]:
# Targets: the integer-encoded issue type of each split
y_train = df_train['issue_type_encoded']
y_test = df_test['issue_type_encoded']

# Features: TF-IDF columns followed by the scaled numeric columns
X_train = hstack((X_train_tfidf, X_train_num))
X_test = hstack((X_test_tfidf, X_test_num))


In [53]:
# Fit the classifier on the combined training features.
# fit() returns the estimator itself, so it can be bound in one step;
# the bare name on the last line displays the fitted model.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [54]:
# Predict on the held-out split and print per-class precision / recall / F1,
# labelling the rows with the original issue-type names
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, target_names=issue_encoder.classes_)
print(report)


                    precision    recall  f1-score   support

    Account Access       0.88      1.00      0.93         7
   Billing Problem       1.00      1.00      1.00         7
   General Inquiry       1.00      0.86      0.92         7
Installation Issue       1.00      1.00      1.00         7
     Late Delivery       1.00      1.00      1.00         7
    Product Defect       1.00      1.00      1.00         7
        Wrong Item       1.00      1.00      1.00         7

          accuracy                           0.98        49
         macro avg       0.98      0.98      0.98        49
      weighted avg       0.98      0.98      0.98        49



In [55]:
# Cross-validation

from sklearn.model_selection import cross_val_score
# NOTE: X_train was built with a TF-IDF vectorizer and a scaler that were
# fitted on the whole training split, so every validation fold has already
# influenced the features; treat this score as an optimistic estimate.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='f1_weighted', cv=5)
print("CV F1 Score:", scores.mean())



CV F1 Score: 0.9581841491841491


In [56]:
# save the model and vectorizer
import joblib
import os

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing; exist_ok=True keeps re-runs from failing.
os.makedirs('./models', exist_ok=True)

# Persist everything needed to rebuild the feature matrix and decode
# predictions: classifier, TF-IDF vectorizer, label encoder and scaler.
joblib.dump(model, './models/issue_model.pkl')
joblib.dump(tfidf, './models/issue_vectorizer.pkl')
joblib.dump(issue_encoder, './models/issue_type_encoder.pkl')
joblib.dump(scaler, './models/issue_scaler.pkl')

['./models/issue_scaler.pkl']