In [1]:
# Step 1: Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [2]:
# Step 2: Load Dataset (make sure CSV is in the same folder)
df = pd.read_csv("Customer_support_data.csv")

# Keep only relevant columns
df = df[['Customer Remarks', 'category']]

# Drop rows with missing or empty remarks
df = df.dropna()
df = df[df['Customer Remarks'].str.strip() != '']
df.head()


Unnamed: 0,Customer Remarks,category
7,Very good,Returns
8,Shopzilla app and it's all coustomer care serv...,Returns
11,Very bad,Order Related
17,Something,Order Related
19,All good,Order Related


In [3]:
# Step 3: Clean the text data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.strip()
    return text

df['cleaned'] = df['Customer Remarks'].apply(clean_text)
df.head()


Unnamed: 0,Customer Remarks,category,cleaned
7,Very good,Returns,very good
8,Shopzilla app and it's all coustomer care serv...,Returns,shopzilla app and its all coustomer care servi...
11,Very bad,Order Related,very bad
17,Something,Order Related,something
19,All good,Order Related,all good


In [4]:
# Step 4: Convert text into numbers using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X = tfidf.fit_transform(df['cleaned'])
y = df['category']

X.shape, y.shape


((28742, 3000), (28742,))

In [5]:
# Step 5: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Step 6: Train a Logistic Regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)


In [7]:
# Step 7: Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


                    precision    recall  f1-score   support

       App/website       0.00      0.00      0.00         3
      Cancellation       0.00      0.00      0.00       122
          Feedback       0.00      0.00      0.00       153
 Offers & Cashback       0.00      0.00      0.00        35
Onboarding related       0.00      0.00      0.00         6
     Order Related       0.46      0.19      0.27      1504
            Others       0.00      0.00      0.00         7
  Payments related       0.00      0.00      0.00       144
   Product Queries       0.50      0.01      0.02       261
    Refund Related       0.33      0.01      0.01       309
           Returns       0.55      0.94      0.69      3016
 Shopzilla Related       0.56      0.03      0.05       189

          accuracy                           0.54      5749
         macro avg       0.20      0.10      0.09      5749
      weighted avg       0.47      0.54      0.44      5749



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Step 8: Test your own customer remark
your_input = "My order is delayed, and I want to cancel it!"
your_input_clean = clean_text(your_input)
your_input_vector = tfidf.transform([your_input_clean])
predicted_category = model.predict(your_input_vector)

print("Predicted Issue Category:", predicted_category[0])


Predicted Issue Category: Order Related
