In [2]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
df = pd.read_csv(r'artifacts\customer_support_data.csv')

In [4]:
df.head()

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   flags        26872 non-null  object
 1   instruction  26872 non-null  object
 2   category     26872 non-null  object
 3   intent       26872 non-null  object
 4   response     26872 non-null  object
dtypes: object(5)
memory usage: 1.0+ MB
None


In [6]:
# Check for missing values
print("Missing values:\n", df.isnull().sum())

Missing values:
 flags          0
instruction    0
category       0
intent         0
response       0
dtype: int64


In [7]:
# Display unique categories
print("\nUnique Categories:\n", df["category"].value_counts())


Unique Categories:
 category
ACCOUNT         5986
ORDER           3988
REFUND          2992
INVOICE         1999
CONTACT         1999
PAYMENT         1998
FEEDBACK        1997
DELIVERY        1994
SHIPPING        1970
SUBSCRIPTION     999
CANCEL           950
Name: count, dtype: int64


In [8]:
# Display unique intents
print("\nUnique Intents:\n", df["intent"].value_counts())


Unique Intents:
 intent
edit_account                1000
switch_account              1000
check_invoice               1000
complaint                   1000
contact_customer_service    1000
delivery_period              999
registration_problems        999
check_payment_methods        999
contact_human_agent          999
payment_issue                999
newsletter_subscription      999
get_invoice                  999
place_order                  998
cancel_order                 998
track_refund                 998
change_order                 997
get_refund                   997
create_account               997
check_refund_policy          997
review                       997
set_up_shipping_address      997
delivery_options             995
delete_account               995
recover_password             995
track_order                  995
change_shipping_address      973
check_cancellation_fee       950
Name: count, dtype: int64


In [9]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.strip()  # Remove extra spaces
    return text

In [10]:
df['instruction'] = df['instruction'].apply(clean_text)

In [23]:
def output_data_category(df):
    X_train, X_test, y_train_category, y_test_category = train_test_split(
        df["instruction"], df["category"], 
        test_size=0.2, random_state=42, stratify=df["category"]
    )

    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(X_train)  # Fit only on training data
    X_test = vectorizer.transform(X_test)  # Transform test data
    
    return X_train, X_test, y_train_category, y_test_category, vectorizer


def output_data_intent(self):
 
        X_train, X_test, y_train_intent, y_test_intent = train_test_split(
            df["instruction"], df["intent"], 
            test_size=0.2, random_state=42, stratify=df["intent"]
        )

        vectorizer_intent = TfidfVectorizer(max_features=5000)
        X_train = vectorizer_intent.fit_transform(X_train)  # Fit only on training data
        X_test = vectorizer_intent.transform(X_test)  # Transform test data
    
        return X_train, X_test, y_train_intent, y_test_intent

In [24]:
# Train-test split for category
X_train_category, X_test_category, y_train_category, y_test_category, vectorizer_category = output_data_category(df)

In [15]:
# Train-test split for intent
X_train_intent, X_test_intent, y_train_intent, y_test_intent = output_data_intent(df)

In [16]:
print("Shape of X_train:", X_train.shape)

Shape of X_train: (21497, 2523)


In [17]:
print("Shape of X_test:", X_test.shape)

Shape of X_test: (5375, 2523)


In [22]:
# Random Forest for Category
category_model = RandomForestClassifier(class_weight='balanced', random_state=42)
category_model.fit(X_train_category, y_train_category)
category_preds = category_model.predict(X_test_category)

# Logistic Regression for Intent
intent_model = RandomForestClassifier(class_weight='balanced', random_state=42)
intent_model.fit(X_train_intent, y_train_intent)
intent_preds = intent_model.predict(X_test_intent)

# Evaluate Models
print("Category Classification Report:\n", classification_report(y_test_category, category_preds))
print("Intent Classification Report:\n", classification_report(y_test_intent, intent_preds))

# Accuracy Scores
print(f"Category Accuracy: {accuracy_score(y_test_category, category_preds):.2f}")
print(f"Intent Accuracy: {accuracy_score(y_test_intent, intent_preds):.2f}")

Category Classification Report:
               precision    recall  f1-score   support

     ACCOUNT       0.99      1.00      0.99      1197
      CANCEL       0.99      1.00      1.00       190
     CONTACT       1.00      1.00      1.00       400
    DELIVERY       0.99      1.00      0.99       399
    FEEDBACK       1.00      1.00      1.00       399
     INVOICE       1.00      1.00      1.00       400
       ORDER       1.00      1.00      1.00       798
     PAYMENT       1.00      0.98      0.99       400
      REFUND       1.00      1.00      1.00       598
    SHIPPING       1.00      0.97      0.99       394
SUBSCRIPTION       1.00      0.99      1.00       200

    accuracy                           1.00      5375
   macro avg       1.00      0.99      1.00      5375
weighted avg       1.00      1.00      1.00      5375

Intent Classification Report:
                           precision    recall  f1-score   support

            cancel_order       0.98      0.99      0.99 

for Category --> Logistic Regression works!

In [30]:
#for Intent
#Random Forest
intent_model = RandomForestClassifier(class_weight='balanced', random_state=42)
intent_model.fit(X_train, y_train_intent)
intent_preds = intent_model.predict(X_test)

In [31]:
print("Intent Classification Report:\n", classification_report(y_test_intent, intent_preds))
print(f"Intent Accuracy: {accuracy_score(y_test_intent, intent_preds):.2f}")

Intent Classification Report:
                           precision    recall  f1-score   support

            cancel_order       0.98      0.98      0.98       200
            change_order       0.99      0.99      0.99       199
 change_shipping_address       0.99      0.99      0.99       195
  check_cancellation_fee       0.99      1.00      1.00       190
           check_invoice       0.99      0.96      0.98       200
   check_payment_methods       1.00      1.00      1.00       200
     check_refund_policy       1.00      0.99      1.00       199
               complaint       1.00      1.00      1.00       200
contact_customer_service       1.00      0.99      0.99       200
     contact_human_agent       0.99      0.99      0.99       200
          create_account       0.99      0.98      0.98       199
          delete_account       0.98      0.98      0.98       199
        delivery_options       0.99      1.00      1.00       199
         delivery_period       1.00      0.9

In [27]:
s = "Hi, I would like to cancel my order"

val = vectorizer_category.transform([s])
category_model.predict(val)

array(['ORDER'], dtype=object)