In [2]:
import pandas as pd
data = pd.read_csv("Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv")
data

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...
...,...,...,...,...,...
26867,BL,I am waiting for a rebate of {{Refund Amount}}...,REFUND,track_refund,Thank you for sharing your situation regarding...
26868,BIL,how to see if there is anything wrong with my ...,REFUND,track_refund,Ensuring the accuracy of your restitution is o...
26869,BLQZ,I'm waiting for a reimbjrsement of {{Currency ...,REFUND,track_refund,"Firstly, I genuinely understand the importance..."
26870,BL,I don't know what to do to see my reimbursemen...,REFUND,track_refund,I've understood you're unsure about how to che...


In [3]:
print("null columns:")
print(data.isnull().sum())
print()
print("column datatypes:")
print(data.dtypes)
print()
duplicate_count = data.duplicated(keep=False).sum()
print(f"Total number of duplicate rows: {duplicate_count}")

null columns:
flags          0
instruction    0
category       0
intent         0
response       0
dtype: int64

column datatypes:
flags          object
instruction    object
category       object
intent         object
response       object
dtype: object

Total number of duplicate rows: 0


In [3]:
print(data['intent'].unique())
print(data['category'].unique())

['cancel_order' 'change_order' 'change_shipping_address'
 'check_cancellation_fee' 'check_invoice' 'check_payment_methods'
 'check_refund_policy' 'complaint' 'contact_customer_service'
 'contact_human_agent' 'create_account' 'delete_account'
 'delivery_options' 'delivery_period' 'edit_account' 'get_invoice'
 'get_refund' 'newsletter_subscription' 'payment_issue' 'place_order'
 'recover_password' 'registration_problems' 'review'
 'set_up_shipping_address' 'switch_account' 'track_order' 'track_refund']
['ORDER' 'SHIPPING' 'CANCEL' 'INVOICE' 'PAYMENT' 'REFUND' 'FEEDBACK'
 'CONTACT' 'ACCOUNT' 'DELIVERY' 'SUBSCRIPTION']


In [4]:
data['category'] = pd.Categorical(data['category'])
data['intent'] = pd.Categorical(data['intent'])
print(data.dtypes)

flags            object
instruction      object
category       category
intent         category
response         object
dtype: object


In [5]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize_text(text):
    words = nltk.word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(stemmed_words), " ".join(lemmatized_words)


data['instruction_stemmed'], data['instruction_lemmatized'] = zip(*data['instruction'].apply(stem_and_lemmatize_text))
data['response_stemmed'], data['response_lemmatized'] = zip(*data['response'].apply(stem_and_lemmatize_text))

print(data[['instruction', 'instruction_stemmed', 'instruction_lemmatized', 'response', 'response_stemmed', 'response_lemmatized']].head())

[nltk_data] Downloading package punkt to /home/mk/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/mk/nltk_data...
[nltk_data] Downloading package punkt_tab to /home/mk/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                         instruction  \
0   question about cancelling order {{Order Number}}   
1  i have a question about cancelling oorder {{Or...   
2    i need help cancelling puchase {{Order Number}}   
3         I need to cancel purchase {{Order Number}}   
4  I cannot afford this order, cancel purchase {{...   

                                 instruction_stemmed  \
0   question about cancel order { { order number } }   
1  i have a question about cancel oorder { { orde...   
2     i need help cancel puchas { { order number } }   
3      i need to cancel purchas { { order number } }   
4  i can not afford thi order , cancel purchas { ...   

                              instruction_lemmatized  \
0  question about cancelling order { { Order Numb...   
1  i have a question about cancelling oorder { { ...   
2  i need help cancelling puchase { { Order Numbe...   
3     I need to cancel purchase { { Order Number } }   
4  I can not afford this order , cancel purcha

In [None]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()


all_text = data['instruction_lemmatized'] + ' ' + data['response_lemmatized']
vectorizer.fit(all_text)


instruction_tfidf = vectorizer.transform(data['instruction_lemmatized'])
response_tfidf = vectorizer.transform(data['response_lemmatized'])




print(vectorizer.get_feature_names_out())


print(instruction_tfidf.shape)

['00' '00004587345current' '00108' ... 'zero' 'zip' 'zone']
(26872, 6821)


In [7]:
from sklearn.model_selection import train_test_split

# Assuming 'intent' is your target variable and you have instruction_tfidf and response_tfidf
X = data[['instruction_lemmatized', 'response_lemmatized']]  # Using original lemmatized text for splitting
y = data['intent']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now apply TF-IDF transformation to the split data
X_train_tfidf = vectorizer.transform(X_train['instruction_lemmatized'] + ' ' + X_train['response_lemmatized'])
X_test_tfidf = vectorizer.transform(X_test['instruction_lemmatized'] + ' ' + X_test['response_lemmatized'])

# You have X_train_tfidf, X_test_tfidf, y_train, y_test for model training
print("X_train_tfidf shape:", X_train_tfidf.shape)
print("X_test_tfidf shape:", X_test_tfidf.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train_tfidf shape: (21497, 6821)
X_test_tfidf shape: (5375, 6821)
y_train shape: (21497,)
y_test shape: (5375,)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1. Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
nb_predictions = nb_classifier.predict(X_test_tfidf)

# Evaluate Naive Bayes
print("Naive Bayes Performance:")
print("Accuracy:", accuracy_score(y_test, nb_predictions))
print("Precision:", precision_score(y_test, nb_predictions, average='weighted'))
print("Recall:", recall_score(y_test, nb_predictions, average='weighted'))
print("F1-score:", f1_score(y_test, nb_predictions, average='weighted'))
print("\n")  # Add a newline for better readability

# 2. Logistic Regression
lr_classifier = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto')
lr_classifier.fit(X_train_tfidf, y_train)
lr_predictions = lr_classifier.predict(X_test_tfidf)

# Evaluate Logistic Regression
print("Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, lr_predictions))
print("Precision:", precision_score(y_test, lr_predictions, average='weighted'))
print("Recall:", recall_score(y_test, lr_predictions, average='weighted'))
print("F1-score:", f1_score(y_test, lr_predictions, average='weighted'))
print("\n")

# 3. Linear SVM
svm_classifier = LinearSVC(max_iter=10000)
svm_classifier.fit(X_train_tfidf, y_train)
svm_predictions = svm_classifier.predict(X_test_tfidf)

# Evaluate Linear SVM
print("Linear SVM Performance:")
print("Accuracy:", accuracy_score(y_test, svm_predictions))
print("Precision:", precision_score(y_test, svm_predictions, average='weighted'))
print("Recall:", recall_score(y_test, svm_predictions, average='weighted'))
print("F1-score:", f1_score(y_test, svm_predictions, average='weighted'))
print("\n")

# 4. Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)
rf_predictions = rf_classifier.predict(X_test_tfidf)

# Evaluate Random Forest
print("Random Forest Performance:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Precision:", precision_score(y_test, rf_predictions, average='weighted'))
print("Recall:", recall_score(y_test, rf_predictions, average='weighted'))
print("F1-score:", f1_score(y_test, rf_predictions, average='weighted'))

Naive Bayes Performance:
Accuracy: 0.9882790697674418
Precision: 0.9884474942207078
Recall: 0.9882790697674418
F1-score: 0.9882903649711775


Logistic Regression Performance:
Accuracy: 0.9973953488372093
Precision: 0.9974164582549064
Recall: 0.9973953488372093
F1-score: 0.9973956486030214






Linear SVM Performance:
Accuracy: 0.9992558139534884
Precision: 0.9992577800643507
Recall: 0.9992558139534884
F1-score: 0.9992559009041629


Random Forest Performance:
Accuracy: 0.9903255813953489
Precision: 0.9903932364909602
Recall: 0.9903255813953489
F1-score: 0.9903315281840581


In [9]:
import random

user_input = input("Enter your message: ")

def preprocess_text(text):
    # Apply the same preprocessing steps used for training data:
    # 1. Tokenization using nltk.word_tokenize
    # 2. Lemmatization using WordNetLemmatizer
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmatized_words)

preprocessed_input = preprocess_text(user_input)
# Transform the preprocessed input using the trained vectorizer
input_tfidf = vectorizer.transform([preprocessed_input])  # Note: [preprocessed_input] creates a list

# Predict the intent using the trained SVM model
predicted_intent = svm_classifier.predict(input_tfidf)[0]

# Find a response from your dataset with the predicted intent
possible_responses = data[data['intent'] == predicted_intent]['response'].tolist()

# Select a response randomly to avoid repetition
if possible_responses:  # Check if there are any responses for the intent
    response = random.choice(possible_responses)
else:
    response = "I'm sorry, I don't understand your request."  # Default response if no matching intent found

print("Response:", response)

Response: I've realized that you need assistance with canceling purchase {{Order Number}}. To initiate the cancellation process, please follow these steps:

1. Log Into Your Account: Access our platform by signing in to your {{Online Company Portal Info}}.
2. Navigate to Your Orders: Once logged in, head over to the '{{Online Order Interaction}}' or '{{Online Order Interaction}}' section.
3. Find Your Purchase: Within this section, locate the purchase associated with the order number {{Order Number}}.
4. Initiate Cancellation: Look for the option labeled '{{Online Order Interaction}}' linked to your purchase. Please select it to begin the cancellation process.
5. Complete Any Further Steps: The system might ask you a few questions to confirm or collect feedback. Please provide the necessary information as instructed.

If you encounter any difficulties or have further questions along the way, our dedicated team is here to assist you. Reach out to us during {{Customer Support Hours}} at 

In [10]:
# prompt: create a pkl flie of this model and export it

import pickle

# Assuming 'svm_classifier' is the trained model you want to pickle
model_to_pickle = svm_classifier

# Define the filename for the pickle file
filename = 'svm_model.pkl'

# Open the file in binary write mode
with open(filename, 'wb') as f:
    # Use pickle.dump to serialize the model and save it to the file
    pickle.dump(model_to_pickle, f)

print(f"Model saved to {filename}")

# To verify, you can load the model back
# with open(filename, 'rb') as f:
# prompt: create a pkl flie of this model and export it

import pickle

# Assuming 'svm_classifier' is the trained model you want to pickle
model_to_pickle = svm_classifier

# Define the filename for the pickle file
filename = 'svm_model.pkl'

# Open the file in binary write mode
with open(filename, 'wb') as f:
    # Use pickle.dump to serialize the model and save it to the file
    pickle.dump(model_to_pickle, f)

print(f"Model saved to {filename}")

# To verify, you can load the model back
# with open(filename, 'rb') as f:
#     loaded_model = pickle.load(f)
# print("Model loaded successfully.")

Model saved to svm_model.pkl
Model saved to svm_model.pkl
