In [23]:
!pip install nlpaug



In [24]:
import pandas as pd
import os


# Read the labelled questions, then show how many rows carry each label.
df = pd.read_csv(filepath_or_buffer='/content/data_craft.csv')
df.loc[:, 'Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Out of Domain,48
Order Status,40
Delivery Time,40
Shipping Issues,40


In [25]:
# Show the raw frame as loaded (Question / Label columns) before any cleaning.
df

Unnamed: 0,Question,Label
0,Where is my order?,Order Status
1,Has my order been shipped?,Order Status
2,Can you tell me the current status of my order?,Order Status
3,Is my order in transit?,Order Status
4,Has my order been dispatched yet?,Order Status
...,...,...
163,Can you please tell me weather today?,Out of Domain
164,What is the capital of Vietnam?,Out of Domain
165,How many Distinct Vietnam have?,Out of Domain
166,I want to buy a new laptop,Out of Domain


In [26]:
import re
def remove_special_character(text: str) -> str:
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all non-matching characters deleted (nothing is inserted
        in their place, so surrounding characters become adjacent).
    """
    return re.sub(pattern=r'[^a-zA-Z0-9\s]', repl='', string=text)

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``Question`` column is normalised.

    Each question is lower-cased and stripped of every character that is not an
    ASCII letter, digit or whitespace (the same rule as
    ``remove_special_character``).

    The work is done with vectorised string operations on a copy, so:
      * the caller's frame is left untouched (re-running a cell is safe), and
      * rows are handled by position, so a frame with repeated index labels no
        longer has one row's text written over all rows sharing that label.

    Args:
        df: Frame with a string ``Question`` column. Other columns are carried
            over unchanged.

    Returns:
        A new DataFrame with the cleaned ``Question`` column.
    """
    cleaned = df.copy()
    cleaned['Question'] = (
        cleaned['Question']
        .str.lower()
        # Same character class as remove_special_character().
        .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    )
    return cleaned

def preprocess_sentence(text: str) -> str:
    """Normalise a single sentence: lower-case it, then drop special characters.

    Args:
        text: The raw sentence.

    Returns:
        The lower-cased sentence after ``remove_special_character`` has been
        applied to it.
    """
    return remove_special_character(text.lower())


# Normalise the Question column (lower-case, special characters removed) and
# rebind `df` to the result; every later cell works on this cleaned frame.
df = preprocess(df)
# Show the cleaned frame.
df


Unnamed: 0,Question,Label
0,where is my order,Order Status
1,has my order been shipped,Order Status
2,can you tell me the current status of my order,Order Status
3,is my order in transit,Order Status
4,has my order been dispatched yet,Order Status
...,...,...
163,can you please tell me weather today,Out of Domain
164,what is the capital of vietnam,Out of Domain
165,how many distinct vietnam have,Out of Domain
166,i want to buy a new laptop,Out of Domain


In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, using a fixed seed.
# NOTE(review): no `stratify=` is passed, so label proportions can differ
# between the two parts; the counts below are for the training part.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df.loc[:, 'Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Out of Domain,41
Delivery Time,33
Order Status,30
Shipping Issues,30


In [28]:
import nlpaug.augmenter.word as naw
import torch
def _first_augmentation(augmenter, text: str) -> str:
    """Run ``augmenter`` on ``text`` and return exactly one augmented string.

    ``augment`` is handled defensively because its return type is not uniform:
      * a list of strings -> the first element is returned;
      * a plain string    -> returned as is (indexing it with ``[0]`` would
        yield only its first character);
      * an empty/None result -> the original ``text`` is returned instead of
        raising ``IndexError``.
    NOTE(review): nlpaug's release notes say `augment` returns a list since
    1.1.11; the install is unpinned, so confirm against the installed version.
    """
    result = augmenter.augment(text)
    if isinstance(result, str):
        return result
    if not result:
        return text
    return result[0]


def synonum_aug(text: str) -> str:
    """Return ``text`` with words substituted by WordNet synonyms."""
    return _first_augmentation(naw.SynonymAug(aug_src='wordnet'), text)


def contextual_aug_bert(text: str) -> str:
    """Return ``text`` with words substituted using ``bert-base-uncased``."""
    augmenter = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
    return _first_augmentation(augmenter, text)


def contextual_aug_roberta(text: str) -> str:
    """Return ``text`` with words substituted using ``roberta-base``."""
    augmenter = naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute")
    return _first_augmentation(augmenter, text)


def random_swap(text: str) -> str:
    """Return ``text`` with words randomly swapped."""
    return _first_augmentation(naw.RandomWordAug(action="swap"), text)


import random

import numpy as np

# Seed the global RNGs before augmenting so that a fresh "Restart & Run All"
# has a chance of reproducing the same augmented set (the train/test split
# already uses a fixed seed). Best effort: which RNG each augmenter draws from
# is internal to nlpaug.
AUGMENT_SEED = 42
random.seed(AUGMENT_SEED)
np.random.seed(AUGMENT_SEED)
torch.manual_seed(AUGMENT_SEED)

# Applied in this order to every training question; the untouched original
# question is always kept as well.
AUGMENTERS = (synonum_aug, contextual_aug_bert, contextual_aug_roberta, random_swap)

augment_questions = []
augment_labels = []

for question, label in zip(train_df['Question'], train_df['Label']):
    variants = [question] + [augment(question) for augment in AUGMENTERS]
    augment_questions.extend(variants)
    # Every variant inherits the label of the question it came from.
    augment_labels.extend([label] * len(variants))

# Build the DataFrame from the augmented lists.
augmented_train_df = pd.DataFrame({
    'Question': augment_questions,
    'Label': augment_labels
})

# One original plus one row per augmenter for each training row.
assert len(augmented_train_df) == len(train_df) * (1 + len(AUGMENTERS))



In [29]:
# Show the augmented training frame: each original question followed by its
# augmented variants, all sharing the original label.
augmented_train_df

Unnamed: 0,Question,Label
0,is my order still in the warehouse,Order Status
1,be my order still in the storage warehouse,Order Status
2,to their order still in the line,Order Status
3,is one order still on this warehouse,Order Status
4,my is order in still warehouse the,Order Status
...,...,...
665,my package was delivered to the wrong place ho...,Shipping Issues
666,my package be save to the wrong place how pott...,Shipping Issues
667,my package is addressed to the manning family ...,Shipping Issues
668,my package was delivered here the wrong place ...,Shipping Issues


In [42]:
# Save the augmented training set to a CSV file (only this frame is written;
# the test split is not included and nothing is concatenated here).
augmented_train_df.to_csv('augmented_train_df.csv', index=False)

In [30]:
# Features are the question texts and targets are the label names, both as
# strings: train comes from the augmented frame, test from the held-out split.
X_train, y_train = (augmented_train_df[column].astype(str) for column in ('Question', 'Label'))
X_test, y_test = (test_df[column].astype(str) for column in ('Question', 'Label'))


In [31]:
# Show the training texts (the Question column of the augmented frame, as str).
X_train

Unnamed: 0,Question
0,is my order still in the warehouse
1,be my order still in the storage warehouse
2,to their order still in the line
3,is one order still on this warehouse
4,my is order in still warehouse the
...,...
665,my package was delivered to the wrong place ho...
666,my package be save to the wrong place how pott...
667,my package is addressed to the manning family ...
668,my package was delivered here the wrong place ...


In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression


# Encode the label names as integers. The encoder is fitted on the label
# columns of the source frames rather than on `y_train`/`y_test` themselves:
# those two names are rebound to the encoded arrays below, so fitting on them
# would, on a second run of this cell, fit the encoder on integers and lose the
# class names used for reporting and for `inverse_transform`.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(augmented_train_df['Label'].astype(str))
y_test = label_encoder.transform(test_df['Label'].astype(str))

# Bag of uni- and bi-grams with English stop words removed; the vocabulary is
# learned from the training texts only.
vectorizer = CountVectorizer(ngram_range=(1,2), stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train_vec, y_train)


y_pred_lr = lr_classifier.predict(X_test_vec)

# Evaluate on the held-out test split, reporting per-class metrics by name.
print("perfomance:")
target_names = [str(cls) for cls in label_encoder.classes_]
print(classification_report(y_test, y_pred_lr, target_names=target_names))



perfomance:
                 precision    recall  f1-score   support

  Delivery Time       0.83      0.71      0.77         7
   Order Status       0.71      1.00      0.83        10
  Out of Domain       1.00      1.00      1.00         7
Shipping Issues       0.86      0.60      0.71        10

       accuracy                           0.82        34
      macro avg       0.85      0.83      0.83        34
   weighted avg       0.84      0.82      0.82        34



In [33]:


from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Same uni-/bi-gram bag of words as before but without stop-word removal,
# chained with a liblinear logistic regression in a single pipeline.
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 2))
pipeline_steps = [
    ('vectorizer', vectorizer),
    ('classifier', LogisticRegression(solver='liblinear')),
]
logregress_pipeline = Pipeline(steps=pipeline_steps)
logregress_pipeline.fit(X_train, y_train);

# Score the pipeline on the raw test texts; it vectorises them itself.
y_pred = logregress_pipeline.predict(X_test)
target_names = list(map(str, label_encoder.classes_))
print("perfomance:")
print(classification_report(y_test, y_pred, target_names=target_names))


perfomance:
                 precision    recall  f1-score   support

  Delivery Time       0.86      0.86      0.86         7
   Order Status       0.83      1.00      0.91        10
  Out of Domain       1.00      0.86      0.92         7
Shipping Issues       0.89      0.80      0.84        10

       accuracy                           0.88        34
      macro avg       0.89      0.88      0.88        34
   weighted avg       0.89      0.88      0.88        34



In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF features feeding a logistic regression, both with default settings.
tfidf_steps = [('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())]
pipeline_aug = Pipeline(steps=tfidf_steps)
pipeline_aug.fit(X_train, y_train);

# Evaluate on the held-out test texts. `target_names` is the list of class
# names built in an earlier cell.
y_pred = pipeline_aug.predict(X_test)
tfidf_report = classification_report(y_test, y_pred, target_names=target_names)
print("perfomance:")
print(tfidf_report)


perfomance:
                 precision    recall  f1-score   support

  Delivery Time       0.83      0.71      0.77         7
   Order Status       0.90      0.90      0.90        10
  Out of Domain       1.00      0.86      0.92         7
Shipping Issues       0.75      0.90      0.82        10

       accuracy                           0.85        34
      macro avg       0.87      0.84      0.85        34
   weighted avg       0.86      0.85      0.85        34



In [35]:
# Try the TF-IDF pipeline on two unseen questions: print the encoded class ids
# first, then the label names they decode to.
new_questions = ["Is my order come today?", "When will my order arrive?"]
predicted_labels = pipeline_aug.predict(new_questions)
for shown in (predicted_labels, label_encoder.inverse_transform(predicted_labels)):
    print(shown)

[1 0]
['Order Status' 'Delivery Time']


In [37]:
!pip install fasttext



In [38]:
#create a fasttext version
import fasttext
import fasttext.util
import numpy as np

def _to_fasttext_frame(frame):
    """Return a copy of ``frame`` with a ``fasttext`` column of training lines.

    Spaces in ``Label`` are replaced by underscores so each label is a single
    token, and every line has the form ``__label__<Label> <Question>``.
    """
    out = frame.copy()
    out['Label'] = out['Label'].str.replace(' ', '_', regex=False)
    out['fasttext'] = '__label__' + out['Label'].astype(str) + ' ' + out['Question']
    return out


def _write_fasttext_lines(lines, path):
    """Write one example per line, verbatim.

    A plain text writer is used instead of ``Series.to_csv`` because CSV output
    wraps any value containing a comma or quote in double quotes; such a line
    would no longer start with ``__label__``. Augmented questions are not
    re-cleaned, so they may contain those characters. Whitespace runs are
    collapsed to single spaces so an example can never span several lines;
    missing values are skipped.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        for line in lines.dropna():
            handle.write(' '.join(str(line).split()) + '\n')


fasttext_df = _to_fasttext_frame(augmented_train_df)
_write_fasttext_lines(fasttext_df['fasttext'], 'fasttext_train_data.txt')

fasttext_test_df = _to_fasttext_frame(test_df)
_write_fasttext_lines(fasttext_test_df['fasttext'], 'fasttext_test_data.txt')

In [39]:
# The train/test files are written with paths relative to the working
# directory, so read them back the same way instead of through a hard-coded
# '/content/...' prefix, which only matches when the notebook runs from /content.
FASTTEXT_TRAIN_PATH = 'fasttext_train_data.txt'
FASTTEXT_TEST_PATH = 'fasttext_test_data.txt'

fasttext_model = fasttext.train_supervised(
    input=FASTTEXT_TRAIN_PATH,
    epoch=50,
    lr=0.1,
    wordNgrams=3,   # use word n-grams up to length 3
    verbose=2,
    minCount=1      # keep words that occur only once
)

# `test` returns a 3-tuple that is reported below as
# (number of examples, precision, recall).
result = fasttext_model.test(FASTTEXT_TEST_PATH)
num_examples, precision, recall = result
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Num example test: {num_examples}")


Precision: 0.8529411764705882
Recall: 0.8529411764705882
Num example test: 34


In [40]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
#ref: https://huggingface.co/learn/nlp-course/chapter3/3

def compute_metrics(pred) -> dict:
    """Compute accuracy and weighted precision/recall/F1 for a prediction batch.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores); the predicted class is the
            argmax over the last axis of ``predictions``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    scores = {'accuracy': accuracy_score(y_true, y_hat)}
    # The fourth element returned (support) is not reported.
    weighted = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    scores.update(zip(('precision', 'recall', 'f1'), weighted[:3]))
    return scores


# Start again from the string columns: earlier cells rebound y_train / y_test
# to encoded arrays.
X_train, y_train = (augmented_train_df[column].astype(str) for column in ('Question', 'Label'))
X_test, y_test = (test_df[column].astype(str) for column in ('Question', 'Label'))

# Label name -> class id, numbered in order of first appearance in the
# training labels.
label_names = y_train.unique()
label_dict = dict(zip(label_names, range(len(label_names))))

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_dict),
)

y_train = y_train.map(label_dict)
y_test = y_test.map(label_dict)

#ref: https://huggingface.co/transformers/v3.2.0/custom_datasets.html
class CustomDataset(Dataset):
    """Dataset pairing texts with labels, tokenised one item at a time.

    ``text`` and ``label`` are accessed positionally through ``.iloc``.
    Tokenisation uses the module-level ``tokenizer`` and pads/truncates every
    item to 64 tokens.
    """

    def __init__(self, text, label):
        self.text, self.label = text, label

    def __len__(self):
        # The dataset size is the number of labels.
        return len(self.label)

    def __getitem__(self, idx):
        raw_text = self.text.iloc[idx]
        raw_label = self.label.iloc[idx]
        encoded = tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=64,
            return_tensors='pt',
        )
        sample = {}
        for name, tensor in encoded.items():
            # Drop the size-1 dimensions of the returned tensors.
            sample[name] = tensor.squeeze()
        sample['labels'] = torch.tensor(raw_label)
        return sample

train_dataset, test_dataset = CustomDataset(X_train, y_train), CustomDataset(X_test, y_test)

# Evaluate and checkpoint once per epoch; log every 10 steps, plus the first.
training_config = dict(
    output_dir='./results',
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    logging_first_step=True,
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)
training_args = TrainingArguments(**training_config)

# The held-out test split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train();

# Final evaluation on the same test split.
print("Performance on test set:")
eval_results = trainer.evaluate()
print(eval_results)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6645,0.70623,0.852941,0.914216,0.852941,0.864624
2,0.2165,0.303192,0.911765,0.914439,0.911765,0.911334
3,0.1071,0.212429,0.970588,0.973262,0.970588,0.970157


Performance on test set:


{'eval_loss': 0.212429478764534, 'eval_accuracy': 0.9705882352941176, 'eval_precision': 0.9732620320855616, 'eval_recall': 0.9705882352941176, 'eval_f1': 0.9701572936867054, 'eval_runtime': 0.2133, 'eval_samples_per_second': 159.421, 'eval_steps_per_second': 23.444, 'epoch': 3.0}


In [41]:
import torch
from transformers import DistilBertTokenizer

new_questions = ["Hello",
                 "Has my order come?",
                 "Which day my order will come?",
                 "Is my order shipped?",
                 "My order still in warehouse or somewhere?",
                 "I recieved a broken box",
                 "I think i put wrong address in order"]

# Apply the same normalisation the training questions went through, and feed
# the *cleaned* texts to the tokenizer. Previously the cleaned list was bound
# to a differently spelled name and the raw questions were tokenized instead.
cleaned_questions = [preprocess_sentence(question) for question in new_questions]

# Put the inputs on whatever device the model lives on instead of assuming a
# GPU, so the cell also runs on CPU-only runtimes.
inputs = tokenizer(
    cleaned_questions,
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=64,
).to(model.device)

# Predict without tracking gradients; the class is the argmax over the logits.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=1)

# Invert label_dict (name -> id) to turn predicted ids back into label names.
label_map = {v: k for k, v in label_dict.items()}
predicted_labels = [label_map[label.item()] for label in predicted_labels]
print(predicted_labels)


['Out of Domain', 'Order Status', 'Delivery Time', 'Order Status', 'Order Status', 'Shipping Issues', 'Shipping Issues']
