In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [2]:
print(f"Step1: Data Preparation")

Step1: Data Preparation


In [3]:

print(f"Loading CLININC150 dataset")

clinic_150 = load_dataset("clinc_oos", 'imbalanced', split="train")

# for i in range(clinic_150.num_rows):
#     print(clinic_150[i])
    

Loading CLININC150 dataset


In [4]:
print(f"Define tokenizer")

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(data):
    return tokenizer(data["text"], padding='max_length', truncation=True, max_length=25)

tokenized_data = clinic_150.map(preprocess_data, batched=True)
print(tokenized_data)

Define tokenizer
Dataset({
    features: ['text', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10625
})


In [5]:
tokenized_data = tokenized_data.map(lambda examples: {'labels': examples['intent']}, batched=True)

In [6]:
print(f"Data Preparation Done")

Data Preparation Done


In [7]:
print(f"Step2: Model Definition")

Step2: Model Definition


In [8]:
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments


model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=150)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
print(f"Define training arguments")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
)


Define training arguments




In [10]:
print(f"Define trainer")

from datasets import DatasetDict

# Split dataset into training and evaluation
train_datset = DatasetDict({"train": tokenized_data, "eval": tokenized_data.select(range(100))})


trainer = Trainer(model=model, args=training_args, train_dataset=train_datset["train"], eval_dataset=train_datset["eval"])

trainer.train()

Define trainer


  0%|          | 0/3325 [00:00<?, ?it/s]

{'loss': 3.0937, 'grad_norm': 10.068410873413086, 'learning_rate': 4.24812030075188e-05, 'epoch': 0.75}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 1.104393482208252, 'eval_runtime': 0.5262, 'eval_samples_per_second': 190.038, 'eval_steps_per_second': 24.705, 'epoch': 1.0}
{'loss': 0.8489, 'grad_norm': 3.673288106918335, 'learning_rate': 3.49624060150376e-05, 'epoch': 1.5}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.11031027138233185, 'eval_runtime': 0.3501, 'eval_samples_per_second': 285.605, 'eval_steps_per_second': 37.129, 'epoch': 2.0}
{'loss': 0.2815, 'grad_norm': 0.4165792763233185, 'learning_rate': 2.7443609022556393e-05, 'epoch': 2.26}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.02683493308722973, 'eval_runtime': 0.3131, 'eval_samples_per_second': 319.388, 'eval_steps_per_second': 41.52, 'epoch': 3.0}
{'loss': 0.1003, 'grad_norm': 0.22343063354492188, 'learning_rate': 1.9924812030075188e-05, 'epoch': 3.01}
{'loss': 0.0444, 'grad_norm': 0.21801017224788666, 'learning_rate': 1.2406015037593984e-05, 'epoch': 3.76}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.013460145331919193, 'eval_runtime': 0.3127, 'eval_samples_per_second': 319.775, 'eval_steps_per_second': 41.571, 'epoch': 4.0}
{'loss': 0.025, 'grad_norm': 0.14227886497974396, 'learning_rate': 4.887218045112782e-06, 'epoch': 4.51}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.011172687634825706, 'eval_runtime': 0.3838, 'eval_samples_per_second': 260.577, 'eval_steps_per_second': 33.875, 'epoch': 5.0}
{'train_runtime': 635.9231, 'train_samples_per_second': 83.54, 'train_steps_per_second': 5.229, 'train_loss': 0.6627739338408736, 'epoch': 5.0}


TrainOutput(global_step=3325, training_loss=0.6627739338408736, metrics={'train_runtime': 635.9231, 'train_samples_per_second': 83.54, 'train_steps_per_second': 5.229, 'total_flos': 683415475312500.0, 'train_loss': 0.6627739338408736, 'epoch': 5.0})

In [11]:
print(f"Model Training Done")

Model Training Done


In [12]:
print(f"Adding Intent Disambiguation layer")

Adding Intent Disambiguation layer


In [13]:
def predict_intent(input_text, threshold = 0.7):
    
     # Tokenize the input
    imputs = tokenizer(input_text, return_tensors="pt", trubncation=True, padding=True)
    
    # Get the model outputs/predictions
    outputs = model(**inputs)
    probs = torch.nn.functional_softmax(outputs.logits, dim=-1)
    max_prob, pred_label = torch.max(probs, dim=-1)
    
    if max_prob < threshold:
        return "Ambiguous Intent", max_prob.item()
    
    return pred_label.item(), max_prob.item()

In [14]:
print(f"Intent clarification layer")

def clarify_intent():
    return "Can you provide more details about what you need?"

Intent clarification layer


In [15]:
def chatbot_interaction():
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("ChatBot: Goodbye!")
        return False
    
    # predict the user intent
    intent, confidence = predict_intent(user_input)
    if intent == "Ambiguous Intent":
        print(f"Chatbot: {clarify_intent()}")
    else:
        print(f"ChatBot: I understand your intent (ID: {intent}) with confidence {confidence:.2f}.")
        
        print(f"ChatBot: I think you are looking for {clinic_150.features['intent']['feature'].int2str(intent)}")
    return True

In [None]:
print("Chatbot is running! Type 'exit' to stop.")
keep_running = True
while keep_running:
    keep_running = chatbot_interaction()

Chatbot is running! Type 'exit' to stop.


In [None]:
I want to change my mail address
