In [5]:
from sagemaker import get_execution_role
role = get_execution_role()
role

'arn:aws:iam::372344071344:role/service-role/AmazonSageMaker-ExecutionRole-20250720T013634'

In [16]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import torch

# Load dataset
df = pd.read_csv('s3://algoworks-assistant-intent-training/training-data/algoworks_intent_training_data_20250720.csv')

# Encode labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['intent'])

# Save label mappings
id2label = {i: label for i, label in enumerate(le.classes_)}
label2id = {label: i for i, label in enumerate(le.classes_)}

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(df['utterance'], df['label'], test_size=0.1)
print(label2id)
# Tokenize
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

# Dataset class
class IntentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            key: torch.tensor(val[idx]) for key, val in self.encodings.items()
        } | {
            'labels': torch.tensor(self.labels[idx])
        }

train_dataset = IntentDataset(train_encodings, list(train_labels))
val_dataset = IntentDataset(val_encodings, list(val_labels))

# Load model and inject label mappings 🔥
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    # evaluation_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    logging_dir='./logs',
    logging_steps=5,
    save_strategy="epoch",
    # load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Optional accuracy metric
from sklearn.metrics import accuracy_score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



{'career_opportunities': 0, 'client_testimonials': 1, 'company_overview': 2, 'connect_to_agent': 3, 'contact_info': 4, 'feedback_or_complaint': 5, 'goodbye': 6, 'greeting': 7, 'industries_served': 8, 'mobile_app_development': 9, 'pricing_or_estimation': 10, 'process_or_methodology': 11, 'salesforce_services': 12, 'service_inquiry': 13, 'tech_stack_info': 14, 'voicemail': 15, 'web_app_development': 16}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [13]:
# Train
trainer.train()

Step,Training Loss
5,2.8484
10,2.7333
15,2.72
20,2.7733
25,2.5928
30,2.6634
35,2.5069
40,2.4688
45,2.4438
50,2.2925


TrainOutput(global_step=368, training_loss=0.820489254820606, metrics={'train_runtime': 488.2672, 'train_samples_per_second': 6.005, 'train_steps_per_second': 0.754, 'total_flos': 17452072510728.0, 'train_loss': 0.820489254820606, 'epoch': 4.0})

In [14]:
sample_utterances = [
    "Who are the leaders in algoworks",
    "where is the head office",
    "Hello",
    "any open roles?"
]

for text in sample_utterances:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    pred_id = outputs.logits.argmax().item()
    intent = le.inverse_transform([pred_id])[0]
    print(f"'{text}' => {intent}")

'Who are the leaders in algoworks' => company_overview
'where is the head office' => contact_info
'Hello' => greeting
'any open roles?' => career_opportunities


In [None]:
# Save final model 🔥 with intent names
model_dir = "trained-models"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
print("✅ Model saved with label mappings embedded.")
!tar czf algoworks_intent_model_20250720.tar.gz -C $model_dir .
!aws s3 cp algoworks_intent_model_20250720.tar.gz s3://algoworks-assistant-intent-training/trained-models/

✅ Model saved with label mappings embedded.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Completed 117.0 MiB/235.7 MiB (231.8 MiB/s) with 1 file(s) remaining

In [None]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

role = sagemaker.get_execution_role()

huggingface_model = HuggingFaceModel(
   model_data='s3://algoworks-assistant-intent-training/trained-models/algoworks_intent_model_20250720.tar.gz',
   role=role,
   transformers_version='4.26',
   pytorch_version='1.13',
   py_version='py39',
   env={
       'HF_TASK':'text-classification'
   }
)

predictor = huggingface_model.deploy(
   initial_instance_count=1,
   instance_type='ml.t2.medium'
)


--

In [17]:
response = predictor.predict({
    "inputs": "connect to agent"
})
print(response)

[{'label': 'bios_password_reset', 'score': 0.14260870218276978}]
