In [12]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from transformers import AutoTokenizer, TFBertModel
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from transformers import TFDistilBertModel

# Dataset

Already Generated by GEMINI (see intentGenerator.py)

In [2]:
def load_and_prepare_data(file_path):
    """Load and prepare the dataset."""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    df = pd.DataFrame(data)

    df['text'] = df['text'].str.strip()

    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['intent'])

    # Check class distribution
    print("\nClass distribution:")
    print(df['intent'].value_counts())

    return df, label_encoder

def create_tf_dataset(texts, labels, tokenizer, batch_size=16, is_training=True):
    """Create TensorFlow dataset."""
    # Tokenize texts
    encodings = tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="tf"
    )

    dataset = tf.data.Dataset.from_tensor_slices((
        {key: val for key, val in encodings.items()},
        labels
    ))

    if is_training:
        dataset = dataset.shuffle(1000)

    dataset = dataset.batch(batch_size)

    if is_training:
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset

In [3]:
file_path = "/content/jakarta_transport_intents_7354_20241214_021252.json"
df, label_encoder = load_and_prepare_data(file_path)


Class distribution:
intent
asking_for_direction      2745
service_recommendation    2485
analyzing_surroundings    2124
Name: count, dtype: int64


In [4]:
df.head()

Unnamed: 0,intent,text,label
0,asking_for_direction,What's the best way to get to the Gambir Stati...,1
1,asking_for_direction,Can you tell me how to reach the MRT station a...,1
2,asking_for_direction,I need directions to the nearest KRL station f...,1
3,asking_for_direction,Could you guide me on how to get to the TransJ...,1
4,asking_for_direction,I'm at the Dukuh Atas Station. How do I reach ...,1


# Model

In [5]:
def create_tf_dataset(texts, labels, tokenizer, batch_size=16, is_training=True):
    """Create TensorFlow dataset."""
    encodings = tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="tf"
    )

    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encodings),
        labels
    ))

    if is_training:
        dataset = dataset.shuffle(1000)

    dataset = dataset.batch(batch_size)

    if is_training:
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset

# Train and Evaluate

In [13]:
class DistillbertIntentModel(tf.keras.Model):
    def __init__(self, model_name, num_classes):
        super().__init__()
        self.bert = TFDistilBertModel.from_pretrained(model_name, output_hidden_states=True)

        # Make BERT trainable
        self.bert.trainable = True

        # Additional layers
        self.dense1 = tf.keras.layers.Dense(256, activation='gelu')
        self.dropout1 = tf.keras.layers.Dropout(0.1)
        self.dense2 = tf.keras.layers.Dense(128, activation='gelu')
        self.dropout2 = tf.keras.layers.Dropout(0.1)
        self.classifier = tf.keras.layers.Dense(num_classes)

    def call(self, inputs, training=False):
        outputs = self.bert(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            training=training
        )

        hidden_states = outputs.last_hidden_state

        pooled_output = tf.reduce_mean(hidden_states, axis=1)

        # Dense layers
        x = self.dense1(pooled_output)
        x = self.dropout1(x, training=training)
        x = self.dense2(x)
        x = self.dropout2(x, training=training)

        return self.classifier(x)

In [14]:
def plot_confusion_matrix(conf_matrix, labels, fold):
    """Plot and save confusion matrix."""
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.title(f'Confusion Matrix - Fold {fold+1}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f'confusion_matrix_fold_{fold+1}.png')
    plt.close()

def plot_training_history(history, fold):
    """Plot training history."""
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Loss - Fold {fold+1}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'Accuracy - Fold {fold+1}')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.savefig(f'training_history_fold_{fold+1}.png')
    plt.close()

In [15]:
def calculate_average_metrics(all_metrics):
    """Calculate average metrics across folds safely."""
    avg_metrics = {}

    try:
        # For each metric type we want to average
        for metric in ['accuracy', 'macro avg', 'weighted avg']:
            if metric == 'accuracy':
                scores = [fold_metrics[metric] for fold_metrics in all_metrics]
            else:
                scores = [fold_metrics[metric]['f1-score'] for fold_metrics in all_metrics]
            avg_metrics[metric] = np.mean(scores)
    except Exception as e:
        print(f"Error calculating averages: {e}")
        print("Raw metrics:", all_metrics)
        return {}

    return avg_metrics

def train_and_evaluate(df, label_encoder, model_name="distilbert-base-uncased", n_splits=5):
    """Updated training pipeline with fixed metrics calculation."""
    print("GPU Available:", tf.config.list_physical_devices('GPU'))

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    training_args = {
        'epochs': 10,
        'batch_size': 16,
        'learning_rate': 1e-5,
        'weight_decay': 0.01
    }

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    all_metrics = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(df['text'], df['label'])):
        print(f"\nTraining Fold {fold + 1}/{n_splits}")

        train_texts = df['text'].iloc[train_idx].tolist()
        train_labels = df['label'].iloc[train_idx].tolist()
        val_texts = df['text'].iloc[val_idx].tolist()
        val_labels = df['label'].iloc[val_idx].tolist()

        train_encodings = tokenizer(
            train_texts,
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors="tf"
        )

        val_encodings = tokenizer(
            val_texts,
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors="tf"
        )

        train_dataset = tf.data.Dataset.from_tensor_slices((
            dict(train_encodings),
            train_labels
        )).shuffle(1000).batch(training_args['batch_size'])

        val_dataset = tf.data.Dataset.from_tensor_slices((
            dict(val_encodings),
            val_labels
        )).batch(training_args['batch_size'])

        model = DistillbertIntentModel(model_name, len(label_encoder.classes_))

        optimizer = tf.keras.optimizers.AdamW(
            learning_rate=training_args['learning_rate'],
            weight_decay=training_args['weight_decay']
        )

        model.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy']
        )

        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=3,
                restore_best_weights=True
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=2,
                min_lr=1e-6
            )
        ]

        # Train model
        history = model.fit(
            train_dataset,
            validation_data=val_dataset,
            epochs=training_args['epochs'],
            callbacks=callbacks
        )

        # Evaluate
        predictions = model.predict(val_dataset)
        pred_labels = np.argmax(predictions, axis=-1)

        pred_labels = label_encoder.inverse_transform(pred_labels)
        true_labels = label_encoder.inverse_transform(val_labels)

        try:
            fold_report = classification_report(
                true_labels,
                pred_labels,
                output_dict=True,
                zero_division=0
            )
            all_metrics.append(fold_report)

            print(f"\nFold {fold + 1} Results:")
            print(classification_report(true_labels, pred_labels))
        except Exception as e:
            print(f"Error calculating metrics for fold {fold + 1}: {e}")

        # Save model
        model.save_weights(f'best_model_fold_{fold+1}')

        tf.keras.backend.clear_session()

    print("\nAverage Metrics Across All Folds:")
    avg_metrics = calculate_average_metrics(all_metrics)

    for metric, value in avg_metrics.items():
        print(f"{metric}: {value:.4f}")

    return all_metrics, avg_metrics

In [16]:
# Train and evaluate
all_metrics, avg_metrics = train_and_evaluate(df, label_encoder)

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Training Fold 1/5


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

Fold 1 Results:
                        precision    recall  f1-score   support

analyzing_surroundings       1.00      1.00      1.00       425
  asking_for_direction       0.97      0.97      0.97       549
service_recommendation       0.97      0.97      0.97       497

              accuracy                           0.98      1471
             macro avg       0.98      0.98      0.98      1471
          weighted avg       0.98      0.98      0.98      1471


Training Fold 2/5


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Fold 2 Results:
                        precision    recall  f1-score   support

analyzing_surroundings       1.00      1.00      1.00       425
  asking_for_direction       0.99      0.98      0.98       549
service_recommendation       0.98      0.99      0.98       497

              accuracy                           0.99      1471
             macro avg       0.99      0.99      0.99      1471
          weighted avg       0.99      0.99      0.99      1471


Training Fold 3/5


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

Fold 3 Results:
                        precision    recall  f1-score   support

analyzing_surroundings       1.00      1.00      1.00       425
  asking_for_direction       0.97      0.98      0.98       549
service_recommendation       0.98      0.96      0.97       497

              accuracy                           0.98      1471
             macro avg       0.98      0.98      0.98      1471
          weighted avg       0.98      0.98      0.98      1471


Training Fold 4/5


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

Fold 4 Results:
                        precision    recall  f1-score   support

analyzing_surroundings       1.00      1.00      1.00       425
  asking_for_direction       0.97      0.98      0.98       549
service_recommendation       0.98      0.97      0.98       497

              accuracy                           0.98      1471
             macro avg       0.98      0.98      0.98      1471
          weighted avg       0.98      0.98      0.98      1471


Training Fold 5/5


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

Fold 5 Results:
                        precision    recall  f1-score   support

analyzing_surroundings       1.00      1.00      1.00       424
  asking_for_direction       0.97      0.98      0.97       549
service_recommendation       0.97      0.96      0.97       497

              accuracy                           0.98      1470
             macro avg       0.98      0.98      0.98      1470
          weighted avg       0.98      0.98      0.98      1470


Average Metrics Across All Folds:
accuracy: 0.9818
macro avg: 0.9828
weighted avg: 0.9818


In [17]:
all_metrics

[{'analyzing_surroundings': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 425.0},
  'asking_for_direction': {'precision': 0.9690909090909091,
   'recall': 0.970856102003643,
   'f1-score': 0.9699727024567789,
   'support': 549.0},
  'service_recommendation': {'precision': 0.967741935483871,
   'recall': 0.96579476861167,
   'f1-score': 0.9667673716012085,
   'support': 497.0},
  'accuracy': 0.9775662814411965,
  'macro avg': {'precision': 0.9789442815249267,
   'recall': 0.9788836235384376,
   'f1-score': 0.9789133580193291,
   'support': 1471.0},
  'weighted avg': {'precision': 0.9775653643959163,
   'recall': 0.9775662814411965,
   'f1-score': 0.9775651919337677,
   'support': 1471.0}},
 {'analyzing_surroundings': {'precision': 1.0,
   'recall': 0.9976470588235294,
   'f1-score': 0.9988221436984688,
   'support': 425.0},
  'asking_for_direction': {'precision': 0.9871794871794872,
   'recall': 0.9817850637522769,
   'f1-score': 0.9844748858447488,
   'support'

In [18]:
avg_metrics

{'accuracy': 0.9817782340672503,
 'macro avg': 0.9828180945231061,
 'weighted avg': 0.981775243300936}

In [19]:
import tensorflow as tf
from transformers import AutoTokenizer
import numpy as np

def load_model_for_prediction(model_name="distilbert-base-uncased",
                            num_classes=None,
                            weights_path=None):
    """
    Load the trained model with weights
    """
    model = DistillbertIntentModel(model_name, num_classes)

    # Create a dummy input to build the model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    dummy_input = tokenizer("dummy text", return_tensors="tf", padding=True)
    _ = model(dummy_input)  # This builds the model

    # Load the weights
    if weights_path:
        model.load_weights(weights_path)

    return model

def predict_intent(text, model, label_encoder, model_name="distilbert-base-uncased", max_length=128):
    """
    Make predictions using the trained model

    Args:
        text: String or list of strings to predict
        model: Trained DistillbertIntentModel instance
        label_encoder: The LabelEncoder used during training
        model_name: Name of the BERT model used during training
        max_length: Maximum sequence length used during training

    Returns:
        Predicted labels and probabilities
    """
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Handle both single string and list of strings
    if isinstance(text, str):
        text = [text]

    # Tokenize
    encodings = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='tf'
    )

    # Make prediction
    predictions = model(encodings, training=False)

    # Get probabilities
    probabilities = tf.nn.softmax(predictions, axis=-1)

    # Get predicted labels
    pred_indices = tf.argmax(probabilities, axis=-1).numpy()
    pred_labels = label_encoder.inverse_transform(pred_indices)

    return pred_labels, probabilities.numpy()

In [20]:
# 1. First load the model with weights from a specific fold
num_classes = len(label_encoder.classes_)
model = load_model_for_prediction(
    model_name="distilbert-base-uncased",
    num_classes=num_classes,
    weights_path='/content/best_model_fold_1'  # or whichever fold performed best
)

# 2. Make predictions
texts = [
    "can you guide me to Blok M?",
    "Where is the nearest escalator",
    "can you recommend ramen places"
]

predicted_labels, probabilities = predict_intent(
    texts,
    model,
    label_encoder,
    model_name="distilbert-base-uncased"
)

# 3. Print results
for text, label, probs in zip(texts, predicted_labels, probabilities):
    print(f"\nText: {text}")
    print(f"Predicted Intent: {label}")
    print(f"Confidence: {np.max(probs):.4f}")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.



Text: can you guide me to Blok M?
Predicted Intent: asking_for_direction
Confidence: 0.9863

Text: Where is the nearest escalator
Predicted Intent: analyzing_surroundings
Confidence: 0.9505

Text: can you recommend ramen places
Predicted Intent: service_recommendation
Confidence: 0.9980
