In [1]:
import json
with open('ATIS_train.json', 'r') as f:
    data = json.load(f)

messages_list = data['rasa_nlu_data']['common_examples']
messages_list[0]

{'text': 'i want to fly from boston at 838 am and arrive in denver at 1110 in the morning',
 'intent': 'flight',
 'entities': [{'start': 19,
   'end': 25,
   'value': 'boston',
   'entity': 'fromloc.city_name'},
  {'start': 29, 'end': 35, 'value': '838 am', 'entity': 'depart_time.time'},
  {'start': 50, 'end': 56, 'value': 'denver', 'entity': 'toloc.city_name'},
  {'start': 60, 'end': 64, 'value': '1110', 'entity': 'arrive_time.time'},
  {'start': 72,
   'end': 79,
   'value': 'morning',
   'entity': 'arrive_time.period_of_day'}]}

## Step 1: Preprocess the Data

In [2]:
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_align_labels_with_bio(data, tokenizer):
    tokenized_texts = []
    aligned_labels = []

    for item in data:
        text = item['text']
        entities = item['entities']

        # Tokenize text and create a mapping from token to word in the original text
        tokenized_input = tokenizer(text, return_offsets_mapping=True, truncation=True, padding='max_length', return_tensors="pt")
        tokens = tokenized_input.tokens()
        offsets = tokenized_input['offset_mapping'].squeeze().tolist()

        # Initialize all tokens as outside
        labels = ['O'] * len(tokens)

        for entity in entities:
            start, end = entity['start'], entity['end']
            entity_label = entity['entity']

            # Flag to indicate if we are at the beginning of an entity
            is_begin = True

            for idx, (offset_start, offset_end) in enumerate(offsets):
                # Check if this token is part of the entity
                if offset_start >= start and offset_end <= end:
                    prefix = 'B-' if is_begin else 'I-'
                    labels[idx] = f"{prefix}{entity_label}"
                    is_begin = False  # After the first token, all others are inside

        tokenized_texts.append(tokenized_input)
        aligned_labels.append(labels)

    return tokenized_texts, aligned_labels


# Tokenize and align labels for your data
tokenized_texts, aligned_labels = tokenize_and_align_labels_with_bio(messages_list, tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Step 2: Choose a Pre-trained Model

In [3]:
unique_labels = set()  # Set to store unique labels

# Iterate over the aligned labels to collect unique labels
for labels in aligned_labels:
    unique_labels.update(labels)  # Add all labels from this example to the set

# Calculate the number of unique labels
number_of_labels = len(unique_labels)

number_of_labels


131

In [4]:
# function that creates a mapping from label to id and from id to label
def create_label_mapping(unique_labels):
    # Create a mapping from labels to ids and vice versa
    label2id = {label: i for i, label in enumerate(unique_labels)}
    id2label = {i: label for label, i in label2id.items()}

    return label2id, id2label

# Create label mappings
label2id, id2label = create_label_mapping(sorted(unique_labels))
print(label2id)

{'B-aircraft_code': 0, 'B-airline_code': 1, 'B-airline_name': 2, 'B-airport_code': 3, 'B-airport_name': 4, 'B-arrive_date.date_relative': 5, 'B-arrive_date.day_name': 6, 'B-arrive_date.day_number': 7, 'B-arrive_date.month_name': 8, 'B-arrive_date.today_relative': 9, 'B-arrive_time.end_time': 10, 'B-arrive_time.period_mod': 11, 'B-arrive_time.period_of_day': 12, 'B-arrive_time.start_time': 13, 'B-arrive_time.time': 14, 'B-arrive_time.time_relative': 15, 'B-city_name': 16, 'B-class_type': 17, 'B-connect': 18, 'B-cost_relative': 19, 'B-day_name': 20, 'B-day_number': 21, 'B-days_code': 22, 'B-depart_date.date_relative': 23, 'B-depart_date.day_name': 24, 'B-depart_date.day_number': 25, 'B-depart_date.month_name': 26, 'B-depart_date.today_relative': 27, 'B-depart_date.year': 28, 'B-depart_time.end_time': 29, 'B-depart_time.period_mod': 30, 'B-depart_time.period_of_day': 31, 'B-depart_time.start_time': 32, 'B-depart_time.time': 33, 'B-depart_time.time_relative': 34, 'B-economy': 35, 'B-fare_a

In [5]:
aligned_labels_ids = [[label2id[label] for label in example_labels] for example_labels in aligned_labels]
print(aligned_labels_ids)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [6]:
from sklearn.model_selection import train_test_split

# Assuming token_vectors and tags_numerical are your full dataset.
train_tokenized_texts, test_tokenized_texts, train_aligned_labels_ids, test_aligned_labels_ids = train_test_split(
    tokenized_texts, aligned_labels_ids, test_size=0.2, random_state=42
)


## Step 3: Fine-Tune the Model

In [7]:
from transformers import AutoModelForTokenClassification, AutoConfig

# Load the configuration of the model
config = AutoConfig.from_pretrained('bert-base-uncased', num_labels=number_of_labels)

# Load the pre-trained model for token classification
model = AutoModelForTokenClassification.from_config(config)


3.1. Prepare the Dataset

In [8]:
import torch
from torch.utils.data import Dataset

class EntityExtractionDataset(Dataset):
    def __init__(self, tokenized_data, labels):
        self.tokenized_data = tokenized_data
        self.labels = labels

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        item_tokenized_data = self.tokenized_data[idx]
        item_labels = self.labels[idx]

        # Remove the extra singleton dimension
        item = {key: value.squeeze(0) for key, value in item_tokenized_data.items() if key != 'offset_mapping'}

        # Ensure 'labels' also does not have the extra dimension
        item['labels'] = torch.tensor(item_labels, dtype=torch.long).squeeze(0)

        return item


# Convert your data into the dataset
train_dataset = EntityExtractionDataset(train_tokenized_texts, train_aligned_labels_ids)


3.2. Set Training Arguments

In [9]:
!pip install accelerate -U



In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # Directory where the model predictions and checkpoints will be written
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size per device during training
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
)


3.3. Initialize the Trainer

In [11]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)


3.4. Train

In [12]:
trainer.train()

Step,Training Loss
500,0.2194


TrainOutput(global_step=747, training_loss=0.14877611789677836, metrics={'train_runtime': 1167.6634, 'train_samples_per_second': 10.231, 'train_steps_per_second': 0.64, 'total_flos': 3125091545376768.0, 'train_loss': 0.14877611789677836, 'epoch': 3.0})

## evaluate the model

In [13]:
test_dataset = EntityExtractionDataset(test_tokenized_texts, test_aligned_labels_ids)

In [14]:
eval_results = trainer.evaluate(test_dataset)
print("Evaluation Results:", eval_results)

Evaluation Results: {'eval_loss': 0.004322611261159182, 'eval_runtime': 33.2301, 'eval_samples_per_second': 29.973, 'eval_steps_per_second': 3.762, 'epoch': 3.0}


In [15]:
predictions, labels, _ = trainer.predict(test_dataset)

In [16]:
import numpy as np
import torch

# Apply softmax to convert logits to probabilities
probabilities = torch.softmax(torch.tensor(predictions), dim=-1)

# Convert probabilities to label IDs
predicted_label_ids = torch.argmax(probabilities, dim=-1).numpy()

# Convert the label IDs to label strings using id2label
predicted_labels = [[id2label[label_id] for label_id in sentence] for sentence in predicted_label_ids]
true_labels = [[id2label[label_id] for label_id in sentence] for sentence in test_aligned_labels_ids]

In [17]:
from sklearn.metrics import classification_report

all_true_labels = []
all_predictions = []

# test_aligned_labels_ids [data_len, 512] label id
# predictions [data_len, 512, 131] float
# predicted_labels [data_len, 512] label

# test_aligned_labels_ids[0][0]

# for i in range(len(test_dataset)):
#     all_true_labels.extend([id2label[label_id] for label_id in test_aligned_labels_ids[i]])
all_predictions.extend(predicted_labels)
all_true_labels.extend(true_labels)

# all_true_labels

target_names = sorted(unique_labels)

all_true_labels = [label for sublist in all_true_labels for label in sublist]
all_predictions = [prediction for sublist in all_predictions for prediction in sublist]

report = classification_report(all_true_labels, all_predictions, labels=sorted(unique_labels), target_names=target_names)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                              precision    recall  f1-score   support

             B-aircraft_code       0.00      0.00      0.00         3
              B-airline_code       1.00      0.96      0.98        25
              B-airline_name       0.98      0.99      0.98       138
              B-airport_code       1.00      0.60      0.75         5
              B-airport_name       1.00      0.11      0.20         9
 B-arrive_date.date_relative       0.00      0.00      0.00         1
      B-arrive_date.day_name       1.00      0.23      0.38        13
    B-arrive_date.day_number       0.00      0.00      0.00         7
    B-arrive_date.month_name       1.00      0.14      0.25         7
B-arrive_date.today_relative       0.00      0.00      0.00         0
      B-arrive_time.end_time       0.00      0.00      0.00         3
    B-arrive_time.period_mod       0.00      0.00      0.00         0
 B-arrive_time.period_of_day       0.00      0.00      0.00        13
    B-arrive_time.s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(all_true_labels, all_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9988900916164659
