<a href="https://colab.research.google.com/github/zy4kamu/FireMisha/blob/main/intent_classifier_and_slot_filler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 0: Preliminary actions

In [1]:
# Install HuggingFace transformers straight from the GitHub repository.
# NOTE(review): the install is unpinned, so a re-run may pick up a different
# library version; the log below shows this run built transformers 3.3.1 --
# consider pinning a release to keep the notebook reproducible.

!pip install git+https://github.com/huggingface/transformers.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-3x3kucvq
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-3x3kucvq
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (PEP 517) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-3.3.1-cp36-none-any.whl size=1082350 sha256=75bb1cd6dc87be6376acbbcac121b538bc594b91ac26c76c4dc741ffb806739c
  Stored in directory: /tmp/pip-ephem-wheel-cache-blocrj0x/wheels/33/eb/3b/4bf5dd835e865e472d4fc0754f35ac0edb08fe852e8f21655f
Successfully built transformers


In [2]:
# Download the dataset for training: a fresh clone of sonos/nlu-benchmark.
# Any previous checkout is deleted first so the cell can be re-run safely.

!rm -rf nlu-benchmark
!git clone https://github.com/sonos/nlu-benchmark.git

Cloning into 'nlu-benchmark'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 400 (delta 2), reused 0 (delta 0), pack-reused 389[K
Receiving objects: 100% (400/400), 1.19 MiB | 13.88 MiB/s, done.
Resolving deltas: 100% (248/248), done.


In [3]:
# Import required packages and set logging verbosity

import json
import os
import random
import string
import sys

import numpy as np
import tensorflow as tf

import transformers
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
from transformers import TFDistilBertForTokenClassification


# Keep the cell outputs readable: only ERROR-level messages from TensorFlow's
# logger, and error verbosity for the transformers library.
tf.get_logger().setLevel('ERROR')
transformers.logging.set_verbosity_error()

# Step 1: Train intent classifier

In [4]:
# Create training/validation dataset and save files to disk

def parse_files(input_files, output_file, labels):
    """Turn per-intent benchmark files into one shuffled JSON-lines file.

    Every input file must contain a JSON object with exactly one key (the
    intent name) whose value is a list of utterances; each utterance has a
    'data' list of chunks carrying a 'text' field.

    Args:
        input_files: paths of the benchmark JSON files to read.
        output_file: path of the JSON-lines file to write; each line is
            {"text": <lower-cased utterance>, "label": <index into labels>}.
        labels: list of intent names, extended in place with any intent that
            is not present yet. Its final content is also written to
            'intent_classes.json' in the current directory.
    """
    def load_examples(path):
        """Return the serialized examples of one benchmark file."""
        # Undecodable bytes are dropped rather than aborting the whole import.
        with open(path, errors='ignore') as reader:
            payload = json.load(reader)
        assert len(payload) == 1
        intent, utterances = next(iter(payload.items()))
        if intent not in labels:
            labels.append(intent)
        intent_index = labels.index(intent)
        examples = []
        for utterance in utterances:
            chunks = utterance['data']
            text = ''.join(chunk['text'] for chunk in chunks).lower()
            examples.append(json.dumps({'text': text, 'label': intent_index}))
        return examples

    records = []
    for path in input_files:
        records.extend(load_examples(path))

    # Shuffle once on disk so that intents are interleaved for training.
    random.shuffle(records)
    with open(output_file, 'w') as writer:
        writer.write('\n'.join(records))
    with open('intent_classes.json', 'w') as writer:
        writer.write(json.dumps(labels))


# The benchmark stores one folder per intent; file names follow a fixed
# pattern, so the path lists are generated instead of spelled out.
benchmark_dir = 'nlu-benchmark/2017-06-custom-intent-engines'
intent_names = [
    'SearchCreativeWork',
    'RateBook',
    'SearchScreeningEvent',
    'AddToPlaylist',
    'PlayMusic',
    'GetWeather',
    'BookRestaurant',
]

# Filled by the first parse_files call; the list order defines label indexes.
intent_classes = []

input_files = ['{0}/{1}/train_{1}_full.json'.format(benchmark_dir, name)
               for name in intent_names]
parse_files(input_files, 'train_classification_dataset.json', intent_classes)

input_files = ['{0}/{1}/validate_{1}.json'.format(benchmark_dir, name)
               for name in intent_names]
parse_files(input_files, 'validate_classification_dataset.json', intent_classes)

In [5]:
# Read training and validate datasets

def read_dataset(input_file):
  """Read a JSON-lines classification dataset.

  Args:
    input_file: path of a file whose non-empty lines are JSON objects with
      a 'text' and a 'label' entry.

  Returns:
    A (sentences, labels) pair of parallel lists, in file order.
  """
  sentences = []
  labels = []
  with open(input_file) as reader:
    for line in reader:
      # A trailing newline or an empty line is not a record; json.loads would
      # raise on it, so skip it instead of aborting the whole load.
      if not line.strip():
        continue
      item = json.loads(line)
      sentences.append(item['text'])
      labels.append(item['label'])
  return sentences, labels

def read_intent_classes(input_file):
  """Return the JSON document stored in `input_file`.

  The notebook uses it to load the intent names written to
  'intent_classes.json'.
  """
  with open(input_file) as reader:
    content = reader.read()
  return json.loads(content)

training_sentences, training_labels = read_dataset(
    'train_classification_dataset.json')
validation_sentences, validation_labels = read_dataset(
    'validate_classification_dataset.json')
intent_classes = read_intent_classes('intent_classes.json')

# Print the label set followed by a five-item preview of every list.
print('Intent classes:', intent_classes)
for caption, values in (('Training sentences:', training_sentences),
                        ('Training labels:', training_labels),
                        ('Validation sentences:', validation_sentences),
                        ('Validation labels:', validation_labels)):
  print(caption, values[:5], '...')


Intent classes: ['SearchCreativeWork', 'RateBook', 'SearchScreeningEvent', 'AddToPlaylist', 'PlayMusic', 'GetWeather', 'BookRestaurant']
Training sentences: ['i want to see the tv series a state of mind', 'add the song by brian larsen to the cardio playlist', 'book a reservation for 7 people at the french laundry on june the 8th, 2029', 'book for jessie, dale wright and lupe at a bistro on feb. 20, 2040', 'play some rock & roll by deezer.'] ...
Training labels: [0, 3, 6, 6, 4] ...
Validation sentences: ['is we are northern lights playing in any movie theatre', 'is patrick still lives showing at amc theaters', 'can i see ellis island revisited in 1 minute', 'i want to book a restaurant in niger for seven people.', 'please look up the novel, live to dance.'] ...
Validation labels: [2, 2, 2, 6, 0] ...


In [6]:
# Create training and validation datasets for TensorFlow backend

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Both splits are tokenized with the same options; each call pads to the
# longest sentence of the list it receives.
encode_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(training_sentences, **encode_options)
val_encodings = tokenizer(validation_sentences, **encode_options)

# Pair the tokenizer output (as a plain dict of features) with the labels.
train_features = dict(train_encodings)
val_features = dict(val_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_features, training_labels))
val_dataset = tf.data.Dataset.from_tensor_slices(
    (val_features, validation_labels))

In [7]:
# Train the model

# Fine-tune DistilBERT with a classification head sized to the intent set.
BATCH_SIZE = 16

model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                              num_labels=len(intent_classes))
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

# The datasets are batched explicitly, so `batch_size` is not passed to `fit`
# as well: Keras documents that argument as not applicable to dataset inputs.
# The validation split is not shuffled because example order has no effect on
# the reported validation metrics.
model.fit(train_dataset.shuffle(100).batch(BATCH_SIZE),
          epochs=3,
          validation_data=val_dataset.batch(BATCH_SIZE))

# Persist the fine-tuned weights; the next cell reloads them from this folder.
model.save_pretrained('classification_model')

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [8]:
# Load the model and make a prediction on a specific sentence.
# The weights are read back from the 'classification_model' directory that the
# training cell wrote with save_pretrained.

classification_model = TFDistilBertForSequenceClassification.from_pretrained('classification_model')


def get_intent_class(test_sentence):
  """Classify one sentence with the fine-tuned intent classifier.

  Args:
    test_sentence: the text to classify.

  Returns:
    A pair (intent name, probabilities) where probabilities is the softmax
    over the model output for this sentence, indexed like `intent_classes`.
  """
  encoded = tokenizer.encode(test_sentence,
                             truncation=True,
                             padding=True,
                             return_tensors='tf')
  # predict() returns a tuple-like result; its first item holds the class scores.
  scores = classification_model.predict(encoded)[0]
  probabilities = tf.nn.softmax(scores).numpy()[0]
  best_index = np.argmax(probabilities)
  return intent_classes[best_index], probabilities


# Show the full probability table for one example, then the winning intent.
demo_sentence = 'play a jack lawrence concerto'

print()
intent_class, probabilities = get_intent_class(demo_sentence)
for class_name, class_probability in zip(intent_classes, probabilities):
  print('{}: {:0.5f}'.format(class_name, class_probability))
print()
print('Best intent class:', intent_class)


SearchCreativeWork: 0.00201
RateBook: 0.00006
SearchScreeningEvent: 0.00009
AddToPlaylist: 0.00026
PlayMusic: 0.99749
GetWeather: 0.00003
BookRestaurant: 0.00006

Best intent class: PlayMusic


# Step 2: Train slot filler on all intent datasets

In [9]:
# Create training dataset and save files to disk


def parse_files(input_files, output_file,
                classification_restrict_dic, entity_dic,
                max_seq_length=41):
    """Build a slot-filling (token classification) dataset from benchmark files.

    Every input file must contain a JSON object with exactly one key (the
    intent name) whose value is a list of utterances; each utterance has a
    'data' list of chunks with a 'text' field and, for slot values, an
    'entity' field naming the slot.

    Args:
        input_files: paths of the benchmark JSON files to read.
        output_file: path of the JSON-lines file to write. Each line holds
            'input_ids', 'attention_mask', 'labels' and 'tokens', all cut or
            zero-padded to `max_seq_length` items.
        classification_restrict_dic: dict mapping intent name to the list of
            label ids seen for that intent; updated in place and also written
            to 'classification_restrict_dic.json'.
        entity_dic: dict mapping slot name to label id ('' means "no slot");
            updated in place. The names ordered by id are written to
            'entity_dic.json'.
        max_seq_length: fixed length of every stored sequence (default 41,
            the value previously hard-coded).

    NOTE(review): labels[i] describes tokens[i], whereas input_ids comes from
    a plain tokenizer(text) call, which presumably includes the tokenizer's
    special tokens. That positional convention is kept unchanged because
    fill_slots pairs model outputs with tokens in the same way.
    """
    def cut(data):
        """Return a copy of `data` with exactly max_seq_length items (0-padded)."""
        clipped = list(data[:max_seq_length])
        return clipped + [0] * (max_seq_length - len(clipped))

    def get_token_start_offsets(text):
        """Return the character offset in `text` where each word piece starts.

        The offsets come from the tokenizer itself, so punctuation that is not
        preceded by a space and repeated whitespace are handled correctly
        (guessing one space before every non-'##' token drifts after each
        punctuation mark).
        """
        encoding = tokenizer(text,
                             add_special_tokens=False,
                             return_offsets_mapping=True)
        return [start for start, _ in encoding['offset_mapping']]

    def get_segment_indexes(sentence):
        """Return (slot name, start, end) character spans of the slot chunks."""
        segments = []
        index = 0
        for item in sentence:
            end = index + len(item['text'])
            if 'entity' in item:
                segments.append((item['entity'], index, end))
            index = end
        return segments

    def get_entity(index, segments):
        """Return the label id of the slot covering character `index`."""
        val = ''
        for entity, start, end in segments:
            if start <= index < end:
                val = entity
                break
        if val not in entity_dic:
            # Ids are handed out in order of first appearance.
            entity_dic[val] = len(entity_dic)
        return entity_dic[val]

    def parse_file(input_file, output):
        """Append the serialized examples of one benchmark file to `output`."""
        with open(input_file, errors='ignore') as reader:
            data = json.load(reader)
        assert len(data) == 1
        classification_label, sentences = next(iter(data.items()))
        allowed = classification_restrict_dic.setdefault(classification_label, [])
        for sentence in sentences:
            sentence = sentence['data']
            text = ''.join(chunk['text'] for chunk in sentence).lower()
            tokens = tokenizer.tokenize(text)
            token_indexes = tokenizer(text)
            segment_indexes = get_segment_indexes(sentence)
            entities = [get_entity(offset, segment_indexes)
                        for offset in get_token_start_offsets(text)]
            for entity in entities:
                if entity not in allowed:
                    allowed.append(entity)
            to_add = {'input_ids': cut(token_indexes['input_ids']),
                      'attention_mask': cut(token_indexes['attention_mask']),
                      'labels': cut(entities),
                      'tokens': cut(tokens)}
            output.append(json.dumps(to_add))
        return output

    output = []
    for input_file in input_files:
        parse_file(input_file, output)

    # Store the slot names as a list whose positions are the label ids.
    entity_names = [name for name, _ in
                    sorted(entity_dic.items(), key=lambda pair: pair[1])]
    with open('entity_dic.json', 'w') as writer:
        writer.write(json.dumps(entity_names))

    with open('classification_restrict_dic.json', 'w') as writer:
        writer.write(json.dumps(classification_restrict_dic))

    random.shuffle(output)
    with open(output_file, 'w') as writer:
        writer.write('\n'.join(output))


# The benchmark stores one folder per intent; file names follow a fixed
# pattern, so the path lists are generated instead of spelled out.
benchmark_dir = 'nlu-benchmark/2017-06-custom-intent-engines'
intent_names = [
    'SearchCreativeWork',
    'RateBook',
    'SearchScreeningEvent',
    'AddToPlaylist',
    'PlayMusic',
    'GetWeather',
    'BookRestaurant',
]

# Both dictionaries are filled by parse_files and shared between the two calls
# so that label ids stay consistent across the training and validation files.
classification_restrict_dic = {}
entity_dic = {}

input_files = ['{0}/{1}/train_{1}_full.json'.format(benchmark_dir, name)
               for name in intent_names]
parse_files(input_files, 'train_slots_dataset.json',
            classification_restrict_dic, entity_dic)

input_files = ['{0}/{1}/validate_{1}.json'.format(benchmark_dir, name)
               for name in intent_names]
parse_files(input_files, 'validate_slots_dataset.json',
            classification_restrict_dic, entity_dic)

In [12]:
# Load dataset

# Slot names, stored as a list whose positions are the label ids.
with open('entity_dic.json') as reader:
  slot_labels_dict = json.loads(reader.read())
print('Dictionary:')
for label_id, slot_name in enumerate(slot_labels_dict):
  print('    {}: "{}"'.format(label_id, slot_name))

# For every intent, the label ids that were seen in its utterances.
with open('classification_restrict_dic.json') as reader:
  classification_restrict_dic = json.loads(reader.read())
print('Classification restrict dictionary:')
for intent_name in classification_restrict_dic:
  print('    {}: {}'.format(intent_name, classification_restrict_dic[intent_name]))

def create_dataset(input_file):
  """Load a slot-filling JSON-lines file into a tf.data dataset.

  Args:
    input_file: path of a file whose non-empty lines are JSON objects with
      'input_ids', 'attention_mask' and 'labels' lists.

  Returns:
    The result of tf.data.Dataset.from_tensor_slices applied to
    ({'input_ids': ..., 'attention_mask': ...}, labels).
  """
  input_ids = []
  attention_mask = []
  labels = []
  # Read the file that was asked for. The path used to be hard-coded to the
  # training file, which made the "validation" dataset a copy of the training
  # data.
  with open(input_file) as reader:
    for line in reader:
      if not line.strip():
        continue  # tolerate blank or trailing lines
      to_add = json.loads(line)
      input_ids.append(to_add['input_ids'])
      attention_mask.append(to_add['attention_mask'])
      labels.append(to_add['labels'])
  print('Sample from {}:'.format(input_file))
  if input_ids:
    print('    Input ids:', input_ids[0][0:10])
    print('    Attention mask:', attention_mask[0][0:10])
    print('    Labels:', labels[0][0:10])
  dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids':input_ids, 'attention_mask':attention_mask},
    labels
  ))
  return dataset

# Build one dataset object per split through create_dataset; each call also
# prints a short sample of what it loaded.
train_slots_dataset = create_dataset('train_slots_dataset.json')
validate_slots_dataset = create_dataset('validate_slots_dataset.json')

Dictionary:
    0: ""
    1: "object_type"
    2: "object_name"
    3: "rating_value"
    4: "best_rating"
    5: "rating_unit"
    6: "object_select"
    7: "object_part_of_series_type"
    8: "location_name"
    9: "movie_name"
    10: "object_location_type"
    11: "timeRange"
    12: "movie_type"
    13: "spatial_relation"
    14: "music_item"
    15: "playlist"
    16: "artist"
    17: "playlist_owner"
    18: "entity_name"
    19: "track"
    20: "service"
    21: "year"
    22: "album"
    23: "sort"
    24: "genre"
    25: "geographic_poi"
    26: "condition_description"
    27: "current_location"
    28: "condition_temperature"
    29: "state"
    30: "city"
    31: "country"
    32: "restaurant_name"
    33: "restaurant_type"
    34: "poi"
    35: "served_dish"
    36: "party_size_number"
    37: "cuisine"
    38: "facility"
    39: "party_size_description"
Classification restrict dictionary:
    SearchCreativeWork: [0, 1, 2]
    RateBook: [0, 2, 3, 4, 5, 6, 1, 7]
    SearchS

In [None]:
# Train the model

# Fine-tune DistilBERT with a token classification head sized to the slot set.
BATCH_SIZE = 16

model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased',
                                                           num_labels=len(slot_labels_dict))
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

# The datasets are batched explicitly, so `batch_size` is not passed to `fit`
# as well: Keras documents that argument as not applicable to dataset inputs.
# The validation split is not shuffled because example order has no effect on
# the reported validation metrics.
model.fit(train_slots_dataset.shuffle(100).batch(BATCH_SIZE),
          epochs=3,
          validation_data=validate_slots_dataset.batch(BATCH_SIZE))

# Persist the fine-tuned weights; the next cell reloads them from this folder.
model.save_pretrained('slots.model')

Epoch 1/3


  return py_builtins.overload_of(f)(*args)




In [14]:
# Load the model and make a prediction on a specific sentence.
# The weights are read back from the 'slots.model' directory that the training
# cell wrote with save_pretrained.

slots_model = TFDistilBertForTokenClassification.from_pretrained('slots.model')

def fill_slots(test_sentence, restrict_labels=None):
  """Annotate a sentence with the slots predicted by the slot-filling model.

  Args:
    test_sentence: the text to annotate.
    restrict_labels: optional collection of label ids the prediction may
      choose from; when empty or None every label is allowed.

  Returns:
    The sentence rebuilt from its word pieces, where each run of pieces with
    the same non-zero label is rendered as '[<slot name>: <text>]'.
  """
  def pick_label(scores):
    """Return the best-scoring label id, honouring restrict_labels."""
    if not restrict_labels:
      return np.argmax(scores)
    best_index, best_score = -1, -1e+10
    for candidate in restrict_labels:
      score = scores[candidate]
      if score > best_score:
        best_index, best_score = candidate, score
    return best_index

  pieces = tokenizer.tokenize(test_sentence)
  encoded = tokenizer.encode(test_sentence,
                             return_tensors='tf')
  # First output of predict(), first (only) sentence: one score row per position.
  position_scores = slots_model.predict(encoded)[0][0, :, :]
  count = min(position_scores.shape[0], len(pieces))
  labels = [pick_label(position_scores[position, :]) for position in range(count)]

  # Merge neighbouring pieces that share a label into [text, label] groups.
  groups = []
  for piece, label in zip(pieces, labels):
    if groups and label == groups[-1][1]:
      if piece.startswith('##'):
        groups[-1][0] += piece[2:]
      else:
        groups[-1][0] += ' ' + piece
    else:
      groups.append([piece, label])

  rendered = []
  for text, label in groups:
    if label == 0:
      rendered.append(text + ' ')
    else:
      rendered.append('[' + slot_labels_dict[label] + ': ' + text + '] ')
  return ''.join(rendered).strip()

In [15]:
# Slot filling test: annotate one sentence with every label allowed
# (no restrict_labels argument is passed).

print(fill_slots('add another song to cita romantica playlist'))

add another [music_item: song] to [playlist: cita romantica] playlist


# Step 3: Complete test

In [16]:
# Final test

def predict_class_and_slots(test_sentence):
  """Run the full pipeline on one sentence.

  The intent is predicted first; slot filling is then limited to the label
  ids recorded for that intent in classification_restrict_dic.

  Returns:
    A pair (intent name, annotated sentence).
  """
  intent_class, _ = get_intent_class(test_sentence)
  allowed_labels = classification_restrict_dic[intent_class]
  return intent_class, fill_slots(test_sentence, allowed_labels)

# Run the intent classifier and the slot filler together on a few examples.
test_sentences = [
    'play a jack lawrence concerto',
    'what is the weather today?',
    'i would like to add visjoner to my playlist',
]
for test_sentence in test_sentences:
  intent_class, slots = predict_class_and_slots(test_sentence)
  print('Test sentence: {}'.format(test_sentence))
  print('Intent class: {}'.format(intent_class))
  print('Slots: {}'.format(slots))
  print()


Test sentence: play a jack lawrence concerto
Intent class: PlayMusic
Slots: play a [artist: jack lawrence] [music_item: concerto]

Test sentence: what is the weather today?
Intent class: GetWeather
Slots: what is the weather [timeRange: today] ?

Test sentence: i would like to add visjoner to my playlist
Intent class: AddToPlaylist
Slots: i would like to add [entity_name: visjoner] to [playlist_owner: my] playlist

