In [6]:
from transformers import (
    Trainer,
    TrainingArguments,
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast, # need this because we are using distil bert for classification (the fast version has bindings in c)
    DataCollatorWithPadding, # The collater is responsible for generated batches of data
    pipeline,
)
# we do distil bert for sequence classification for the speed
from datasets import load_metric, Dataset
import numpy as np

In [7]:
snips_file= open('../data/snips.train.txt', 'rb')
snips_rows = snips_file.readlines()
print(snips_rows[:20])


[b'listen O\r\n', b'to O\r\n', b'westbam B-artist\r\n', b'alumb O\r\n', b'allergic B-album\r\n', b'on O\r\n', b'google B-service\r\n', b'music I-service\r\n', b'PlayMusic\r\n', b'\r\n', b'add O\r\n', b'step B-entity_name\r\n', b'to I-entity_name\r\n', b'me I-entity_name\r\n', b'to O\r\n', b'the O\r\n', b'50 B-playlist\r\n', b'cl\xc3\xa1sicos I-playlist\r\n', b'playlist O\r\n', b'AddToPlaylist\r\n']


In [8]:
utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = ',',[],[]
for snip_row in snips_rows:
    if len(snip_row)==2: #skip over rows with no data
        continue
    if ' ' not in snip_row.decode(): #we've hit a sequence label
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(label_for_utterances)
        utterance, tokenized_utterance, label_for_utterances = '',[],[]
        continue
    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()
    utterance +=f'{token}'
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)

print(tokenized_utterances[0])
print(labels_for_tokens[0])
print(utterances[0])
print(sequence_labels[0])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
,listentowestbamalumballergicongooglemusic
PlayMusic


In [9]:
unique_sequence_labels = list(set(sequence_labels))

In [10]:
sequence_labels = [unique_sequence_labels.index(sequence_label) for sequence_label in sequence_labels]
print(f'There are {len(unique_sequence_labels)} unique sequence labels')

There are 7 unique sequence labels


In [11]:
from functools import reduce
unique_token_labels = list(set(reduce(lambda x,y: x+y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]
print(f'there are {len(unique_token_labels)} unique token labels')

there are 72 unique token labels


In [13]:
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances,
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)
# now we can split u pthe dtaset
snips_dataset = snips_dataset.train_test_split(test_size=0.2) #80 percent train, 20 percent test

In [14]:
snips_dataset['train'][0]

{'utterance': 'bookconeyislandneighboringvaformarinaandi',
 'label': 2,
 'tokens': ['book',
  'coney',
  'island',
  'neighboring',
  'va',
  'for',
  'marina',
  'and',
  'i'],
 'token_labels': [39, 37, 41, 31, 12, 39, 11, 9, 9]}

In [15]:
tokenizer =DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
def preprocess_function(examples):
    return tokenizer(examples["utterance"], truncation=True)

tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 90.6kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.09MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 6.30MB/s]
config.json: 100%|██████████| 483/483 [00:00<00:00, 4.36MB/s]


In [18]:
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 10467/10467 [00:00<00:00, 18039.86 examples/s]
Map: 100%|██████████| 2617/2617 [00:00<00:00, 29854.69 examples/s]


In [20]:
seq_clf_tokenized_snips["train"][0]

{'utterance': 'bookconeyislandneighboringvaformarinaandi',
 'label': 2,
 'tokens': ['book',
  'coney',
  'island',
  'neighboring',
  'va',
  'for',
  'marina',
  'and',
  'i'],
 'token_labels': [39, 37, 41, 31, 12, 39, 11, 9, 9],
 'input_ids': [101,
  2338,
  8663,
  3240,
  2483,
  3122,
  2638,
  18377,
  12821,
  2075,
  3567,
  14192,
  27943,
  5685,
  2072,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [21]:
"""Data collatore with padding creates batch of data. It also dynamically pads text to the
length of the longest element in the batch, making them all the same length.
It's possible to pad your text in the tokenizer with padding=True, dynamic padding is more efficient.
"""

data_collator= DataCollatorWithPadding(tokenizer=tokenizer)

# attention mask is how we ignore attention scores for padding tokens

In [22]:
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels = len(unique_sequence_labels)
)

sequence_clf_model.config.id2label = {i:label for i, label in enumerate(unique_sequence_labels)}

model.safetensors: 100%|██████████| 268M/268M [00:04<00:00, 56.1MB/s] 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
metric = load_metric('accuracy')

def custom_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)