# Transformer model

In [15]:
# load sample original and transcribed questions to compare
from utils import load_queries

q_id = 't1'

queries_o = load_queries(queries_version='original')
queries_t = load_queries(queries_version='wav2vec2-base-960h')
print(queries_o[q_id])  # anchor
print(queries_t[q_id])  # positive

distractor_q = 'who is a musician born in las vegas'
print(distractor_q)  # negative

who is a musician born in detroit
who is a musician born indetroit
who is a musician born in las vegas


In [5]:
# load pre-trained model
from transformers import AutoTokenizer, AutoModel

# pre_trained_model_name = "distilbert-base-uncased"
pre_trained_model_name = "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"

tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name) 
bert_model = AutoModel.from_pretrained(pre_trained_model_name)

# check vocabulary size
print(tokenizer.vocab_size, 'tokens')
# show vocabulary
v = tokenizer.get_vocab()
# print(v)

30522 tokens


In [10]:
# tokenize
print(queries_o[q_id])
query_input_qo = tokenizer(queries_o[q_id], return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(query_input_qo["input_ids"][0]))

print(queries_t[q_id])
query_input_qt = tokenizer(queries_t[q_id], return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(query_input_qt["input_ids"][0]))

print(distractor_q)
query_input_qd = tokenizer(distractor_q, return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(query_input_qd["input_ids"][0]))

who is a musician born in detroit
['[CLS]', 'who', 'is', 'a', 'musician', 'born', 'in', 'detroit', '[SEP]']
who is a musician born indetroit
['[CLS]', 'who', 'is', 'a', 'musician', 'born', 'ind', '##et', '##roi', '##t', '[SEP]']
who is a musician born in las vegas
['[CLS]', 'who', 'is', 'a', 'musician', 'born', 'in', 'las', 'vegas', '[SEP]']


In [7]:
query_encoded_qo = bert_model(**query_input_qo)[0][:,0,:].squeeze(0)
query_encoded_qt = bert_model(**query_input_qt)[0][:,0,:].squeeze(0)
query_encoded_qd = bert_model(**query_input_qd)[0][:,0,:].squeeze(0)

score0 = query_encoded_qo.dot(query_encoded_qo)
print("Original score: ", float(score0))

score = query_encoded_qo.dot(query_encoded_qt)
print("Transcript score: ", float(score))

score = query_encoded_qo.dot(query_encoded_qd)
print("Distractor score: ", float(score))

Original score:  126.30570220947266
Transcript score:  98.63985443115234
Distractor score:  106.63993072509766


# ASR model

In [10]:
# load ASR model
import os
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset
import soundfile as sf

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# model.to('cuda')

# check vocabulary size
print(processor.tokenizer.vocab_size, 'tokens')
# show characters vocabulary
v = processor.tokenizer.get_vocab()
# print(v)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


32 tokens


In [16]:
# tokenize
print(queries_o[q_id])
query_input_qo = processor.tokenizer(queries_o[q_id].upper(), return_tensors="pt")
print(processor.tokenizer.convert_ids_to_tokens(query_input_qo["input_ids"][0]))

print(queries_t[q_id])
query_input_qt = processor.tokenizer(queries_t[q_id].upper(), return_tensors="pt")
print(processor.tokenizer.convert_ids_to_tokens(query_input_qt["input_ids"][0]))

# distractor_q = 'who is a musician born in las vegas'
print(distractor_q)
query_input_qd = processor.tokenizer(distractor_q.upper(), return_tensors="pt")
print(processor.tokenizer.convert_ids_to_tokens(query_input_qd["input_ids"][0]))

who is a musician born in detroit
['W', 'H', 'O', '|', 'I', 'S', '|', 'A', '|', 'M', 'U', 'S', 'I', 'C', 'I', 'A', 'N', '|', 'B', 'O', 'R', 'N', '|', 'I', 'N', '|', 'D', 'E', 'T', 'R', 'O', 'I', 'T']
who is a musician born indetroit
['W', 'H', 'O', '|', 'I', 'S', '|', 'A', '|', 'M', 'U', 'S', 'I', 'C', 'I', 'A', 'N', '|', 'B', 'O', 'R', 'N', '|', 'I', 'N', 'D', 'E', 'T', 'R', 'O', 'I', 'T']
who is a musician born in las vegas
['W', 'H', 'O', '|', 'I', 'S', '|', 'A', '|', 'M', 'U', 'S', 'I', 'C', 'I', 'A', 'N', '|', 'B', 'O', 'R', 'N', '|', 'I', 'N', '|', 'L', 'A', 'S', '|', 'V', 'E', 'G', 'A', 'S']


In [18]:
split = 'train'
q_id = 't1'

wav_path = "/ivi/ilps/personal/svakule/spoken_qa/gtts/annotated_wd_data_%s/wav/" % split
file = q_id + '.wav'

speech, samplerate = sf.read(wav_path+file)
input_values = processor(speech, return_tensors="pt", padding="longest").input_values
# input_values = input_values.to('cuda')
logits = model(input_values).logits
print(logits.shape)
logits

It is strongly recommended to pass the ``sampling_rate`` argument to this function.Failing to do so can result in silent errors that might be hard to debug.


torch.Size([1, 224, 32])


tensor([[[ 16.0426, -26.5667, -26.2056,  ...,  -6.9358,  -6.9005,  -8.1346],
         [ 16.0563, -26.6414, -26.2775,  ...,  -6.9886,  -6.9074,  -8.1332],
         [ 15.9869, -26.5113, -26.1531,  ...,  -6.8883,  -6.8722,  -8.0371],
         ...,
         [ 15.8354, -26.4650, -26.1130,  ...,  -6.6700,  -6.8191,  -7.8721],
         [ 15.8360, -26.6142, -26.2627,  ...,  -6.7499,  -6.9985,  -7.9503],
         [ 15.7416, -27.0900, -26.7318,  ...,  -7.1503,  -7.4961,  -8.5756]]],
       grad_fn=<AddBackward0>)

In [20]:
q = 'who is a musician born in las vegas'
e_label = 'las vegas'
r_label = 'born in'

query_input_q = processor.tokenizer(q.upper(), return_tensors="pt")
query_input_e = processor.tokenizer(e_label.upper(), return_tensors="pt")
query_input_r = processor.tokenizer(r_label.upper(), return_tensors="pt")

print(query_input_q)
print(query_input_e)
print(query_input_r)

{'input_ids': tensor([[18, 11,  8,  4, 10, 12,  4,  7,  4, 17, 16, 12, 10, 19, 10,  7,  9,  4,
         24,  8, 13,  9,  4, 10,  9,  4, 15,  7, 12,  4, 25,  5, 21,  7, 12]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[15,  7, 12,  4, 25,  5, 21,  7, 12]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[24,  8, 13,  9,  4, 10,  9]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


# New tokens

In [4]:
# load pre-trained model
from transformers import AutoTokenizer, AutoModel

# pre_trained_model_name = "distilbert-base-uncased"
pre_trained_model_name = "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"

tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name) 
bert_model = AutoModel.from_pretrained(pre_trained_model_name)

# add special tokens to tokenizer vocabulary and the model
special_tokens_dict = {'additional_special_tokens': ['[Q]','[E]','[R]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
bert_model.resize_token_embeddings(len(tokenizer))
print(bert_model.get_input_embeddings)

<bound method DistilBertModel.get_input_embeddings of DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30525, 768)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072

In [27]:
q = '[Q]who is a musician born in las vegas'
e_label = '[E]las vegas'
r_label = '[R]born in'
print(q)

query_input_q = tokenizer(q, return_tensors="pt")
query_input_e = tokenizer(e_label, return_tensors="pt")
query_input_r = tokenizer(r_label, return_tensors="pt")

print(tokenizer.convert_ids_to_tokens(query_input_q["input_ids"][0]))
print(tokenizer.convert_ids_to_tokens(query_input_e["input_ids"][0]))
print(tokenizer.convert_ids_to_tokens(query_input_r["input_ids"][0]))

[Q]who is a musician born in las vegas
['[CLS]', '[Q]', 'who', 'is', 'a', 'musician', 'born', 'in', 'las', 'vegas', '[SEP]']
['[CLS]', '[E]', 'las', 'vegas', '[SEP]']
['[CLS]', '[R]', 'born', 'in', '[SEP]']


# Character tokenizer

In [9]:
# script from https://huggingface.co/google/reformer-enwik8
import torch

q = 'who is a musician born in las vegas'
e_label = 'las vegas'
r_label = 'born in'

# Encoding
def encode(list_of_strings, pad_token_id=0):
    max_length = max([len(string) for string in list_of_strings])

    # create emtpy tensors
    attention_masks = torch.zeros((len(list_of_strings), max_length), dtype=torch.long)
    input_ids = torch.full((len(list_of_strings), max_length), pad_token_id, dtype=torch.long)

    for idx, string in enumerate(list_of_strings):
        # make sure string is in byte format
        if not isinstance(string, bytes):
            string = str.encode(string)

        input_ids[idx, :len(string)] = torch.tensor([x + 2 for x in string])
        attention_masks[idx, :len(string)] = 1

    return input_ids, attention_masks

encoded, attention_masks = encode([q, e_label, r_label])
print(encoded)

tensor([[121, 106, 113,  34, 107, 117,  34,  99,  34, 111, 119, 117, 107, 101,
         107,  99, 112,  34, 100, 113, 116, 112,  34, 107, 112,  34, 110,  99,
         117,  34, 120, 103, 105,  99, 117],
        [110,  99, 117,  34, 120, 103, 105,  99, 117,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0],
        [100, 113, 116, 112,  34, 107, 112,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0]])


In [None]:
# one-hot encode
import torch.nn.functional as F

F.one_hot(encoded, num_classes=6)
