# Transformer model

In [12]:
# load sample original and transcribed questions to compare
from utils import load_queries

split = 'train'
q_id = 't1'

queries_o = load_queries(split='train', queries_version='original')
queries_t = load_queries(split='train', queries_version='wav2vec2-base-960h')
print(queries_o[q_id])
print(queries_t[q_id])

who is a musician born in detroit
who is a musician born indetroit


In [7]:
# load pre-trained model
from transformers import AutoTokenizer, AutoModel

# pre_trained_model_name = "distilbert-base-uncased"
pre_trained_model_name = "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"

tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name) 
bert_model = AutoModel.from_pretrained(pre_trained_model_name)

# check vocabulary size
print(tokenizer.vocab_size, 'tokens')

30522 tokens


In [8]:
# show vocabulary
v = tokenizer.get_vocab()
print(v)



In [8]:
# tokenize
print(queries_o[q_id])
query_input_qo = tokenizer(queries_o[q_id], return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(query_input_qo["input_ids"][0]))

print(queries_t[q_id])
query_input_qt = tokenizer(queries_t[q_id], return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(query_input_qt["input_ids"][0]))

distractor_q = 'who is a musician born in las vegas'
print(distractor_q)
query_input_qd = tokenizer(distractor_q, return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(query_input_qd["input_ids"][0]))

who is a musician born in detroit
['[CLS]', 'who', 'is', 'a', 'musician', 'born', 'in', 'detroit', '[SEP]']
who is a musician born indetroit
['[CLS]', 'who', 'is', 'a', 'musician', 'born', 'ind', '##et', '##roi', '##t', '[SEP]']
who is a musician born in las vegas
['[CLS]', 'who', 'is', 'a', 'musician', 'born', 'in', 'las', 'vegas', '[SEP]']


In [9]:
query_encoded_qo = bert_model(**query_input_qo)[0][:,0,:].squeeze(0)
query_encoded_qt = bert_model(**query_input_qt)[0][:,0,:].squeeze(0)
query_encoded_qd = bert_model(**query_input_qd)[0][:,0,:].squeeze(0)

score0 = query_encoded_qo.dot(query_encoded_qo)
print("Original score: ", float(score0))

score = query_encoded_qo.dot(query_encoded_qt)
print("Transcript score: ", float(score))

score = query_encoded_qo.dot(query_encoded_qd)
print("Distractor score: ", float(score))

Original score:  126.30570220947266
Transcript score:  98.63985443115234
Distractor score:  106.63993072509766


# ASR model

In [1]:
# load ASR model
import os
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset
import soundfile as sf

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model.to('cuda')

# check vocabulary size
print(processor.tokenizer.vocab_size, 'tokens')

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


32 tokens


In [2]:
# show characters vocabulary
v = processor.tokenizer.get_vocab()
print(v)

{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, 'E': 5, 'T': 6, 'A': 7, 'O': 8, 'N': 9, 'I': 10, 'H': 11, 'S': 12, 'R': 13, 'D': 14, 'L': 15, 'U': 16, 'M': 17, 'W': 18, 'C': 19, 'F': 20, 'G': 21, 'Y': 22, 'P': 23, 'B': 24, 'V': 25, 'K': 26, "'": 27, 'X': 28, 'J': 29, 'Q': 30, 'Z': 31}


In [35]:
# tokenize
print(queries_o[q_id])
query_input_qo = processor.tokenizer(queries_o[q_id].upper(), return_tensors="pt")
print(processor.tokenizer.convert_ids_to_tokens(query_input_qo["input_ids"][0]))

print(queries_t[q_id])
query_input_qt = processor.tokenizer(queries_t[q_id].upper(), return_tensors="pt")
print(processor.tokenizer.convert_ids_to_tokens(query_input_qt["input_ids"][0]))

# distractor_q = 'who is a musician born in las vegas'
print(distractor_q)
query_input_qd = processor.tokenizer(distractor_q.upper(), return_tensors="pt")
print(processor.tokenizer.convert_ids_to_tokens(query_input_qd["input_ids"][0]))

who is a musician born in detroit
['W', 'H', 'O', '|', 'I', 'S', '|', 'A', '|', 'M', 'U', 'S', 'I', 'C', 'I', 'A', 'N', '|', 'B', 'O', 'R', 'N', '|', 'I', 'N', '|', 'D', 'E', 'T', 'R', 'O', 'I', 'T']
who is a musician born indetroit
['W', 'H', 'O', '|', 'I', 'S', '|', 'A', '|', 'M', 'U', 'S', 'I', 'C', 'I', 'A', 'N', '|', 'B', 'O', 'R', 'N', '|', 'I', 'N', 'D', 'E', 'T', 'R', 'O', 'I', 'T']
who is a musician born in las vegas
['W', 'H', 'O', '|', 'I', 'S', '|', 'A', '|', 'M', 'U', 'S', 'I', 'C', 'I', 'A', 'N', '|', 'B', 'O', 'R', 'N', '|', 'I', 'N', '|', 'L', 'A', 'S', '|', 'V', 'E', 'G', 'A', 'S']


In [6]:
split = 'train'
q_id = 't1'

wav_path = "/ivi/ilps/personal/svakule/spoken_qa/gtts/annotated_wd_data_%s/wav/" % split
file = q_id + '.wav'

speech, samplerate = sf.read(wav_path+file)
input_values = processor(speech, return_tensors="pt", padding="longest").input_values
input_values = input_values.to('cuda')
logits = model(input_values).logits
print(logits.shape)
logits

It is strongly recommended to pass the ``sampling_rate`` argument to this function.Failing to do so can result in silent errors that might be hard to debug.


torch.Size([1, 224, 32])


tensor([[[ 16.0428, -26.5670, -26.2058,  ...,  -6.9360,  -6.9008,  -8.1345],
         [ 16.0565, -26.6411, -26.2772,  ...,  -6.9885,  -6.9073,  -8.1329],
         [ 15.9872, -26.5123, -26.1541,  ...,  -6.8889,  -6.8728,  -8.0373],
         ...,
         [ 15.8355, -26.4661, -26.1141,  ...,  -6.6704,  -6.8197,  -7.8724],
         [ 15.8359, -26.6150, -26.2634,  ...,  -6.7503,  -6.9988,  -7.9506],
         [ 15.7413, -27.0888, -26.7306,  ...,  -7.1496,  -7.4953,  -8.5754]]],
       device='cuda:0', grad_fn=<AddBackward0>)