In [1]:
from transformers import SpeechEncoderDecoderModel, Wav2Vec2Processor

# Checkpoint identifiers for the two halves of the speech encoder-decoder model.
encoder_id = "facebook/wav2vec2-base-960h"  # speech (acoustic) encoder checkpoint
decoder_id = "bert-base-uncased"  # text decoder checkpoint

# Build the combined model from the two pretrained checkpoints.
model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=encoder_id,
    decoder_pretrained_model_name_or_path=decoder_id,
)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls

In [2]:
from transformers import AutoFeatureExtractor, AutoTokenizer

# Load the audio feature extractor from the encoder checkpoint and the text
# tokenizer from the decoder checkpoint.
# return_attention_mask=True overrides the checkpoint's default so that calls to
# the feature extractor also return an attention mask.
# NOTE(review): confirm that an attention mask is wanted for this encoder
# checkpoint; its feat_extract_norm is inspected in a later cell.
feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id, return_attention_mask=True)
tokenizer = AutoTokenizer.from_pretrained(decoder_id)

In [3]:
# Display the id of the tokenizer's [CLS] token (used below as the decoder start token).
tokenizer.cls_token_id

101

In [4]:
# Display the id of the tokenizer's padding token.
tokenizer.pad_token_id

0

In [5]:
# Copy the tokenizer's special-token ids onto the model config:
#   padding      -> [PAD]
#   end of text  -> [SEP]
#   decoder start-> [CLS]
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.decoder_start_token_id = tokenizer.cls_token_id

In [6]:
# Sanity check: the decoder start token id now equals the [CLS] id.
model.config.decoder_start_token_id

101

In [7]:
from datasets import load_dataset, Dataset, Audio

In [8]:
import pandas as pd
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm.notebook import tqdm
from sklearn.utils import shuffle


# Metadata CSVs listing the validation and test audio clips.
valid_metadata = "/data/tone_speech_cutwav/kaldi_cutwavs_new_2020/valid/metadata.csv"
test_metadata = "/data/tone_speech_cutwav/kaldi_cutwavs_new_2020/test/metadata.csv"

# Single seed shared by sampling and shuffling so the subsets are reproducible.
seed = 42


def _sample_subset(df, n, random_state):
    """Return up to ``n`` rows of ``df``, sampled without replacement and shuffled.

    Args:
        df: Source ``pandas.DataFrame``.
        n: Number of rows requested.
        random_state: Seed passed to both ``DataFrame.sample`` and ``shuffle``.

    Returns:
        A new DataFrame with ``min(n, len(df))`` rows. The original index labels
        are kept (not reset).

    Notes:
        ``DataFrame.sample(replace=False)`` raises ``ValueError`` when ``n``
        exceeds the number of rows, so the request is clamped and a warning is
        emitted instead of failing on a small metadata file.
    """
    n_rows = min(n, len(df))
    if n_rows < n:
        import warnings  # local import keeps this cell self-contained

        warnings.warn(
            f"Requested {n} rows but only {len(df)} are available; using all of them.",
            stacklevel=2,
        )
    subset = df.sample(n=n_rows, replace=False, random_state=random_state)
    # Second, seeded permutation kept so row order matches the original cell.
    return shuffle(subset, random_state=random_state)


valid_df = pd.read_csv(valid_metadata)
valid_mini_df = _sample_subset(valid_df, 10000, seed)
# Make sure every label is present.
# assert valid_mini_df.value_counts("label").shape[0] == 5

test_df = pd.read_csv(test_metadata)
test_mini_df = _sample_subset(test_df, 1000, seed)
# Make sure every label is present.
# assert test_mini_df.value_counts("label").shape[0] == 5

# Convert the sampled frames to Hugging Face datasets. The pandas index is left
# in place here; the following cell drops the resulting index column.
valid_dataset = Dataset.from_pandas(valid_mini_df)
test_dataset = Dataset.from_pandas(test_mini_df)

In [9]:
# Drop the "__index_level_0__" column from both datasets (presumably the pandas
# index carried over by Dataset.from_pandas — verify against the datasets docs).
# The membership check makes this cell safe to re-run: remove_columns raises if
# the column has already been removed.
_INDEX_COLUMN = "__index_level_0__"
if _INDEX_COLUMN in valid_dataset.column_names:
    valid_dataset = valid_dataset.remove_columns(_INDEX_COLUMN)
if _INDEX_COLUMN in test_dataset.column_names:
    test_dataset = test_dataset.remove_columns(_INDEX_COLUMN)

In [10]:
# Check how the tokenizer splits a syllable containing "ü". The recorded output
# is ['lu', '##e'], i.e. the diaeresis is not preserved by this tokenizer.
tokenizer.tokenize("lüe")

['lu', '##e']

In [11]:
# Cast the "path" column to an Audio feature at 16 kHz, so that indexing a row
# yields a dict exposing "array" and "path" (as used by the cells that follow).
# NOTE(review): only valid_dataset is cast in the visible cells; test_dataset's
# "path" column is left unchanged — confirm whether that is intended.
valid_dataset = valid_dataset.cast_column("path", Audio(sampling_rate=16000))

In [12]:
# Shape of the first clip's sample array (recorded output: (4000,)).
valid_dataset[0]["path"]["array"].shape

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

(4000,)

In [13]:
# Run the feature extractor on the first clip and return PyTorch tensors.
# Despite its name, `input_values` holds the whole extractor output: the recorded
# result contains both an 'input_values' and an 'attention_mask' entry.
input_values = feature_extractor(valid_dataset[0]["path"]["array"], return_tensors="pt", sampling_rate=16000)
input_values

{'input_values': tensor([[0.0020, 0.0057, 0.0109,  ..., 0.2269, 0.2274, 0.2221]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}

In [14]:
# Take the text between the "__pin-" marker and the next "__" in the clip's file
# path (presumably the pinyin transcription encoded in the file name — verify
# against the dataset's naming scheme).
pinyin = valid_dataset[0]["path"]["path"].split("__pin-")[-1].split("__")[0]

In [15]:
# Tokenize the transcription into a tensor of token ids to use as labels.
# The recorded output is [[101, 12912, 102]], i.e. the ids include the leading
# [CLS] (101) and trailing [SEP] (102).
# NOTE(review): 101 is also set as decoder_start_token_id; if the model prepends
# the start token when deriving decoder inputs from labels, [CLS] would appear
# twice — confirm whether the leading id should be stripped before training.
labels = tokenizer(pinyin, return_tensors="pt").input_ids
labels

tensor([[  101, 12912,   102]])

In [16]:
# Map the label ids of the single example back to token strings for inspection.
tokenizer.convert_ids_to_tokens(labels.tolist()[0])

['[CLS]', 'wo', '[SEP]']

In [7]:
# Inspect the encoder's feature-extractor normalisation mode.
# feat_extract_norm == 'layer' needs an attention mask.
# NOTE(review): the recorded value is 'group', while the feature extractor in
# this notebook was created with return_attention_mask=True — confirm that
# passing an attention mask is intended for this checkpoint.
model.encoder.config.feat_extract_norm

'group'

In [8]:
# Re-check the decoder start token id on the model config.
model.config.decoder_start_token_id 

101

In [9]:
# Show the three special-token ids stored on the model config as one tuple:
# (decoder start, padding, end of sequence).
(
    model.config.decoder_start_token_id,
    model.config.pad_token_id,
    model.config.eos_token_id,
)

(101, 0, 102)

In [10]:
# Bundle the audio feature extractor and the text tokenizer into one processor
# object, then display it.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [11]:
# Name of the feature extractor's first model input (recorded value: 'input_values').
model_input_name = feature_extractor.model_input_names[0]
model_input_name

'input_values'

In [12]:
# Check how a literal "[SEP]" embedded in the text is tokenized. The recorded
# output is ['xi', '##ng', '[SEP]', '0'], so "[SEP]" is kept as a single token.
tokenizer.tokenize("xing[SEP]0")

['xi', '##ng', '[SEP]', '0']

In [13]:
# Set apply_spec_augment to False on the encoder config (presumably disables
# SpecAugment-style masking of encoder inputs — confirm in the Wav2Vec2 config docs).
model.encoder.config.apply_spec_augment = False


In [14]:
# Confirm the flag set in the previous cell.
model.encoder.config.apply_spec_augment

False

In [39]:
# Save the combined model to the local directory "wav2vec2-base-bert-uncased".
model.save_pretrained("wav2vec2-base-bert-uncased")

In [40]:
# Save the processor into the same directory as the model.
processor.save_pretrained("wav2vec2-base-bert-uncased")

In [42]:
# Display the tokenizer held by the processor.
processor.tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})