In [12]:
import torch
import torchaudio
from torch.utils.data import DataLoader

from transformers import (
    Speech2TextProcessor, 
    Speech2TextTokenizer, 
    Speech2TextForConditionalGeneration
)

In [2]:
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
tokenizer = Speech2TextTokenizer.from_pretrained("facebook/s2t-small-librispeech-asr")
model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")

In [3]:
def preprocessing(sample: Tuple[torch.Tensor, int, str, int, int, int]) -> Dict[str, torch.Tensor]:
    batch = dict()
    batch["input_values"] = processor(
        sample[0][0], sampling_rate=16_000, return_tensors="pt"
    ).input_features[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(sample[2], return_tensors="pt").input_ids
    
    return batch

In [4]:
dataset = torchaudio.datasets.LIBRISPEECH(root="", url='dev-clean')
dataset = dataset.map(preprocessing)

In [28]:
dataset[0]

{'input_values': tensor([[-1.7439, -1.5824, -1.3852,  ..., -1.4764, -1.3188, -1.5373],
         [-1.7372, -1.5597, -1.5623,  ..., -1.3479, -1.4273, -1.6317],
         [-1.4334, -1.3532, -1.2806,  ..., -1.5108, -1.3358, -1.4236],
         ...,
         [-1.6064, -1.5294, -1.5514,  ..., -1.0002, -1.0575, -1.0322],
         [-1.7444, -1.6128, -1.4223,  ..., -1.0696, -0.9122, -1.0744],
         [-1.3830, -1.3087, -1.3786,  ..., -1.0823, -1.0664, -1.1232]]),
 'labels': tensor([[ 129, 8053,   66,   30,    4, 5878,    8,    4, 1080, 3353,    5,    6,
            52,   60,  534,    9, 1524,   20, 5517,    2]])}

In [29]:
class DataCollator(object):
    def __init__(self, processor: Speech2TextProcessor, padding=True, 
                 max_length=None, max_length_labels=None, 
                 pad_to_multiple_of=None, pad_to_multiple_of_labels=None):
        self.processor = processor
        self.padding = padding
        self.max_length = max_length
        self.max_length_labels = max_length_labels
        self.pad_to_multiple_of = pad_to_multiple_of
        self.pad_to_multiple_of_labels = pad_to_multiple_of_labels
    
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        
        batch = self.processor.pad(
            input_features, 
            padding=self.padding, 
            max_length=self.max_length, 
            pad_to_multiple_of=self.pad_to_multiple_of, 
            return_tensors="pt"
        )
        
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features, 
                padding=self.padding, 
                max_length=self.max_length, 
                pad_to_multiple_of=self.pad_to_multiple_of_labels, 
                return_tensors="pt"
            )
        
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        
        batch["labels"] = labels
        
        return batch

In [30]:
data_collator = DataCollator(processor, padding=True)

In [33]:
samples = [dataset[0], dataset[1]]

In [34]:
data_collator(samples)

AttributeError: 'Speech2TextProcessor' object has no attribute 'pad'