huggingface · renet10 · Jun 13, 2025 · Jun 17, 2025 · Jun 17, 2025 · Jun 17, 2025
diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py
@@ -81,10 +81,20 @@ def __call__(
         **kwargs: Unpack[Wav2Vec2ProcessorKwargs],
     ):
         """
-        This method forwards all its arguments to Wav2Vec2FeatureExtractor's
-        [`~Wav2Vec2FeatureExtractor.__call__`] and returns its output.
-        """
+        This method forwards all arguments to [`Wav2Vec2FeatureExtractor.__call__`] and/or
+        [`PreTrainedTokenizer.__call__`] depending on the input modality and returns their outputs. If both modalities are passed, [`Wav2Vec2FeatureExtractor.__call__`] and [`PreTrainedTokenizer.__call__`] are called.
+
+        Args:
+            audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
+                An audio input is passed to [`Wav2Vec2FeatureExtractor.__call__`].
+            text (`str`, `List[str]`, *optional*):
+                A text input is passed to [`PreTrainedTokenizer.__call__`].
+
+        See [`~Wav2Vec2FeatureExtractor.__call__`] and PreTrainedTokenizer’s [`~PreTrainedTokenizer.__call__`] for more information.
-        [`~Wav2Vec2FeatureExtractor.__call__`] and PreTrainedTokenizer’s [`~PreTrainedTokenizer.__call__`] for more information.
-        [`~Wav2Vec2FeatureExtractor.__call__`] and PreTrainedTokenizer’s [`~PreTrainedTokenizer.__call__`] for more information.
 
+        Returns:
+            This method returns the results of each `__call__` method. If both are used, the output is a dictionary containing the results of both.
+        """
         if "raw_speech" in kwargs:
             warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
             audio = kwargs.pop("raw_speech")
@@ -121,8 +131,17 @@ def __call__(
 
     def pad(self, *args, **kwargs):
         """
-        This method forwards all its arguments to Wav2Vec2FeatureExtractor's
-        [`~Wav2Vec2FeatureExtractor.pad`] and returns its output.
+        This method operates on batches of extracted features and/or tokenized text. It forwards all arguments to
+        [`Wav2Vec2FeatureExtractor.pad`] and/or [`PreTrainedTokenizer.pad`] depending on the input modality and returns their outputs. If both modalities are passed, [`Wav2Vec2FeatureExtractor.pad`] and [`PreTrainedTokenizer.pad`] are called.
+
+        Args:
+            input_features:
+                When the first argument is a dictionary containing a batch of tensors, or the `input_features` argument is present, it is passed to [`Wav2Vec2FeatureExtractor.pad`].
+            labels:
+                When the `labels` argument is present, it is passed to [`PreTrainedTokenizer.pad`].
+
+        Returns:
+            This method returns the results of each `pad` method. If both are used, the output is a dictionary containing the results of both.
         """
         # For backward compatibility
         if self._in_target_context_manager: