In [None]:
import tensorflow as tf

# List the GPUs TensorFlow can see and report how many there are.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [None]:
!pip install datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.4 kB[0m [31m18.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os

FILE_NAME = "LibriSeVoc-500"
ZIP_PATH = f"/content/drive/MyDrive/Datasets/{FILE_NAME}"

if not os.path.exists(f'/content/${FILE_NAME}'):
  !unzip {ZIP_PATH} -d /content/
else:
  print("Dataset already extracted.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/LibriSeVoc-500/wavernn/8312_279791_000038_000005_gen.wav  
  inflating: /content/__MACOSX/LibriSeVoc-500/wavernn/._8312_279791_000038_000005_gen.wav  
  inflating: /content/LibriSeVoc-500/wavernn/7178_34644_000018_000000_gen.wav  
  inflating: /content/__MACOSX/LibriSeVoc-500/wavernn/._7178_34644_000018_000000_gen.wav  
  inflating: /content/LibriSeVoc-500/wavernn/5678_43301_000006_000000_gen.wav  
  inflating: /content/__MACOSX/LibriSeVoc-500/wavernn/._5678_43301_000006_000000_gen.wav  
  inflating: /content/LibriSeVoc-500/wavernn/3440_171009_000032_000000_gen.wav  
  inflating: /content/__MACOSX/LibriSeVoc-500/wavernn/._3440_171009_000032_000000_gen.wav  
  inflating: /content/LibriSeVoc-500/wavernn/7402_59171_000005_000008_gen.wav  
  inflating: /content/__MACOSX/LibriSeVoc-500/wavernn/._7402_59171_000005_000008_gen.wav  
  inflating: /content/LibriSeVoc-500/wavernn/7178_34644_000096_000000_gen.wa

In [None]:
# prompt: move /content/LibriSeVoc-500/__MACOSX and /content/LibriSeVoc-500/LibriSeVoc-500 outside

import shutil

# Source/destination pairs for the two folders that may be nested inside the
# extracted directory.
source_path_macosx = "/content/LibriSeVoc-500/__MACOSX"
destination_path_macosx = "/content/__MACOSX"
source_path_libri = "/content/LibriSeVoc-500/LibriSeVoc-500"
destination_path_libri = "/content/LibriSeVoc-500_moved"

# Move each folder only when it is actually present; otherwise do nothing.
for src, dst in (
    (source_path_macosx, destination_path_macosx),
    (source_path_libri, destination_path_libri),
):
  if os.path.exists(src):
    shutil.move(src, dst)


In [None]:
# Step 3: Data Preparation and Loading (dataset extracted under /content)

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, Audio, ClassLabel # Import ClassLabel
from pathlib import Path
import tensorflow as tf # Import tensorflow for dataset check later

# --- 1. Define Paths and Labels ---
# Root folder of the extracted dataset; FILE_NAME is set in the extraction cell.
data_dir = Path(f'/content/{FILE_NAME}')

# Each immediate sub-directory is one class. Sorting the names makes the
# name -> integer id assignment deterministic.
class_names = sorted(entry.name for entry in data_dir.iterdir() if entry.is_dir())
num_labels = len(class_names)
label2id = dict(zip(class_names, range(num_labels)))
id2label = dict(enumerate(class_names))

print(f"Found {num_labels} classes: {class_names}")
print(f"Label mapping: {label2id}")

# --- 2. Scan for Audio Files and Assign Labels ---
# Builds one record per .wav file: its path, integer label, class name and a
# "base" identifier used later to keep related clips in the same split.
all_files = []
print("Scanning for audio files...")
for class_name in class_names:
    class_dir = data_dir / class_name
    # rglob searches recursively. Its iteration order depends on the
    # filesystem, so sort the paths: the seeded split further down is only
    # reproducible if the files are always seen in the same order.
    for file_path in sorted(class_dir.rglob('*.wav')):
        # The archive also ships macOS AppleDouble sidecar files named
        # "._<name>.wav"; they hold metadata, not audio, so skip them in case
        # they end up inside a class folder.
        if file_path.name.startswith('._'):
            continue

        # Group key = first three underscore-separated fields of the file
        # stem (assumed 'speaker_chapter_segment'; adjust if the naming differs).
        base_filename = '_'.join(file_path.stem.split('_')[:3])

        all_files.append({
            'file_path': str(file_path),
            'label_id': label2id[class_name],
            'class_name': class_name,
            'base_filename': base_filename
        })
print(f"Found {len(all_files)} audio files.")

# Fail early with a clear message instead of building empty datasets.
if not all_files:
    raise ValueError(f"No .wav files found in {data_dir}. Please check the path and dataset structure.")

all_files_df = pd.DataFrame(all_files)

# --- 3. Split Data (grouped by base filename) ---
# The split is made over the unique base filenames rather than over the rows,
# so every file sharing a base filename lands in the same split.
unique_base_filenames = all_files_df['base_filename'].unique()

# 70% of the base filenames go to train; the remaining 30% are halved into
# validation and test (15% / 15%).
train_base, test_val_base = train_test_split(unique_base_filenames, test_size=0.3, random_state=42)
val_base, test_base = train_test_split(test_val_base, test_size=0.5, random_state=42)

# Select the rows belonging to each group of base filenames.
base_column = all_files_df['base_filename']
train_df, val_df, test_df = (
    all_files_df[base_column.isin(bases)].copy()
    for bases in (train_base, val_base, test_base)
)

print(f"Train samples: {len(train_df)}, Validation samples: {len(val_df)}, Test samples: {len(test_df)}")
print(f"Example Train DF entry:\n{train_df.head()}")

# --- 4. Create Hugging Face DatasetDict ---
# One Dataset per split, built from the corresponding DataFrame without its index.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df, preserve_index=False)
    for split_df in (train_df, val_df, test_df)
)

# --- 5. Cast Audio Column and Set Class Labels ---
sampling_rate = 16000 # wav2vec 2.0 expects 16kHz
print(f"Casting audio column to {sampling_rate} Hz...")

# ClassLabel feature whose names are the discovered class folders.
class_label_feature = ClassLabel(names=class_names)

def cast_and_set_features(dataset):
    """Return `dataset` with an Audio column and a ClassLabel label column.

    Steps, in order:
      1. cast "file_path" to an Audio feature at `sampling_rate`;
      2. cast "label_id" to `class_label_feature`;
      3. rename "file_path" to "audio".
    """
    with_audio = dataset.cast_column("file_path", Audio(sampling_rate=sampling_rate))
    target_features = with_audio.features.copy()
    target_features['label_id'] = class_label_feature
    return with_audio.cast(target_features).rename_column("file_path", "audio")

train_dataset = cast_and_set_features(train_dataset)
val_dataset = cast_and_set_features(val_dataset)
test_dataset = cast_and_set_features(test_dataset)

librisevoc_dataset_dict = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

print("DatasetDict created successfully:")
print(librisevoc_dataset_dict)
print("\nExample Train entry features:")
print(librisevoc_dataset_dict['train'][0])

Found 7 classes: ['diffwave', 'gt', 'melgan', 'parallel_wave_gan', 'wavegrad', 'wavenet', 'wavernn']
Label mapping: {'diffwave': 0, 'gt': 1, 'melgan': 2, 'parallel_wave_gan': 3, 'wavegrad': 4, 'wavenet': 5, 'wavernn': 6}
Scanning for audio files...
Found 3500 audio files.
Train samples: 2458, Validation samples: 517, Test samples: 525
Example Train DF entry:
                                           file_path  label_id class_name  \
0  /content/LibriSeVoc-500/diffwave/460_172359_00...         0   diffwave   
1  /content/LibriSeVoc-500/diffwave/254_27760_000...         0   diffwave   
2  /content/LibriSeVoc-500/diffwave/5393_19219_00...         0   diffwave   
3  /content/LibriSeVoc-500/diffwave/7190_90543_00...         0   diffwave   
4  /content/LibriSeVoc-500/diffwave/5322_7678_000...         0   diffwave   

       base_filename  
0  460_172359_000020  
1   254_27760_000016  
2  5393_19219_000047  
3  7190_90543_000040  
4   5322_7678_000007  
Casting audio column to 16000 Hz...


Casting the dataset:   0%|          | 0/2458 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/517 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/525 [00:00<?, ? examples/s]

DatasetDict created successfully:
DatasetDict({
    train: Dataset({
        features: ['audio', 'label_id', 'class_name', 'base_filename'],
        num_rows: 2458
    })
    validation: Dataset({
        features: ['audio', 'label_id', 'class_name', 'base_filename'],
        num_rows: 517
    })
    test: Dataset({
        features: ['audio', 'label_id', 'class_name', 'base_filename'],
        num_rows: 525
    })
})

Example Train entry features:
{'audio': {'path': '/content/LibriSeVoc-500/diffwave/460_172359_000020_000000_gen.wav', 'array': array([ 0.00623519,  0.00286017,  0.00108559, ..., -0.00044471,
        0.00170136,  0.        ]), 'sampling_rate': 16000}, 'label_id': 0, 'class_name': 'diffwave', 'base_filename': '460_172359_000020'}


In [None]:
from transformers import AutoFeatureExtractor, TFAutoModelForAudioClassification
import tensorflow as tf

# Take the number of classes from the label mapping built during data
# preparation instead of hard-coding it, so the classifier head always matches
# the dataset (7 for this dataset: 1 real + 6 vocoders).
num_labels = len(label2id)
model_checkpoint = "facebook/wav2vec2-base" # Or other variants like large

feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

# Passing label2id / id2label stores the class names in the model config, so
# the saved checkpoint carries readable labels.
model_kwargs = dict(num_labels=num_labels, label2id=label2id, id2label=id2label)

# Try native TF weights first; fall back to converting the PyTorch weights
# when the checkpoint does not provide TF ones.
try:
    model = TFAutoModelForAudioClassification.from_pretrained(
        model_checkpoint,
        **model_kwargs
    )
except OSError: # Handle case where only PyTorch weights are available
    print("No TF weights found, loading from PyTorch weights...")
    model = TFAutoModelForAudioClassification.from_pretrained(
        model_checkpoint,
        from_pt=True,
        **model_kwargs
    )

# Verify model summary (optional)
model.summary()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



No TF weights found, loading from PyTorch weights...


pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]


TFWav2Vec2ForSequenceClassification has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tune this model, you need a GPU or a TPU
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2ForSequenceClassification: ['project_q.weight', 'project_hid.weight', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.bias', 'project_hid.bias', 'quantizer.codevectors']
- This IS expected if you are initializing TFWav2Vec2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFWav2Ve

Model: "tf_wav2_vec2_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 wav2vec2 (TFWav2Vec2MainLa  multiple                  94371712  
 yer)                                                            
                                                                 
 projector (Dense)           multiple                  196864    
                                                                 
 classifier (Dense)          multiple                  1799      
                                                                 
Total params: 94570375 (360.76 MB)
Trainable params: 94570375 (360.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
def preprocess_function(examples):
    """Turn a batch of decoded audio into fixed-length model inputs.

    Args:
        examples: batch mapping with an "audio" entry, a list of dicts that
            each hold the waveform under "array".

    Returns:
        dict with "input_values" and "attention_mask" as produced by the
        module-level `feature_extractor`, with every clip truncated or padded
        to 4 seconds at the extractor's sampling rate.
    """
    clip_seconds = 4.0
    target_rate = feature_extractor.sampling_rate
    waveforms = [entry["array"] for entry in examples["audio"]]

    encoded = feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=int(target_rate * clip_seconds),
        truncation=True,
        padding='max_length',
        return_attention_mask=True,  # the mask is returned alongside the inputs
    )

    return dict(
        input_values=encoded.input_values,
        attention_mask=encoded.attention_mask,
    )

In [None]:
# Run preprocess_function over every split in batches. The "audio" column is
# dropped from the result; the other columns (including "label_id") are kept.
encoded_dataset = librisevoc_dataset_dict.map(
    preprocess_function,
    remove_columns=["audio"], # Remove original audio column
    batched=True
)
# The label column is left as "label_id"; later cells refer to it by that name.
# Uncomment the rename below only if a downstream step needs the column to be
# called "labels".
# encoded_dataset = encoded_dataset.rename_column("label_id", "labels")

Map:   0%|          | 0/2458 [00:00<?, ? examples/s]

Map:   0%|          | 0/517 [00:00<?, ? examples/s]

Map:   0%|          | 0/525 [00:00<?, ? examples/s]

In [None]:
from transformers import DefaultDataCollator

# Collator that returns TensorFlow tensors.
data_collator = DefaultDataCollator(return_tensors="tf")
batch_size = 16 # Adjust based on GPU memory

# Columns fed to the model as inputs (the keys returned by preprocess_function).
model_input_columns = ['input_values', 'attention_mask']


def _split_to_tf_dataset(split_name, shuffle):
    """Convert one split of `encoded_dataset` into a batched tf.data.Dataset.

    Uses `model_input_columns` as inputs and "label_id" as the label column.
    """
    return encoded_dataset[split_name].to_tf_dataset(
        columns=model_input_columns,
        label_cols=["label_id"],
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=data_collator,
    )


# Only the training split is shuffled; validation and test keep their order.
tf_train_dataset = _split_to_tf_dataset("train", shuffle=True)
tf_eval_dataset = _split_to_tf_dataset("validation", shuffle=False)
tf_test_dataset = _split_to_tf_dataset("test", shuffle=False)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [None]:
# Remove the previous AdamW imports

from transformers import create_optimizer # Import the HF utility
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf # Make sure tensorflow is imported

# --- Optimizer hyper-parameters ---
num_epochs = 3
# Must equal the batch size used to build the tf.data datasets: it is only
# used here to work out the total number of training steps.
batch_size = 16
learning_rate = 3e-5

# create_optimizer needs the total number of training steps up front, so read
# the size of the mapped Hugging Face training split.
try:
    num_train_examples = len(encoded_dataset["train"])
except NameError:
     raise NameError("Please ensure 'encoded_dataset' (the mapped Hugging Face dataset)"
                     " and 'batch_size' are defined before this cell.")
except KeyError:
     raise KeyError("Please ensure 'encoded_dataset' has a 'train' split.")

# Whole batches per epoch times epochs; warm-up is 10% of the total.
num_train_steps = (num_train_examples // batch_size) * num_epochs
num_warmup_steps = int(num_train_steps * 0.1)
print(f"Calculated training steps: {num_train_steps}, Warmup steps: {num_warmup_steps}")


# Build the optimizer together with its learning-rate schedule.
# (weight_decay_rate can also be passed here if needed, e.g. weight_decay_rate=0.01)
optimizer, schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
)

# The model outputs logits and the labels are integer ids, hence
# SparseCategoricalCrossentropy(from_logits=True).
print("Compiling model with optimizer from create_optimizer...")
model.compile(
    optimizer=optimizer,
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
print("Model compiled successfully.")

# --- Training happens in a later cell ---
# history = model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs)

Calculated training steps: 459, Warmup steps: 45
Compiling model with optimizer from create_optimizer...
Model compiled successfully.


In [None]:
# Interactive Hugging Face Hub login (renders a widget that asks for a token).
# The training cell further down pushes to the Hub, so log in before running it.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Fine-tune the model, saving to a local directory and pushing to the Hub
# through PushToHubCallback.
from transformers.keras_callbacks import PushToHubCallback
num_epochs = 3
batch_size = 16 # NOTE: not used in this cell; the tf.data datasets were already batched when they were built
learning_rate = 3e-5
# Define output directory for checkpoints
checkpoint_output_dir = "wav2vec2_librisevoc_checkpoints" # Base directory
# NOTE(review): keras_checkpoint_filepath is not referenced again in this cell.
keras_checkpoint_filepath = os.path.join(checkpoint_output_dir, "best_model_keras.keras")
hf_callback_checkpoint_dir = os.path.join(checkpoint_output_dir, "best_model_hf_callback")

print("\nAttempting fallback training using PushToHubCallback...")
# Hub repository the callback pushes to.
# CHANGE THIS to your own HF username and model name before running.
hub_model_id = "ronanhansel/deepfake-audio-detector" # CHANGE THIS

push_to_hub_callback = PushToHubCallback(
    output_dir=hf_callback_checkpoint_dir, # Local directory used by the callback
    tokenizer=feature_extractor, # The feature extractor is passed through the `tokenizer` argument
    hub_model_id=hub_model_id,
    save_strategy="epoch", # Save checkpoints every epoch
)
# NOTE(review): no metric is monitored here and saving happens every epoch, so
# "best model" in the names/messages below presumably means the most recently
# saved epoch - confirm.
print(f"PushToHubCallback configured to save best model locally in: {push_to_hub_callback.output_dir}")

# Re-create the optimizer and re-compile so training starts from a fresh
# optimizer/schedule state. num_train_steps and num_warmup_steps come from the
# optimizer cell.
optimizer, schedule = create_optimizer(
      init_lr=learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps
)
model.compile(
    optimizer=optimizer,
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

history = model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
    callbacks=[push_to_hub_callback] # Use the HF callback
)
print("\nTraining finished using PushToHubCallback.")
# Both names are read by the evaluation cell.
successful_callback = "PushToHubCallback"
checkpoint_filepath_used = hf_callback_checkpoint_dir # Directory the callback saved the model into


Attempting fallback training using PushToHubCallback...


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/wav2vec2_librisevoc_checkpoints/best_model_hf_callback is already a clone of https://huggingface.co/ronanhansel/deepfake-audio-detector. Make sure you pull the latest changes with `repo.git_pull()`.


PushToHubCallback configured to save best model locally in: wav2vec2_librisevoc_checkpoints/best_model_hf_callback
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training finished using PushToHubCallback.


In [None]:
print("\n--- Evaluating Model ---")
# Reload the model from the directory written during training and evaluate it
# on the held-out test split.
# Reset first so a failed load cannot leave a model from an earlier run behind.
best_model = None

print(f"\nAttempting to load and evaluate BEST model from checkpoint ({successful_callback}): {checkpoint_filepath_used}...")
print(f"(Loading from directory: {checkpoint_filepath_used})")

# from_pretrained raises when the directory cannot be loaded; it never returns
# a falsy value. Report the failure where it happens and re-raise, so the
# message is reachable and later cells do not silently run without a model.
try:
    best_model = TFAutoModelForAudioClassification.from_pretrained(checkpoint_filepath_used)
except Exception:
    print("Failed to load best model for evaluation.")
    raise

# Compile with loss and metric only: evaluate() needs these.
best_model.compile(loss=SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])

print(f"Successfully loaded best model.")
best_loss, best_accuracy = best_model.evaluate(tf_test_dataset)
print(f"--- Best Model Test Loss: {best_loss:.4f}")
print(f"--- Best Model Test Accuracy: {best_accuracy:.4f}")




--- Evaluating Model ---

Evaluating final model state...



TFWav2Vec2ForSequenceClassification has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tune this model, you need a GPU or a TPU


Final Model Test Loss: 0.0511
Final Model Test Accuracy: 0.9867

Attempting to load and evaluate BEST model from checkpoint (PushToHubCallback): wav2vec2_librisevoc_checkpoints/best_model_hf_callback...
(Loading from directory: wav2vec2_librisevoc_checkpoints/best_model_hf_callback)


All model checkpoint layers were used when initializing TFWav2Vec2ForSequenceClassification.

All the layers of TFWav2Vec2ForSequenceClassification were initialized from the model checkpoint at wav2vec2_librisevoc_checkpoints/best_model_hf_callback.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFWav2Vec2ForSequenceClassification for predictions without further training.


Successfully loaded best model.
--- Best Model Test Loss: 0.0511
--- Best Model Test Accuracy: 0.9867


In [None]:
# Upload the callback's local output directory to the model repo on the Hub.
# NOTE(review): the saved output of this cell reports "No files have been
# modified since last commit", which suggests the folder had already been
# pushed - confirm whether this extra upload is still needed.
from huggingface_hub import HfApi
api = HfApi()

api.upload_large_folder(
    repo_id=hub_model_id,
    repo_type="model",
    folder_path=push_to_hub_callback.output_dir,
)

Recovering from metadata files:   0%|          | 0/5 [00:00<?, ?it/s]




---------- 2025-05-06 02:40:31 (0:00:00) ----------
Files:   hashed 0/5 (0.0/378.6M) | pre-uploaded: 0/0 (0.0/378.6M) (+5 unsure) | committed: 0/5 (0.0/378.6M) | ignored: 0
Workers: hashing: 2 | get upload mode: 0 | pre-uploading: 0 | committing: 0 | waiting: 0
---------------------------------------------------


No files have been modified since last commit. Skipping to prevent empty commit.


# Inference

In [None]:
import librosa
import numpy as np
import tensorflow as tf # Import TensorFlow

# Make sure id2label is defined from your training setup
# Example: id2label = {0: 'diffwave', 1: 'gt', ...}

def classify_audio(model, file_path, feature_extractor, id2label, max_duration_s=4.0):
    """Classify one audio file with a TF Wav2Vec2 sequence-classification model.

    Args:
        model: callable model taking `input_values` and `attention_mask` and
            returning an object with a `.logits` attribute.
        file_path: path of the audio file to classify.
        feature_extractor: extractor used at training time. Its
            `sampling_rate` is used both to load/resample the file and to
            build the inputs, so the two can never disagree.
        id2label: mapping from integer class id to class name.
        max_duration_s: clip length in seconds; the audio is truncated or
            padded to this duration. Defaults to 4.0.

    Returns:
        The predicted class name, "Unknown" if the predicted id is missing
        from `id2label`, or None if anything failed (the error and traceback
        are printed).
    """
    try:
        target_sr = int(feature_extractor.sampling_rate)
        max_length = int(target_sr * max_duration_s)

        # 1. Load the file, resampled to the extractor's sampling rate.
        audio, _ = librosa.load(file_path, sr=target_sr)

        # 2. Build fixed-length inputs (same truncation/padding as training).
        inputs = feature_extractor(
            audio,
            sampling_rate=target_sr,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="np",
        )

        # 3. Force a (1, length) batch shape whatever leading dimensions the
        #    extractor returned, then convert to TF tensors.
        input_values = tf.constant(
            np.asarray(inputs["input_values"]).reshape(1, -1), dtype=tf.float32
        )
        attention_mask = tf.constant(
            np.asarray(inputs["attention_mask"]).reshape(1, -1), dtype=tf.int32
        )

        # 4. Forward pass.
        logits = model(input_values, attention_mask=attention_mask).logits

        # 5. Highest-scoring class for the single item in the batch.
        predicted_class_id = int(tf.argmax(logits, axis=-1)[0].numpy())
        return id2label.get(predicted_class_id, "Unknown")

    except Exception as e:
        # Best effort by design: report the problem and return None so the
        # caller can carry on with the remaining files.
        import traceback
        print(f"Error classifying {file_path}: {e}")
        traceback.print_exc()
        return None


test_dir = "./test/"
# Create the folder on first use; it then starts out empty, so copy some
# .wav files into it before expecting any predictions.
if not os.path.exists(test_dir):
    os.makedirs(test_dir)
    print(f"Created test directory: {test_dir}")

print(f"\nClassifying files in {test_dir}...")
files_processed = 0
# Only .wav files (case-insensitive) are classified.
wav_names = [name for name in os.listdir(test_dir) if name.lower().endswith(".wav")]
for filename in wav_names:
    file_path = os.path.join(test_dir, filename)
    print(f"\nProcessing: {filename}")
    predicted_label = classify_audio(best_model, file_path, feature_extractor, id2label)
    if not predicted_label:
        print(f"Failed to classify {filename}.")
        continue
    print(f"File: {filename}, Predicted Class: {predicted_label}")
    files_processed += 1

if files_processed == 0:
    print(f"\nWarning: No .wav files were found or processed in {test_dir}.")


Classifying files in ./test/...

Processing: wavenet.wav
Shape passed to model - input_values: (1, 64000), attention_mask: (1, 64000)
File: wavenet.wav, Predicted Class: wavenet

Processing: diffwave.wav
Shape passed to model - input_values: (1, 64000), attention_mask: (1, 64000)
File: diffwave.wav, Predicted Class: diffwave

Processing: melgan.wav
Shape passed to model - input_values: (1, 64000), attention_mask: (1, 64000)
File: melgan.wav, Predicted Class: melgan

Processing: parallel_wave_gan.wav
Shape passed to model - input_values: (1, 64000), attention_mask: (1, 64000)
File: parallel_wave_gan.wav, Predicted Class: parallel_wave_gan

Processing: wavegrad.wav
Shape passed to model - input_values: (1, 64000), attention_mask: (1, 64000)
File: wavegrad.wav, Predicted Class: wavegrad

Processing: gt.wav
Shape passed to model - input_values: (1, 64000), attention_mask: (1, 64000)
File: gt.wav, Predicted Class: gt

Processing: wavernn.wav
Shape passed to model - input_values: (1, 64000)

In [None]:
# Copy a sample clip into the test/ folder that the inference cell scans.
# Run this before the inference cell so the file is picked up.
!cp wavegrad.wav test/wavegrad.wav