In [1]:
import torchaudio
import torch

def preprocess_audio(audio_path, sample_rate=16000, max_length=60.0):
    # Đọc tệp âm thanh
    waveform, sr = torchaudio.load(audio_path)
    
    # Resample nếu cần
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)
    
    # Chuyển thành mono nếu cần
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    
    # Cắt hoặc đệm để có độ dài cố định
    max_samples = int(max_length * sample_rate)
    if waveform.shape[1] > max_samples:
        waveform = waveform[:, :max_samples]
    else:
        padding = max_samples - waveform.shape[1]
        waveform = torch.nn.functional.pad(waveform, (0, padding))
    
    return waveform

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Đọc dữ liệu
data = pd.read_csv("audio_only_classification_dataset.csv")
data['label'] = data['label'].apply(lambda x: [label.strip() for label in x.split(',')])
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['label'])
label_classes = mlb.classes_
print(f"labels shape: {labels.shape}")  # Phải là [num_samples, num_labels]

audio_paths = data['audio_id'].values
train_audio_paths, val_audio_paths, train_labels, val_labels = train_test_split(
    audio_paths, labels, test_size=0.2, random_state=42
)
print(f"train_labels shape: {train_labels.shape}")
print(f"val_labels shape: {val_labels.shape}")
print(f"len(train_audio_paths): {len(train_audio_paths)}")
print(f"len(val_audio_paths): {len(val_audio_paths)}")

labels shape: (8278, 18)
train_labels shape: (6622, 18)
val_labels shape: (1656, 18)
len(train_audio_paths): 6622
len(val_audio_paths): 1656


In [3]:
from torch.utils.data import Dataset

class AudioMultilabelDataset(Dataset):
    def __init__(self, audio_paths, labels, transform=None, sample_rate=16000, max_length=5.0):
        self.audio_paths = audio_paths
        self.labels = labels
        self.transform = transform
        self.sample_rate = sample_rate
        self.max_length = max_length
        assert len(audio_paths) == labels.shape[0], f"Số mẫu không khớp: {len(audio_paths)} vs {labels.shape[0]}"
        assert labels.shape[1] == len(label_classes), f"Số nhãn không khớp: {labels.shape[1]} vs {len(label_classes)}"

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        label = self.labels[idx]
        waveform = preprocess_audio(audio_path, self.sample_rate, self.max_length)
        if self.transform:
            waveform = self.transform(waveform)
        
        # Kiểm tra kích thước
        print(f"idx: {idx}, waveform shape: {waveform.shape}, label shape: {label.shape}")
        assert waveform.ndim == 1, f"Waveform phải là 1D, nhận được shape: {waveform.shape}"
        assert label.shape == (len(label_classes),), f"Label shape không đúng: {label.shape}"
        
        return {
            'input_values': waveform,
            'labels': torch.FloatTensor(label)
        }

In [13]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

In [4]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

# Khởi tạo processor và model
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base')
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    'facebook/wav2vec2-base',
    num_labels=len(label_classes),
    problem_type='multi_label_classification'
)

# Định nghĩa transform để xử lý waveform thành input cho Wav2Vec2
def transform_waveform(waveform):
    # Đảm bảo waveform là tensor 1D
    if waveform.ndim == 2:
        waveform = waveform.squeeze(0)
    assert waveform.ndim == 1, f"Waveform phải là 1D, nhận được shape: {waveform.shape}"
    
    inputs = processor(waveform, sampling_rate=16000, return_tensors='pt', padding=True)
    input_values = inputs['input_values'].squeeze(0)
    
    assert input_values.ndim == 1, f"input_values phải là 1D, nhận được shape: {input_values.shape}"
    return input_values

  from .autonotebook import tqdm as notebook_tqdm
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

audio_paths = data['audio_id'].values

# Chia dữ liệu thành tập huấn luyện và xác thực
train_audio_paths, val_audio_paths, train_labels, val_labels = train_test_split(
    audio_paths, labels, test_size=0.2, random_state=42
)

print(f"Số mẫu huấn luyện: {len(train_audio_paths)}")
print(f"Số mẫu xác thực: {len(val_audio_paths)}")

Số mẫu huấn luyện: 6622
Số mẫu xác thực: 1656


In [5]:
train_audio_paths, val_audio_paths, train_labels, val_labels = train_test_split(
    audio_paths, labels, test_size=0.2, random_state=42
)
assert len(train_audio_paths) == train_labels.shape[0], "train_audio_paths và train_labels không khớp"
assert len(val_audio_paths) == val_labels.shape[0], "val_audio_paths và val_labels không khớp"

In [2]:
import pandas as pd
import torch
import torchaudio
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from torchmetrics.classification import MultilabelAccuracy, MultilabelPrecision, MultilabelAveragePrecision

# --- Đọc và kiểm tra dữ liệu ---
data = pd.read_csv("audio_only_classification_dataset.csv")
print(f"Số mẫu trong CSV: {len(data)}")
data['label'] = data['label'].apply(lambda x: [label.strip() for label in x.split(',')])
print(data['label'].head())

mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['label'])
label_classes = mlb.classes_
print(f"labels shape: {labels.shape}")
print(f"label classes: {label_classes}")

audio_paths = data['audio_id'].values
train_audio_paths, val_audio_paths, train_labels, val_labels = train_test_split(
    audio_paths, labels, test_size=0.2, random_state=42
)
print(f"train_labels shape: {train_labels.shape}")
print(f"val_labels shape: {val_labels.shape}")
print(f"len(train_audio_paths): {len(train_audio_paths)}")
print(f"len(val_audio_paths): {len(val_audio_paths)}")
assert len(train_audio_paths) == train_labels.shape[0], "train_audio_paths và train_labels không khớp"
assert len(val_audio_paths) == val_labels.shape[0], "val_audio_paths và val_labels không khớp"

# --- Tiền xử lý âm thanh ---
def preprocess_audio(audio_path, sample_rate=16000, max_length=5.0):
    waveform, sr = torchaudio.load(audio_path)
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    max_samples = int(max_length * sample_rate)
    if waveform.shape[1] > max_samples:
        waveform = waveform[:, :max_samples]
    else:
        padding = max_samples - waveform.shape[1]
        waveform = torch.nn.functional.pad(waveform, (0, padding))
    assert waveform.shape == (1, max_samples), f"Waveform shape không đúng: {waveform.shape}"
    return waveform

# --- Transform cho Wav2Vec2 ---
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base')
def transform_waveform(waveform):
    if waveform.ndim == 2:
        waveform = waveform.squeeze(0)
    assert waveform.ndim == 1, f"Waveform phải là 1D, nhận được shape: {waveform.shape}"
    inputs = processor(waveform, sampling_rate=16000, return_tensors='pt', padding=True)
    input_values = inputs['input_values'].squeeze()
    if input_values.ndim == 2:
        input_values = input_values[0]
    assert input_values.ndim == 1, f"input_values phải là 1D, nhận được shape: {input_values.shape}"
    return input_values

# --- Định nghĩa Dataset ---
class AudioMultilabelDataset(Dataset):
    def __init__(self, audio_paths, labels, transform=None, sample_rate=16000, max_length=5.0):
        self.audio_paths = audio_paths
        self.labels = labels
        self.transform = transform
        self.sample_rate = sample_rate
        self.max_length = max_length
        print(f"Dataset labels shape: {labels.shape}")
        assert len(audio_paths) == labels.shape[0], f"Số mẫu không khớp: {len(audio_paths)} vs {labels.shape[0]}"
        assert labels.shape[1] == len(label_classes), f"Số nhãn không khớp: {labels.shape[1]} vs {len(label_classes)}"

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        label = self.labels[idx]
        waveform = preprocess_audio(audio_path, self.sample_rate, self.max_length)
        if self.transform:
            waveform = self.transform(waveform)

        # Ensure label is a float tensor
        label_tensor = torch.LongTensor(label)
        
        print(f"idx: {idx}, waveform shape: {waveform.shape}, label shape: {label_tensor.shape}")
        assert waveform.ndim == 1, f"Waveform shape không đúng: {waveform.shape}"
        assert label_tensor.shape == (len(label_classes),), f"Label shape không đúng: {label_tensor.shape}"
        
        return {
            'input_values': waveform,
            'labels': label_tensor
        }

# --- Tạo dataset ---
train_dataset = AudioMultilabelDataset(train_audio_paths, train_labels, transform=transform_waveform, max_length = 60)
val_dataset = AudioMultilabelDataset(val_audio_paths, val_labels, transform=transform_waveform, max_length = 660)

# --- Hàm compute_metrics ---
def compute_metrics(pred):
    logits = pred.predictions
    labels = pred.label_ids
    binary_preds = (logits > 0).astype(int)
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    logits_tensor = torch.tensor(logits, dtype=torch.float32).to(device)
    binary_preds_tensor = torch.tensor(binary_preds, dtype=torch.int32).to(device)
    labels_tensor = torch.tensor(labels, dtype=torch.int32).to(device)
    
    acc = MultilabelAccuracy(num_labels=labels.shape[1], average='macro').to(device)
    precision_metric = MultilabelPrecision(num_labels=labels.shape[1], average='macro').to(device)
    mAP = MultilabelAveragePrecision(num_labels=labels.shape[1]).to(device)
    
    acc_value = acc(binary_preds_tensor, labels_tensor).item()
    precision_value = precision_metric(binary_preds_tensor, labels_tensor).item()
    map_value = mAP(logits_tensor, labels_tensor).item()
    
    return {
        'accuracy': acc_value,
        'precision': precision_value,
        'mAP': map_value
    }
    
from transformers import Trainer, TrainingArguments
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    'facebook/wav2vec2-base',
    num_labels=len(label_classes),
    problem_type='multi_label_classification'
)
# Cấu hình huấn luyện
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_ratio=0.1,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
    weight_decay=0.005,
    learning_rate=5e-4,
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    greater_is_better=False,
    fp16=True,
    optim='adamw_torch',
    lr_scheduler_type='cosine',
    metric_for_best_model='eval_loss',
)


def collate_fn(batch):
    input_values = torch.stack([item['input_values'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    
    print(f"Batch input_values shape: {input_values.shape}")  # Expected: [8, num_samples]
    print(f"Batch labels shape: {labels.shape}")  # Expected: [8, num_classes]
    
    assert input_values.shape[0] == len(batch), f"Batch size input_values không đúng: {input_values.shape[0]}"
    assert labels.shape[0] == len(batch), f"Batch size labels không đúng: {labels.shape[0]}"
    assert labels.shape[1] == len(label_classes), f"Số nhãn không đúng: {labels.shape[1]} vs {len(label_classes)}"
    
    return {
        'input_values': input_values,
        'labels': labels
    }

    
    # Khởi tạo Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=collate_fn
)

# Huấn luyện
trainer.train()


Số mẫu trong CSV: 8278
0    [1, 7, 8, 10, 11]
1               [8, 9]
2                 [10]
3             [10, 11]
4               [8, 9]
Name: label, dtype: object
labels shape: (8278, 18)
label classes: ['0' '1' '10' '11' '12' '13' '14' '15' '16' '17' '2' '3' '4' '5' '6' '7'
 '8' '9']
train_labels shape: (6622, 18)
val_labels shape: (1656, 18)
len(train_audio_paths): 6622
len(val_audio_paths): 1656




Dataset labels shape: (6622, 18)
Dataset labels shape: (1656, 18)


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


idx: 1436, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
idx: 2400, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
idx: 4678, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
idx: 85, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
idx: 2512, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
idx: 2746, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
idx: 2850, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
idx: 516, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
Batch input_values shape: torch.Size([8, 960000])
Batch labels shape: torch.Size([8, 18])
idx: 4162, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
idx: 3294, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
idx: 4040, waveform shape: torch.Size([960000]), label shape: torch.Size([18])
idx: 4575, waveform shape: torch.Size([96000

ValueError: Expected input batch_size (8) to match target batch_size (144).

In [9]:
from transformers import Trainer, TrainingArguments

# Cấu hình huấn luyện
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_ratio=0.1,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
    weight_decay=0.005,
    learning_rate=5e-4,
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    greater_is_better=False,
    fp16=True,
    optim='adamw_torch',
    lr_scheduler_type='cosine',
    metric_for_best_model='eval_loss',
)


def collate_fn(batch):
    input_values = torch.stack([item['input_values'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    print(f"Batch input_values shape: {input_values.shape}")  # Kỳ vọng: [8, num_samples]
    print(f"Batch labels shape: {labels.shape}")  # Kỳ vọng: [8, 1818]
    assert input_values.shape[0] == len(batch), f"Batch size input_values không đúng: {input_values.shape[0]}"
    assert labels.shape[0] == len(batch), f"Batch size labels không đúng: {labels.shape[0]}"
    assert labels.shape[1] == len(label_classes), f"Số nhãn không đúng: {labels.shape[1]} vs {len(label_classes)}"
    return {
        'input_values': input_values,
        'labels': labels
    }
    
    # Khởi tạo Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=collate_fn
)

# Huấn luyện
trainer.train()


In [10]:
def collate_fn(batch):
    input_values = torch.stack([item['input_values'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    print(f"Batch input_values shape: {input_values.shape}")  # Kỳ vọng: [8, num_samples]
    print(f"Batch labels shape: {labels.shape}")  # Kỳ vọng: [8, 1818]
    assert input_values.shape[0] == len(batch), f"Batch size input_values không đúng: {input_values.shape[0]}"
    assert labels.shape[0] == len(batch), f"Batch size labels không đúng: {labels.shape[0]}"
    assert labels.shape[1] == len(label_classes), f"Số nhãn không đúng: {labels.shape[1]} vs {len(label_classes)}"
    return {
        'input_values': input_values,
        'labels': labels
    }
    
    # Khởi tạo Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=collate_fn
)

# Huấn luyện
trainer.train()

In [11]:
# Khởi tạo Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=collate_fn
)

# Huấn luyện
trainer.train()

idx: 1436, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 2400, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 4678, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 85, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 2512, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 2746, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 2850, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 516, waveform shape: torch.Size([960000]), label shape: (18,)
Batch input_values shape: torch.Size([8, 960000])
Batch labels shape: torch.Size([8, 18])
idx: 4162, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 3294, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 4040, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 4575, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 5822, waveform shape: torch.Size([960000]), label shape: (18,)
idx: 2917, waveform shape: to

ValueError: Expected input batch_size (8) to match target batch_size (144).