In [2]:

import time

import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.cuda.amp import autocast, GradScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
from torchmetrics.classification import MultilabelAccuracy, MultilabelPrecision, MultilabelAveragePrecision
from tqdm import tqdm
from transformers import (
    Wav2Vec2Model,
    Wav2Vec2Processor,
    AutoFeatureExtractor,
    ASTForAudioClassification
)

# --- Load the annotation CSV and normalise the label column ---
data = pd.read_csv("audio_only_classification_dataset.csv")
print(f"Số mẫu trong CSV: {len(data)}")


def _parse_label_field(raw):
    """Split a comma-separated label string into a list of whitespace-trimmed label names."""
    return [part.strip() for part in raw.split(',')]


data['label'] = data['label'].apply(_parse_label_field)
print(data['label'].head())

# Multi-hot encode the label lists: one output column per distinct label name.
# The label names are strings, and the printed class list below is in string
# order ('10' comes before '2'), so column i corresponds to label_classes[i],
# not to the numeric label i.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['label'])
label_classes = mlb.classes_
print(f"labels shape: {labels.shape}")
print(f"label classes: {label_classes}")

# Hold out 20% of the clips for validation; the seed is fixed so the split is reproducible.
audio_paths = data['audio_id'].values
_split = train_test_split(audio_paths, labels, test_size=0.2, random_state=42)
train_audio_paths, val_audio_paths, train_labels, val_labels = _split
print(f"train_labels shape: {train_labels.shape}")
print(f"val_labels shape: {val_labels.shape}")
print(f"len(train_audio_paths): {len(train_audio_paths)}")
print(f"len(val_audio_paths): {len(val_audio_paths)}")
print(f"train_labels dtype: {train_labels.dtype}")
print(f"Sample train_labels[0]: {train_labels[0]}")

# Cast the label matrices to float32 (the targets are later fed to BCEWithLogitsLoss).
train_labels, val_labels = (
    label_matrix.astype(np.float32) for label_matrix in (train_labels, val_labels)
)
print(f"train_labels dtype after conversion: {train_labels.dtype}")

# --- Tiền xử lý âm thanh ---
def preprocess_audio(audio_path, sample_rate= 16000):
    """Load an audio file through ``librosa.load`` at the requested sample rate.

    Args:
        audio_path: Path of the audio file to read.
        sample_rate: Value forwarded to ``librosa.load`` as ``sr``. Defaults to 16000.

    Returns:
        The ``(samples, rate)`` pair produced by ``librosa.load``.
    """
    samples, rate = librosa.load(audio_path, sr=sample_rate)
    return samples, rate

# --- Feature extractor for the Audio Spectrogram Transformer (AST) checkpoint ---
# Loaded from the same checkpoint name that the classification model below uses.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    return_attention_mask=False,
)
def transform_waveform(audio_path):
    """Load one audio file and turn it into model input features.

    The file is read with ``preprocess_audio`` and passed to the module-level
    ``feature_extractor`` (PyTorch tensors, ``padding="max_length"``).

    Args:
        audio_path: Path of the audio file to convert.

    Returns:
        The extractor's ``'input_values'`` tensor after ``squeeze(0)``.
    """
    waveform, rate = preprocess_audio(audio_path)
    encoded = feature_extractor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
        padding="max_length",
    )
    # squeeze(0) drops dim 0 when it has size 1 — presumably the batch axis the
    # extractor adds for a single clip, leaving [time_frames, frequency_bins].
    return encoded['input_values'].squeeze(0)

# --- Định nghĩa Dataset ---
class AudioMultilabelDataset(Dataset):
    """Map-style dataset pairing audio file paths with multi-hot label vectors.

    Each item is a dict with:
        ``'input_values'``: whatever ``transform_waveform`` returns for the path.
        ``'labels'``: the sample's label row as a float32 tensor.

    Note:
        The class relies on two module-level names: ``transform_waveform``
        (called for every item) and ``label_classes`` (used in ``__init__`` to
        validate the number of label columns).
    """

    def __init__(self, audio_paths, labels, sample_rate= 16000, max_length=5.0):
        """Store the inputs and validate that paths and labels line up.

        Args:
            audio_paths: Sequence of audio file paths, one per sample.
            labels: 2-D array with one row per sample and one column per label
                (must expose ``.shape``).
            sample_rate: Stored on the instance; not used by this class itself.
            max_length: Stored on the instance; not used by this class itself.

        Raises:
            AssertionError: If the number of paths differs from the number of
                label rows, or the number of label columns differs from
                ``len(label_classes)``.
        """
        self.audio_paths = audio_paths
        self.labels = labels
        self.sample_rate = sample_rate
        self.max_length = max_length
        assert len(audio_paths) == labels.shape[0], f"Số mẫu không khớp: {len(audio_paths)} vs {labels.shape[0]}"
        assert labels.shape[1] == len(label_classes), f"Số nhãn không khớp: {labels.shape[1]} vs {len(label_classes)}"

    def __len__(self):
        """Return the number of samples (one per audio path)."""
        return len(self.audio_paths)

    def __getitem__(self, idx):
        """Return the features and float32 label tensor for sample ``idx``."""
        features = transform_waveform(self.audio_paths[idx])

        # torch.tensor with an explicit dtype replaces the legacy
        # torch.FloatTensor constructor + redundant cast; it always yields
        # float32, whatever numeric dtype the label array was stored with.
        labels_tensor = torch.tensor(self.labels[idx], dtype=torch.float32)
        return {
            'input_values': features,
            'labels': labels_tensor
        }

# --- Build the training and validation datasets ---
train_dataset = AudioMultilabelDataset(audio_paths=train_audio_paths, labels=train_labels)
val_dataset = AudioMultilabelDataset(audio_paths=val_audio_paths, labels=val_labels)

# --- Collate function ---
def collate_fn(batch):
    """Merge a list of per-sample dicts into a single batch dict.

    Args:
        batch: List of dicts, each holding an ``'input_values'`` tensor and a
            ``'labels'`` tensor.

    Returns:
        Dict with the same two keys, where each value is the ``torch.stack`` of
        the corresponding per-sample tensors (new leading batch dimension).
    """
    features = []
    targets = []
    for sample in batch:
        features.append(sample['input_values'])
        targets.append(sample['labels'])
    return dict(input_values=torch.stack(features), labels=torch.stack(targets))

# --- Build the DataLoaders ---
batch_size = 8  # try batch_size=1 when debugging
_loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn)
# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

# --- Model, loss, optimizer, LR scheduler and AMP scaler ---
# (time, ReduceLROnPlateau and the torch.cuda.amp names are imported with the
# other dependencies at the top of this cell.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the AST checkpoint with a classification head sized for this label set.
# ignore_mismatched_sizes=True lets the checkpoint's 527-way head be replaced by
# a freshly initialised one (see the loader message printed below this cell).
model = ASTForAudioClassification.from_pretrained(
    'MIT/ast-finetuned-audioset-10-10-0.4593',
    num_labels=len(label_classes),
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True,
).to(device)

# One independent sigmoid/BCE term per label: suitable for multi-label targets.
criterion = nn.BCEWithLogitsLoss()

# AdamW with weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)

# Halve the learning rate when the monitored value (validation loss) stops
# decreasing. The deprecated `verbose` flag is omitted; the training loop
# already prints the current learning rate after every epoch.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

# Gradient accumulation: one optimizer step per `accumulation_steps` batches,
# to emulate a larger batch on a small GPU.
accumulation_steps = 2

# Mixed-precision loss scaler. GradScaler expects the device *type* string, and
# loss scaling only makes sense for CUDA half-precision training, so it is
# disabled (turning its methods into pass-throughs) when running on CPU.
scaler = torch.amp.GradScaler(device.type, enabled=(device.type == 'cuda'))

Số mẫu trong CSV: 8278
0    [1, 7, 8, 10, 11]
1               [8, 9]
2                 [10]
3             [10, 11]
4               [8, 9]
Name: label, dtype: object
labels shape: (8278, 18)
label classes: ['0' '1' '10' '11' '12' '13' '14' '15' '16' '17' '2' '3' '4' '5' '6' '7'
 '8' '9']
train_labels shape: (6622, 18)
val_labels shape: (1656, 18)
len(train_audio_paths): 6622
len(val_audio_paths): 1656
train_labels dtype: int64
Sample train_labels[0]: [0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
train_labels dtype after conversion: float32


Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([18]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([18, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def compute_metrics(logits, labels, device, threshold: float = 0.5):
    """Compute multilabel accuracy, precision and mean average precision.

    Args:
        logits: Raw model outputs; a sigmoid is applied here. Expected to be
            indexed ``[sample, label]`` like ``labels`` (not checked).
        labels: Multi-hot targets; ``labels.shape[1]`` is taken as the number
            of labels. Cast to int32 before being passed to torchmetrics.
        device: Device the freshly created metric objects are moved to.
        threshold: Probability above which a label counts as predicted.
            Defaults to 0.5.

    Returns:
        Dict of Python floats with keys ``'accuracy'``, ``'mini_accuracy'``,
        ``'precision'`` (all macro-averaged over labels) and ``'mAP'``.
    """
    num_labels = labels.shape[1]
    targets = labels.to(dtype=torch.int32)

    # Metrics are for reporting only: keep them out of the autograd graph so
    # calling this on training logits does not record extra graph nodes.
    with torch.no_grad():
        probs = torch.sigmoid(logits)
        binary_preds = (probs > threshold).int()

        # Fresh metric objects per call, so nothing accumulates between calls.
        accuracy_metric = MultilabelAccuracy(num_labels=num_labels, average='macro').to(device)
        precision_metric = MultilabelPrecision(num_labels=num_labels, average='macro').to(device)
        map_metric = MultilabelAveragePrecision(num_labels=num_labels).to(device)

        acc_value = accuracy_metric(binary_preds, targets).item()
        precision_value = precision_metric(binary_preds, targets).item()
        # Average precision ranks by score, so it receives probabilities, not thresholded predictions.
        map_value = map_metric(probs, targets).item()

    return {
        'accuracy': acc_value,
        # NOTE(review): 'mini_accuracy' used to come from a second metric built
        # with exactly the same arguments as 'accuracy', so the two were always
        # equal. The key is kept for callers and the value computed once;
        # confirm what this metric was meant to be.
        'mini_accuracy': acc_value,
        'precision': precision_value,
        'mAP': map_value
    }

In [5]:
# NOTE(review): this cell redefines compute_metrics, which is already defined in
# the previous cell (In [3]); the later definition silently wins. Keep a single
# definition once the notebook is cleaned up.
def compute_metrics(logits, labels, device, threshold: float = 0.5):
    """Compute multilabel accuracy, precision and mean average precision.

    Args:
        logits: Raw model outputs; a sigmoid is applied here. Expected to be
            indexed ``[sample, label]`` like ``labels`` (not checked).
        labels: Multi-hot targets; ``labels.shape[1]`` is taken as the number
            of labels. Cast to int32 before being passed to torchmetrics.
        device: Device the freshly created metric objects are moved to.
        threshold: Probability above which a label counts as predicted.
            Defaults to 0.5.

    Returns:
        Dict of Python floats with keys ``'accuracy'``, ``'mini_accuracy'``,
        ``'precision'`` (all macro-averaged over labels) and ``'mAP'``.
    """
    num_labels = labels.shape[1]
    targets = labels.to(dtype=torch.int32)

    # Metrics are for reporting only: keep them out of the autograd graph so
    # calling this on training logits does not record extra graph nodes.
    with torch.no_grad():
        probs = torch.sigmoid(logits)
        binary_preds = (probs > threshold).int()

        # Fresh metric objects per call, so nothing accumulates between calls.
        accuracy_metric = MultilabelAccuracy(num_labels=num_labels, average='macro').to(device)
        precision_metric = MultilabelPrecision(num_labels=num_labels, average='macro').to(device)
        map_metric = MultilabelAveragePrecision(num_labels=num_labels).to(device)

        acc_value = accuracy_metric(binary_preds, targets).item()
        precision_value = precision_metric(binary_preds, targets).item()
        # Average precision ranks by score, so it receives probabilities, not thresholded predictions.
        map_value = map_metric(probs, targets).item()

    return {
        'accuracy': acc_value,
        # NOTE(review): 'mini_accuracy' used to come from a second metric built
        # with exactly the same arguments as 'accuracy', so the two were always
        # equal. The key is kept for callers and the value computed once;
        # confirm what this metric was meant to be.
        'mini_accuracy': acc_value,
        'precision': precision_value,
        'mAP': map_value
    }
# --- Training loop with validation, LR scheduling and early stopping ---
# Early-stopping state: training stops once validation mAP has failed to
# improve for `patience` consecutive epochs.
best_val_map = 0
patience = 1
patience_counter = 0
early_stop = False

# Autocast is only switched on for CUDA; on CPU the forward pass stays in full
# precision instead of requesting CUDA autocast on a machine without a GPU.
use_amp = device.type == 'cuda'

num_epochs = 3
for epoch in range(num_epochs):
    if early_stop:
        print("Early stopping triggered!")
        break

    # ---- Training ----
    model.train()
    # Clear gradients left behind by an interrupted run of this cell, so they
    # cannot leak into the first optimizer step of this epoch.
    optimizer.zero_grad()
    train_loss = 0.0
    train_metrics = {'accuracy': 0.0, 'mini_accuracy': 0.0, 'precision': 0.0, 'mAP': 0.0}
    num_batches = 0
    start_time = time.time()

    for i, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")):
        input_values = batch['input_values'].to(device)
        labels = batch['labels'].to(device)

        # Mixed-precision forward pass.
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(input_values)  # the logits are read from its .logits attribute
            logits = outputs.logits  # [batch_size, num_labels]
            # Divide so that the gradients summed over one accumulation window
            # correspond to the mean loss of that window.
            loss = criterion(logits, labels) / accumulation_steps

        # Accumulate (scaled) gradients.
        scaler.scale(loss).backward()

        # Step every `accumulation_steps` batches, and on the last batch so a
        # trailing partial window is not dropped.
        if (i + 1) % accumulation_steps == 0 or (i + 1) == len(train_loader):
            # Unscale first so the clipping threshold applies to the true gradient norm.
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # Undo the accumulation scaling so the logged value is the per-batch loss.
        train_loss += loss.item() * accumulation_steps

        # Only every 5th batch is scored, to keep the metric overhead low.
        if i % 5 == 0:
            # detach(): metrics must not hold on to the autograd graph.
            metrics = compute_metrics(logits.detach(), labels, device)
            for key in train_metrics:
                train_metrics[key] += metrics[key]
            num_batches += 1

    train_loss /= len(train_loader)
    for key in train_metrics:
        train_metrics[key] /= max(num_batches, 1)

    # ---- Validation ----
    model.eval()
    val_loss = 0.0
    num_batches = 0
    all_logits = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_values = batch['input_values'].to(device)
            labels = batch['labels'].to(device)

            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(input_values)
                logits = outputs.logits
                loss = criterion(logits, labels)

            val_loss += loss.item()

            # Collect everything so the metrics are computed once over the whole set.
            all_logits.append(logits.detach())
            all_labels.append(labels.detach())
            num_batches += 1

    # Metrics over the full validation set (not averaged per batch).
    all_logits = torch.cat(all_logits, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    val_metrics = compute_metrics(all_logits, all_labels, device)

    val_loss /= num_batches

    # The scheduler monitors validation loss.
    scheduler.step(val_loss)

    # Checkpointing and early stopping are driven by validation mAP.
    if val_metrics['mAP'] > best_val_map:
        best_val_map = val_metrics['mAP']
        patience_counter = 0
        # Keep the weights of the best epoch so far.
        torch.save(model.state_dict(), 'best_model.pt')
        print("Saved best model!")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            # Takes effect at the top of the next epoch.
            early_stop = True

    # Epoch time covers both the training and the validation pass.
    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Time: {epoch_time:.2f}s, LR: {optimizer.param_groups[0]['lr']:.6f}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_metrics['accuracy']:.4f},  Train Mini Accuracy: {train_metrics['mini_accuracy']:.4f}, ", 
          f"Train Precision: {train_metrics['precision']:.4f}, Train mAP: {train_metrics['mAP']:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_metrics['accuracy']:.4f},  Val Mini Accuracy: {val_metrics['mini_accuracy']:.4f}, "
          f"Val Precision: {val_metrics['precision']:.4f}, Val mAP: {val_metrics['mAP']:.4f}")

Epoch 1/3: 100%|██████████| 828/828 [29:45<00:00,  2.16s/it]
Validation: 100%|██████████| 207/207 [06:12<00:00,  1.80s/it]


Saved best model!
Epoch 1/3, Time: 2157.79s, LR: 0.000200
Train Loss: 0.3054, Train Accuracy: 0.8804,  Train Mini Accuracy: 0.8804,  Train Precision: 0.0266, Train mAP: 0.2198
Val Loss: 0.3055, Val Accuracy: 0.8769,  Val Mini Accuracy: 0.8769, Val Precision: 0.0300, Val mAP: 0.1379


Epoch 2/3:   3%|▎         | 22/828 [00:46<28:26,  2.12s/it]


KeyboardInterrupt: 

In [7]:
# Bare last expression: displays the repr (module tree) of the `model` object.
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTSdpaAttention(
            (attention): ASTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
       