In [1]:
# pip install torch torchaudio xgboost scikit-learn matplotlib


In [2]:
import torchaudio
from IPython.display import Audio

# Quick listening check on a single clip before any modelling.
# The path points at one FLAC file from the ASVspoof 2021 PA evaluation set
# as mounted under /kaggle/input; adjust it when running elsewhere.
audio_path = '/kaggle/input/avsspoof-2021/ASVspoof2021_PA_eval_part00/ASVspoof2021_PA_eval/flac/PA_E_1000001.flac'

# Load audio waveform and sample rate
waveform, sample_rate = torchaudio.load(audio_path)

# Play the audio (as the last expression of the cell, the widget is rendered as the cell output)
Audio(waveform, rate=sample_rate)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset, DataLoader
import random
import os
import numpy as np
from pathlib import Path

# Data Preprocessing - Mel Spectrogram and Log-Mel + Augmentation
def preprocess_audio(audio_path, sample_rate=16000, n_mels=128, time_mask=40, freq_mask=16, segment_duration=4):
    """Load an audio file and turn it into a standardised log-mel spectrogram.

    Pipeline: load -> mix down to mono -> resample to ``sample_rate`` when the
    file was recorded at a different rate -> random crop of at most
    ``segment_duration`` seconds -> mel power spectrogram -> decibel scale ->
    per-clip zero-mean / unit-variance normalisation.

    Args:
        audio_path: Path (str or path-like) of the audio file to load.
        sample_rate: Rate in Hz the waveform is brought to before feature
            extraction; also used to size the crop and the mel filterbank.
        n_mels: Number of mel bands in the output.
        time_mask: Unused by this function; kept so existing calls keep working.
            Masking is applied separately by ``time_mask_augmentation``.
        freq_mask: Unused by this function; kept so existing calls keep working.
            Masking is applied separately by ``freq_mask_augmentation``.
        segment_duration: Maximum clip length in seconds. Longer clips are
            cropped at a random offset (drawn from the ``random`` module on
            every call); shorter clips are returned at their full length
            without padding, so their frame count differs from cropped clips.

    Returns:
        A float tensor of shape ``(1, n_mels, frames)``.
    """
    # Keep the native rate instead of discarding it: the crop length and the
    # mel filterbank below are only meaningful at `sample_rate`.
    waveform, native_rate = torchaudio.load(audio_path, normalize=True)
    waveform = waveform.mean(dim=0, keepdim=True)  # Make mono if stereo

    if native_rate != sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=native_rate, new_freq=sample_rate)
        waveform = resampler(waveform)

    num_frames = waveform.size(1)
    segment_length = int(segment_duration * sample_rate)
    if num_frames > segment_length:
        start_frame = random.randint(0, num_frames - segment_length)
        waveform = waveform[:, start_frame:start_frame + segment_length]

    mel_spec = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=800,
        hop_length=160,
        win_length=400,
        n_mels=n_mels,
        power=2.0,
    )(waveform)

    log_mel_spec = torchaudio.transforms.AmplitudeToDB()(mel_spec)
    # The epsilon keeps the division finite for constant (e.g. silent) clips.
    log_mel_spec = (log_mel_spec - log_mel_spec.mean()) / (log_mel_spec.std() + 1e-6)

    return log_mel_spec

def time_mask_augmentation(spec, max_frames=40):
    """Return a copy of ``spec`` with one random contiguous span of time frames zeroed.

    Args:
        spec: Spectrogram tensor indexed as ``(channel, freq, time)``; the
            last of the three axes is the one being masked.
        max_frames: Upper bound on the mask width. The width is drawn
            uniformly from ``0..min(max_frames, num_frames)``, so a draw of 0
            leaves the data unchanged. Negative values are treated as 0.

    Returns:
        A new tensor of the same shape; the input tensor is left untouched.
    """
    num_frames = spec.size(2)
    mask_length = random.randint(0, max(0, min(max_frames, num_frames)))
    mask_start = random.randint(0, num_frames - mask_length)
    # Mask a clone so the caller's tensor is not silently modified.
    masked = spec.clone()
    masked[:, :, mask_start:mask_start + mask_length] = 0
    return masked

def freq_mask_augmentation(spec, max_bands=16):
    """Return a copy of ``spec`` with one random contiguous band of frequency bins zeroed.

    Args:
        spec: Spectrogram tensor indexed as ``(channel, freq, time)``; the
            middle of the three axes is the one being masked.
        max_bands: Upper bound on the mask width. The width is drawn
            uniformly from ``0..min(max_bands, num_freq)``, so a draw of 0
            leaves the data unchanged. Negative values are treated as 0.

    Returns:
        A new tensor of the same shape; the input tensor is left untouched.
    """
    num_freq = spec.size(1)
    mask_width = random.randint(0, max(0, min(max_bands, num_freq)))
    mask_start = random.randint(0, num_freq - mask_width)
    # Mask a clone so the caller's tensor is not silently modified.
    masked = spec.clone()
    masked[:, mask_start:mask_start + mask_width, :] = 0
    return masked



In [4]:
# Custom Dataset Class
class VoiceDataset(Dataset):
    """Real-vs-fake speech dataset read from ``root/{real,fake}/{split}/*.wav``.

    Each item is ``(log_mel_spectrogram, label)`` with label 0 for files found
    under ``real`` and 1 for files found under ``fake``. For the ``'train'``
    split, time and frequency masking are applied to every item.

    Attributes:
        split: Name of the split sub-folder that was indexed.
        paths: Shuffled list of ``(directory, file_name)`` string pairs.
        labels: Labels aligned index-by-index with ``paths``.
    """

    # Class folder name -> integer label, in the order the folders are indexed.
    _CLASS_LABELS = (('real', 0), ('fake', 1))

    def __init__(self, root: Path, split='train'):
        """Index the ``.wav`` files of one split.

        Args:
            root: Dataset root containing the ``real`` and ``fake`` folders.
            split: Sub-folder name inside each class folder (e.g. ``'train'``).

        A missing class/split folder contributes no files rather than raising.
        """
        self.split = split
        root = Path(root)

        entries = []
        for class_name, label in self._CLASS_LABELS:
            class_dir = root / class_name / self.split
            if class_dir.is_dir():
                # Sorted so that a seeded shuffle yields the same order on every run,
                # independent of the filesystem's listing order.
                entries.extend(
                    (str(class_dir), file.name, label)
                    for file in sorted(class_dir.glob('*.wav'))
                )

        if entries:
            random.shuffle(entries)

        # The label is recorded at indexing time from the class folder itself.
        # Deriving it later from a substring test on the path mislabels every
        # sample whenever an unrelated path component contains "real".
        self.paths = [(dir_path, file_name) for dir_path, file_name, _ in entries]
        self.labels = [label for _, _, label in entries]

    def __len__(self):
        """Number of indexed audio files."""
        return len(self.paths)

    def __getitem__(self, i):
        """Return ``(mel_spec, label)`` for the ``i``-th file."""
        dir_path_str, file_name = self.paths[i]
        file_path = Path(dir_path_str) / file_name
        label = self.labels[i]
        mel_spec = preprocess_audio(file_path)
        if self.split == 'train':
            mel_spec = time_mask_augmentation(mel_spec, 40)
            mel_spec = freq_mask_augmentation(mel_spec, 16)

        return mel_spec, label

In [None]:
import torch.nn as nn
import torch.nn.functional as F
class CNNModel(nn.Module):
    """Small two-stage CNN classifier with a 64-dimensional embedding head.

    Two ``Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d`` stages (64 then 128
    channels) are followed by global average pooling, a ``Linear(128, 64)``
    projection with ReLU and dropout, and a final linear classifier.

    Args:
        in_channels: Number of channels of the input feature map.
        out_classes: Number of logits produced by ``forward``.
    """

    def __init__(self, in_channels=1, out_classes=2):
        super().__init__()

        # Build the stages in a flat list so the Sequential indices (0..7)
        # match the layer order: conv, bn, relu, pool, conv, bn, relu, pool.
        stages = []
        for c_in, c_out in ((in_channels, 64), (64, 128)):
            stages += [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.MaxPool2d(2, 2),
            ]
        self.conv_block = nn.Sequential(*stages)

        # Collapses whatever spatial size is left to 1x1.
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
        )
        self.classifier = nn.Linear(64, out_classes)

    def forward(self, x):
        """Return class logits: the classifier applied to ``extract_feat(x)``."""
        return self.classifier(self.extract_feat(x))

    def extract_feat(self, x):
        """Return the 64-dimensional embedding that feeds the classifier.

        Dropout is part of this path, so it is active whenever the module is
        in training mode.
        """
        pooled = self.pool(self.conv_block(x))
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)


In [None]:
from torch.optim import Adam
from sklearn.model_selection import train_test_split
import torch

# Train the CNN on the 'train' split, evaluate on 'val' after every epoch and
# keep the weights of the epoch with the highest validation accuracy in
# 'best_model.pth'.
root = Path('/kaggle/working/dataset')
train_dataset = VoiceDataset(root, 'train')
val_dataset   = VoiceDataset(root, 'val')

# VoiceDataset yields an empty dataset when a split folder is missing; fail
# here with a clear message instead of a ZeroDivisionError after one epoch.
if len(train_dataset) == 0 or len(val_dataset) == 0:
    raise RuntimeError(
        f"No .wav files found under {root}: "
        f"train={len(train_dataset)}, val={len(val_dataset)}"
    )

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Initialise the CNN model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNModel().to(device)

# Set up the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Single source of truth for the epoch count, used by both the loop and the log line.
NUM_EPOCHS = 20

# Train the CNN model
best_acc = 0
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for mel_spec, labels in train_loader:
        mel_spec, labels = mel_spec.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(mel_spec)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    # Accuracy over the whole epoch; only the final value is reported.
    train_acc = correct / total

    print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {total_loss/len(train_loader):.4f}, | Train Acc: {100*train_acc:.4f} ')

    # Evaluate on the validation set
    model.eval()  # Switch to evaluation mode (fixed batch-norm statistics, no dropout)
    with torch.no_grad():
        correct = 0
        total = 0
        val_loss = 0
        for mel_spec, labels in val_loader:
            mel_spec, labels = mel_spec.to(device), labels.to(device)  # Move the batch to the training device
            outputs = model(mel_spec)
            # Weight the batch-mean loss by the batch size so the final
            # division by `total` gives a per-sample average.
            val_loss += criterion(outputs, labels).item() * mel_spec.size(0)

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        val_acc = 100*correct/total

        print(f'Validation Accuracy: {val_acc:.2f}%, Val Loss: {val_loss/total:.4f}')

        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')
            print(f"🔒 Saved best model at epoch {epoch+1} with val acc: {val_acc:.2f}%")
        

Epoch [1/10], Loss: 0.4095, | Train Acc: 80.5445 
Validation Accuracy: 60.80%, Val Loss: 1.1489
🔒 Saved best model at epoch 1 with val acc: 60.80%
Epoch [2/10], Loss: 0.2185, | Train Acc: 91.5341 
Validation Accuracy: 97.27%, Val Loss: 0.0941
🔒 Saved best model at epoch 2 with val acc: 97.27%
Epoch [3/10], Loss: 0.1708, | Train Acc: 93.5606 
Validation Accuracy: 84.25%, Val Loss: 0.3853
Epoch [4/10], Loss: 0.1324, | Train Acc: 95.2131 
Validation Accuracy: 87.99%, Val Loss: 0.2666
Epoch [5/10], Loss: 0.1376, | Train Acc: 94.8248 
Validation Accuracy: 98.05%, Val Loss: 0.0748
🔒 Saved best model at epoch 5 with val acc: 98.05%
Epoch [6/10], Loss: 0.1152, | Train Acc: 95.8144 
Validation Accuracy: 87.28%, Val Loss: 0.3641
Epoch [7/10], Loss: 0.0891, | Train Acc: 96.8703 
Validation Accuracy: 62.96%, Val Loss: 1.4280
Epoch [8/10], Loss: 0.0909, | Train Acc: 96.6383 
Validation Accuracy: 92.23%, Val Loss: 0.1791
Epoch [9/10], Loss: 0.0824, | Train Acc: 97.0360 
Validation Accuracy: 96.32%, 

In [8]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score

# Reload the best checkpoint written by the training cell and score it on the
# validation loader. Relies on `model`, `device`, `criterion` and `val_loader`
# already being defined in the kernel.
# map_location lets a checkpoint saved on GPU load on a CPU-only kernel too.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.to(device)
model.eval()

all_preds = []
all_labels = []
total_loss = 0
correct = 0
total = 0

with torch.no_grad():
    for mel_spec, labels in val_loader:
        mel_spec, labels = mel_spec.to(device), labels.to(device)
        outputs = model(mel_spec)
        # Batch-mean loss times batch size, so dividing by `total` below
        # yields the per-sample average.
        total_loss += criterion(outputs, labels).item() * mel_spec.size(0)

        _, predicted = torch.max(outputs, 1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        correct += (predicted == labels).sum().item()
        total += labels.size(0)

# Accuracy
# NOTE: preprocess_audio crops long clips at a random offset on every call, so
# these numbers can differ slightly from the ones logged during training.
ts_acc = 100 * correct / total
print(f'Validation Accuracy: {ts_acc:.2f}%, Val Loss: {total_loss/total:.4f}')

# Classification report
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, digits=4))


Validation Accuracy: 99.30%, Val Loss: 0.0191

Classification Report:
              precision    recall  f1-score   support

           0     0.9887    0.9973    0.9930      1848
           1     0.9973    0.9886    0.9929      1848

    accuracy                         0.9930      3696
   macro avg     0.9930    0.9930    0.9930      3696
weighted avg     0.9930    0.9930    0.9930      3696

