# Imports

In [1]:
!pip install librosa



In [3]:
!pip install torch



In [5]:
!pip install torchmetrics



In [16]:
import librosa
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch
import torchmetrics
import os
import torchvision.models as models
import warnings

In [18]:
warnings.filterwarnings('ignore')

In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Config

In [23]:
class Config:
    SR = 32000
    N_MFCC = 13
    # Dataset
    ROOT_FOLDER = './'
    # Training
    N_CLASSES = 2
    BATCH_SIZE = 96
    N_EPOCHS = 10
    LR = 3e-4
    # Others
    SEED = 42
    
CONFIG = Config()

In [25]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG.SEED) # Seed 고정

In [None]:
import zipfile
import os

zip_path = 'open.zip'
extract_path = 'datasets'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [27]:
import os

# 변경할 디렉토리 경로를 지정합니다.
new_path = './datasets'

# 작업 디렉토리를 변경합니다.
os.chdir(new_path)

# 현재 작업 디렉토리를 확인합니다.
print("현재 작업 디렉토리:", os.getcwd())

현재 작업 디렉토리: C:\Users\yedin\generative-fake-voice-detector-ai\datasets


In [29]:
df = pd.read_csv('./train.csv')
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CONFIG.SEED)

In [31]:
train

Unnamed: 0,id,path,label
6804,SNGJTJQG,./train/SNGJTJQG.ogg,fake
3734,LIYTDJZZ,./train/LIYTDJZZ.ogg,fake
55413,HAMPQOIN,./train/HAMPQOIN.ogg,real
10741,UCJMLYVH,./train/UCJMLYVH.ogg,fake
33027,EUKZRQPD,./train/EUKZRQPD.ogg,real
...,...,...,...
44732,PWTCAYUB,./train/PWTCAYUB.ogg,fake
54343,HSAHCTUQ,./train/HSAHCTUQ.ogg,real
38158,CXJSUSJK,./train/CXJSUSJK.ogg,fake
860,ZBTGTORE,./train/ZBTGTORE.ogg,real


In [33]:
# 데이터 불균형 확인
real_count = len(df[df['label'] == 'real'])
fake_count = len(df[df['label'] == 'fake'])
print(f"Real: {real_count}, Fake: {fake_count}")

Real: 27620, Fake: 27818


## Data Pre-processing : MFCC

In [36]:
# 현재 작업 디렉토리를 확인합니다.
current_directory = os.getcwd()
print("현재 작업 디렉토리:", current_directory)

현재 작업 디렉토리: C:\Users\yedin\generative-fake-voice-detector-ai\datasets


In [38]:
# 상대 경로를 절대 경로로 변환합니다.
relative_path = 'generative-fake-voice-detector-ai/datasets'
absolute_path = os.path.abspath(relative_path)

In [74]:
def get_mfcc_feature(df, train_mode=True):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows()):
        y, sr = librosa.load(row['path'], sr=CONFIG.SR)
        if train_mode:
            y = augment_audio(y, sr)  # sr 추가
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)
        if train_mode:
            label = row['label']
            label_vector = np.zeros(CONFIG.N_CLASSES, dtype=float)
            label_vector[0 if label == 'fake' else 1] = 1
            labels.append(label_vector)
    if train_mode:
        return features, labels
    return features

In [92]:
# 데이터 증강 함수
def augment_audio(y, sr):  # sr 추가
    aug_choice = random.choice(['time_stretch', 'pitch_shift', 'add_noise', 'shift'])
    if aug_choice == 'time_stretch':
        rate = random.uniform(0.8, 1.2)
        y = librosa.effects.time_stretch(y, rate=rate)  # rate를 키워드 인자로 전달
    elif aug_choice == 'pitch_shift':
        steps = random.randint(-5, 5)
        y = librosa.effects.pitch_shift(y, sr=sr, n_steps=steps)  # sr과 n_steps 추가
    elif aug_choice == 'add_noise':
        noise = np.random.randn(len(y))
        y = y + 0.005 * noise
    elif aug_choice == 'shift':
        shift = np.random.randint(len(y))
        y = np.roll(y, shift)
    return y

In [94]:
train_mfcc, train_labels = get_mfcc_feature(train, True)
val_mfcc, val_labels = get_mfcc_feature(val, True)

44350it [19:11, 38.52it/s]
11088it [2:48:56,  1.09it/s]


# Dataset

In [96]:
class CustomDataset(Dataset):
    def __init__(self, mfcc, label):
        self.mfcc = mfcc
        self.label = label

    def __len__(self):
        return len(self.mfcc)

    def __getitem__(self, index):
        if self.label is not None:
            # 차원을 (Batch, Channel, Height, Width) 형식으로 변경하고 채널을 3개로 복제
            mfcc = torch.tensor(self.mfcc[index], dtype=torch.float32).unsqueeze(0)
            mfcc = mfcc.expand(3, -1, -1)  # 채널을 3개로 복제
            return mfcc, torch.tensor(self.label[index], dtype=torch.float32)
        mfcc = torch.tensor(self.mfcc[index], dtype=torch.float32).unsqueeze(0)
        mfcc = mfcc.expand(3, -1, -1)  # 채널을 3개로 복제
        return mfcc

In [98]:
train_dataset = CustomDataset(train_mfcc, train_labels)
val_dataset = CustomDataset(val_mfcc, val_labels)

In [100]:
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=False
)

# Define Model

In [103]:
class ResNetTransformer(nn.Module):
    def __init__(self):
        super(ResNetTransformer, self).__init__()
        self.resnet = models.resnet101(pretrained=True)  # 더 깊은 ResNet-101 사용
        self.resnet.fc = nn.Identity()  # Fully connected layer 제거
        self.fc1 = nn.Linear(2048, 512)  # ResNet-101은 2048차원 출력
        self.transformer_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
        self.transformer = nn.TransformerEncoder(self.transformer_layer, num_layers=6)
        self.fc2 = nn.Linear(512, CONFIG.N_CLASSES)
        self.dropout = nn.Dropout(0.5)
        self.batchnorm = nn.BatchNorm1d(512)
        
    def forward(self, x):
        x = self.resnet(x)
        x = self.fc1(x)
        x = x.unsqueeze(1)  # (Batch, Channel, Features)
        x = self.transformer(x)
        x = x.mean(dim=1)  # Global average pooling
        x = self.batchnorm(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return torch.sigmoid(x)

In [105]:
# Focal Loss 정의
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.bce = nn.BCELoss(reduction='none')

    def forward(self, logits, targets):
        BCE_loss = self.bce(logits, targets)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss
        return F_loss.mean()

criterion = FocalLoss(alpha=1, gamma=2)  # Focal Loss 사용

# Train & Validation

In [107]:
from sklearn.metrics import roc_auc_score

def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    best_val_score = 0
    best_model = None
    for epoch in range(1, CONFIG.N_EPOCHS+1):
        model.train()
        train_loss = []
        for features, labels in tqdm(iter(train_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            optimizer.zero_grad()
            output = model(features)
            loss = criterion(output, labels)  # Focal Loss 적용
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val AUC : [{_val_score:.5f}]')
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
    return best_model

def multiLabel_AUC(y_true, y_scores):
    auc_scores = []
    for i in range(y_true.shape[1]):
        auc = roc_auc_score(y_true[:, i], y_scores[:, i])
        auc_scores.append(auc)
    mean_auc_score = np.mean(auc_scores)
    return mean_auc_score
    
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss, all_labels, all_probs = [], [], []
    with torch.no_grad():
        for features, labels in tqdm(iter(val_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            probs = model(features)
            loss = criterion(probs, labels)
            val_loss.append(loss.item())
            all_labels.append(labels.cpu().numpy())
            all_probs.append(probs.cpu().numpy())
        _val_loss = np.mean(val_loss)
        all_labels = np.concatenate(all_labels, axis=0)
        all_probs = np.concatenate(all_probs, axis=0)
        auc_score = multiLabel_AUC(all_labels, all_probs)
    return _val_loss, auc_score

## Run

In [110]:
model = ResNetTransformer()
optimizer = torch.optim.Adam(params=model.parameters(), lr=CONFIG.LR)

infer_model = train(model, optimizer, train_loader, val_loader, device)

Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to C:\Users\yedin/.cache\torch\hub\checkpoints\resnet101-63fe2227.pth
100%|███████████████████████████████████████████████████████████████████████████████| 171M/171M [00:10<00:00, 16.3MB/s]
100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [06:02<00:00,  1.27it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:17<00:00,  6.75it/s]


Epoch [1], Train Loss : [0.15565] Val Loss : [0.19130] Val AUC : [0.49657]


100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [06:10<00:00,  1.25it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:16<00:00,  7.08it/s]


Epoch [2], Train Loss : [0.17962] Val Loss : [0.17341] Val AUC : [0.55774]


100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [06:40<00:00,  1.15it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:17<00:00,  6.77it/s]


Epoch [3], Train Loss : [0.17500] Val Loss : [0.17334] Val AUC : [0.53486]


100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [06:38<00:00,  1.16it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:16<00:00,  7.04it/s]


Epoch [4], Train Loss : [0.17476] Val Loss : [0.17400] Val AUC : [0.46406]


100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [06:46<00:00,  1.14it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:16<00:00,  6.93it/s]


Epoch [5], Train Loss : [0.17493] Val Loss : [0.17353] Val AUC : [0.46277]


100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [06:43<00:00,  1.14it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:16<00:00,  7.14it/s]


Epoch [6], Train Loss : [0.17488] Val Loss : [0.17335] Val AUC : [0.52135]


100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [06:40<00:00,  1.15it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:17<00:00,  6.48it/s]


Epoch [7], Train Loss : [0.17503] Val Loss : [0.17339] Val AUC : [0.47563]


100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [06:53<00:00,  1.12it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:16<00:00,  6.91it/s]


Epoch [8], Train Loss : [0.17479] Val Loss : [0.17330] Val AUC : [0.48040]


100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [06:42<00:00,  1.15it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:16<00:00,  6.93it/s]


Epoch [9], Train Loss : [0.17490] Val Loss : [0.17354] Val AUC : [0.51815]


100%|████████████████████████████████████████████████████████████████████████████████| 462/462 [06:32<00:00,  1.18it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:16<00:00,  6.96it/s]

Epoch [10], Train Loss : [0.17491] Val Loss : [0.17333] Val AUC : [0.47604]





## Inference

In [116]:
test = pd.read_csv('./test.csv')
test_mfcc = get_mfcc_feature(test, False)
test_dataset = CustomDataset(test_mfcc, None)
test_loader = DataLoader(
    test_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=False
)

50000it [12:13, 68.17it/s]


In [118]:
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for features in tqdm(iter(test_loader)):
            features = features.float().to(device)
            
            probs = model(features)

            probs  = probs.cpu().detach().numpy()
            predictions += probs.tolist()
    return predictions

In [120]:
preds = inference(infer_model, test_loader, device)

100%|████████████████████████████████████████████████████████████████████████████████| 521/521 [01:11<00:00,  7.30it/s]


## Submission

In [124]:
submit = pd.read_csv('./sample_submission_1.csv')
submit.iloc[:, 1:] = preds
submit.head()

Unnamed: 0,id,fake,real
0,TEST_00000,0.498192,0.501809
1,TEST_00001,0.498192,0.501809
2,TEST_00002,0.498192,0.501809
3,TEST_00003,0.498192,0.501809
4,TEST_00004,0.498192,0.501809


In [128]:
submit.to_csv('./baseline_submit_1.csv', index=False)