In [1]:
import numpy as np
import pandas as pd
import os
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import random
import warnings
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.calibration import calibration_curve

warnings.filterwarnings('ignore')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class Config:
    SR = 32000
    N_MFCC = 13
    ROOT_FOLDER = './'
    N_CLASSES = 2
    BATCH_SIZE = 96
    N_EPOCHS = 5
    LR = 3e-4
    SEED = 42

CONFIG = Config()

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG.SEED)

# 데이터 로드 및 전처리
df = pd.read_csv('./train.csv')

def label_encoder(column):
    le = LabelEncoder().fit(column)
    print(column.name, le.classes_)
    return le.transform(column)

df['class'] = label_encoder(df['label'])

# MFCC 특징 추출
def get_mfcc_feature(df, train_mode=True):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows()):
        y, sr = librosa.load(row['path'], sr=CONFIG.SR)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)

        if train_mode:
            label = row['class']
            labels.append(label)

    return features, labels

features, labels = get_mfcc_feature(df, True)
feature_df = pd.DataFrame({'features': features, 'class': labels})

X = np.array(feature_df['features'].tolist())
y = np.array(feature_df['class'].tolist())

# 데이터 불균형 해결
ros = RandomOverSampler(random_state=CONFIG.SEED)
X_resampled, y_resampled = ros.fit_resample(X, y)
y_resampled = torch.tensor(y_resampled).long()  # 정수형으로 변환
y_resampled = torch.nn.functional.one_hot(y_resampled, num_classes=CONFIG.N_CLASSES).float()

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=CONFIG.SEED)

# PyTorch Dataset 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, mfcc, label):
        self.mfcc = mfcc
        self.label = label

    def __len__(self):
        return len(self.mfcc)

    def __getitem__(self, index):
        return self.mfcc[index], self.label[index]

train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

# MLP 모델 정의
class MLP(nn.Module):
    def __init__(self, input_dim=CONFIG.N_MFCC, hidden_dims=[128, 256, 128], output_dim=CONFIG.N_CLASSES):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dims[0])
        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
        self.fc3 = nn.Linear(hidden_dims[1], hidden_dims[2])
        self.fc4 = nn.Linear(hidden_dims[2], output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return torch.sigmoid(x)

# 평가 함수 정의
def expected_calibration_error(y_true, y_prob, n_bins=10):
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy='uniform')
    bin_totals = np.histogram(y_prob, bins=np.linspace(0, 1, n_bins + 1), density=False)[0]
    non_empty_bins = bin_totals > 0
    bin_weights = bin_totals / len(y_prob)
    bin_weights = bin_weights[non_empty_bins]
    prob_true = prob_true[:len(bin_weights)]
    prob_pred = prob_pred[:len(bin_weights)]
    ece = np.sum(bin_weights * np.abs(prob_true - prob_pred))
    return ece

def auc_brier_ece(answer_df, submission_df):
    # Check for missing values in submission_df
    if submission_df.isnull().values.any():
        raise ValueError("The submission dataframe contains missing values.")

    # Check if the number and names of columns are the same in both dataframes
    if len(answer_df.columns) != len(submission_df.columns) or not all(answer_df.columns == submission_df.columns):
        raise ValueError("The columns of the answer and submission dataframes do not match.")
        
    submission_df = submission_df[submission_df.index.isin(answer_df.index)]
    submission_df.index = range(submission_df.shape[0])
    
    # Calculate AUC for each class
    auc_scores = []
    for column in answer_df.columns:
        y_true = answer_df[column]
        y_scores = submission_df[column]
        auc = roc_auc_score(y_true, y_scores)
        auc_scores.append(auc)

    # Calculate mean AUC
    mean_auc = np.mean(auc_scores)

    brier_scores = []
    ece_scores = []
    
    # Calculate Brier Score and ECE for each class
    for column in answer_df.columns:
        y_true = answer_df[column].values
        y_prob = submission_df[column].values
        
        # Brier Score
        brier = mean_squared_error(y_true, y_prob)
        brier_scores.append(brier)
        
        # ECE
        ece = expected_calibration_error(y_true, y_prob)
        ece_scores.append(ece)
    
    # Calculate mean Brier Score and mean ECE
    mean_brier = np.mean(brier_scores)
    mean_ece = np.mean(ece_scores)
    
    # Calculate combined score
    combined_score = 0.5 * (1 - mean_auc) + 0.25 * mean_brier + 0.25 * mean_ece
    
    return combined_score

# 모델 학습 함수 정의
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.BCELoss().to(device)
    
    best_val_score = 0
    best_model = None
    val_labels = y_val.cpu().numpy()
    
    for epoch in range(1, CONFIG.N_EPOCHS+1):
        model.train()
        train_loss = []
        for features, labels in tqdm(iter(train_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            optimizer.zero_grad()
            
            output = model(features)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score, val_outputs = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        
        combined_score = auc_brier_ece(pd.DataFrame(val_labels), pd.DataFrame(val_outputs))
        
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val AUC : [{_val_score:.5f}] Combined Score: [{combined_score:.5f}]')
            
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
    
    return best_model

def multiLabel_AUC(y_true, y_scores):
    auc_scores = []
    for i in range(y_true.shape[1]):
        auc = roc_auc_score(y_true[:, i], y_scores[:, i])
        auc_scores.append(auc)
    mean_auc_score = np.mean(auc_scores)
    return mean_auc_score
    
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss, all_labels, all_probs = [], [], []
    
    with torch.no_grad():
        for features, labels in tqdm(iter(val_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            probs = model(features)
            
            loss = criterion(probs, labels)

            val_loss.append(loss.item())

            all_labels.append(labels.cpu().numpy())
            all_probs.append(probs.cpu().numpy())
        
        _val_loss = np.mean(val_loss)

        all_labels = np.concatenate(all_labels, axis=0)
        all_probs = np.concatenate(all_probs, axis=0)
        
        auc_score = multiLabel_AUC(all_labels, all_probs)
    
    return _val_loss, auc_score, all_probs

model = MLP()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CONFIG.LR)

infer_model = train(model, optimizer, train_loader, val_loader, device)

# Unlabeled 데이터에 대해 pseudo-labeling
unlabeled_path = './unlabeled_data'
unlabeled_files = [os.path.join(unlabeled_path, f) for f in os.listdir(unlabeled_path) if f.endswith('.ogg')]

def get_mfcc_feature_from_files(file_paths):
    features = []
    for file_path in tqdm(file_paths):
        y, sr = librosa.load(file_path, sr=CONFIG.SR)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)
    return features

unlabeled_features = get_mfcc_feature_from_files(unlabeled_files)
unlabeled_features = np.array(unlabeled_features)
unlabeled_dataset = CustomDataset(unlabeled_features, torch.zeros((len(unlabeled_features), CONFIG.N_CLASSES)))
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

def pseudo_labeling(model, loader, device):
    model.to(device)
    model.eval()
    pseudo_labels = []
    with torch.no_grad():
        for features, _ in tqdm(iter(loader)):
            features = features.float().to(device)
            probs = model(features)
            pseudo_labels.append(probs.cpu().detach().numpy())
    return np.concatenate(pseudo_labels, axis=0)

pseudo_labels = pseudo_labeling(infer_model, unlabeled_loader, device)
pseudo_labels = (pseudo_labels > 0.5).astype(int)  # 임계값 0.5를 사용하여 pseudo-label 생성

# Pseudo-labeled 데이터를 학습 데이터에 추가
pseudo_labeled_dataset = CustomDataset(unlabeled_features, torch.tensor(pseudo_labels).float())
train_dataset_combined = torch.utils.data.ConcatDataset([train_dataset, pseudo_labeled_dataset])
train_loader_combined = DataLoader(train_dataset_combined, batch_size=CONFIG.BATCH_SIZE, shuffle=True)

# 모델을 pseudo-labeled 데이터를 포함하여 재학습
infer_model = train(model, optimizer, train_loader_combined, val_loader, device)

# 테스트 데이터 예측
test = pd.read_csv('./test.csv')
test_features, _ = get_mfcc_feature(test, False)
test_features = np.array(test_features)
test_dataset = CustomDataset(test_features, torch.zeros((len(test_features), CONFIG.N_CLASSES)))
test_loader = DataLoader(test_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for features, _ in tqdm(iter(test_loader)):
            features = features.float().to(device)
            probs = model(features)
            probs  = probs.cpu().detach().numpy()
            predictions += probs.tolist()
    return predictions

preds = inference(infer_model, test_loader, device)

submit = pd.read_csv('./sample_submission.csv')
submit.iloc[:, 1:] = preds
submit.head()

combined_score_before_test = auc_brier_ece(pd.DataFrame(y_val.cpu().numpy()), pd.DataFrame(preds))
print(f'Final Combined Score before test: {combined_score_before_test:.5f}')

submit.to_csv('./wy_origin.csv', index=False)


label ['fake' 'real']


55438it [23:41, 39.00it/s]
100%|██████████| 464/464 [00:06<00:00, 67.40it/s]
100%|██████████| 116/116 [00:00<00:00, 290.75it/s]


Epoch [1], Train Loss : [0.94270] Val Loss : [0.67075] Val AUC : [0.73635] Combined Score: [0.22962]


100%|██████████| 464/464 [00:07<00:00, 65.42it/s]
100%|██████████| 116/116 [00:00<00:00, 244.73it/s]


Epoch [2], Train Loss : [0.65205] Val Loss : [0.59030] Val AUC : [0.78640] Combined Score: [0.18217]


100%|██████████| 464/464 [00:07<00:00, 65.23it/s]
100%|██████████| 116/116 [00:00<00:00, 259.42it/s]


Epoch [3], Train Loss : [0.59321] Val Loss : [0.51249] Val AUC : [0.84822] Combined Score: [0.13870]


100%|██████████| 464/464 [00:06<00:00, 68.75it/s]
100%|██████████| 116/116 [00:00<00:00, 294.91it/s]


Epoch [4], Train Loss : [0.52001] Val Loss : [0.42587] Val AUC : [0.90458] Combined Score: [0.10156]


100%|██████████| 464/464 [00:07<00:00, 63.32it/s]
100%|██████████| 116/116 [00:00<00:00, 291.35it/s]


Epoch [5], Train Loss : [0.45694] Val Loss : [0.38388] Val AUC : [0.92793] Combined Score: [0.08886]


100%|██████████| 1264/1264 [00:36<00:00, 34.22it/s]
100%|██████████| 14/14 [00:00<00:00, 244.94it/s]
100%|██████████| 477/477 [00:06<00:00, 69.34it/s]
100%|██████████| 116/116 [00:00<00:00, 264.84it/s]


Epoch [1], Train Loss : [0.41048] Val Loss : [0.34715] Val AUC : [0.93900] Combined Score: [0.07658]


100%|██████████| 477/477 [00:07<00:00, 66.06it/s]
100%|██████████| 116/116 [00:00<00:00, 288.89it/s]


Epoch [2], Train Loss : [0.37720] Val Loss : [0.30946] Val AUC : [0.94859] Combined Score: [0.06571]


100%|██████████| 477/477 [00:07<00:00, 64.94it/s]
100%|██████████| 116/116 [00:00<00:00, 259.06it/s]


Epoch [3], Train Loss : [0.35390] Val Loss : [0.29682] Val AUC : [0.95573] Combined Score: [0.06273]


100%|██████████| 477/477 [00:06<00:00, 70.21it/s]
100%|██████████| 116/116 [00:00<00:00, 284.34it/s]


Epoch [4], Train Loss : [0.33200] Val Loss : [0.27937] Val AUC : [0.96103] Combined Score: [0.05734]


100%|██████████| 477/477 [00:07<00:00, 64.20it/s]
100%|██████████| 116/116 [00:00<00:00, 250.47it/s]


Epoch [5], Train Loss : [0.31640] Val Loss : [0.25648] Val AUC : [0.96589] Combined Score: [0.05019]


50000it [30:15, 27.54it/s]
100%|██████████| 521/521 [00:02<00:00, 177.81it/s]


Final Combined Score before test: 0.42355


In [2]:
import numpy as np
import pandas as pd
import os
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import random
import warnings
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.calibration import calibration_curve

warnings.filterwarnings('ignore')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class Config:
    SR = 32000
    N_MFCC = 13
    ROOT_FOLDER = './'
    N_CLASSES = 2
    BATCH_SIZE = 64
    N_EPOCHS = 10
    LR = 3e-4
    SEED = 42

CONFIG = Config()

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG.SEED)

# 데이터 로드 및 전처리
df = pd.read_csv('./train.csv')

def label_encoder(column):
    le = LabelEncoder().fit(column)
    print(column.name, le.classes_)
    return le.transform(column)

df['class'] = label_encoder(df['label'])

# MFCC 특징 추출
def get_mfcc_feature(df, train_mode=True):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows()):
        y, sr = librosa.load(row['path'], sr=CONFIG.SR)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)

        if train_mode:
            label = row['class']
            labels.append(label)

    return features, labels

features, labels = get_mfcc_feature(df, True)
feature_df = pd.DataFrame({'features': features, 'class': labels})

X = np.array(feature_df['features'].tolist())
y = np.array(feature_df['class'].tolist())

# 데이터 불균형 해결
ros = RandomOverSampler(random_state=CONFIG.SEED)
X_resampled, y_resampled = ros.fit_resample(X, y)
y_resampled = torch.tensor(y_resampled).long()  # 정수형으로 변환
y_resampled = torch.nn.functional.one_hot(y_resampled, num_classes=CONFIG.N_CLASSES).float()

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=CONFIG.SEED)

# PyTorch Dataset 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, mfcc, label):
        self.mfcc = mfcc
        self.label = label

    def __len__(self):
        return len(self.mfcc)

    def __getitem__(self, index):
        return self.mfcc[index], self.label[index]

train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

# MLP 모델 정의
class MLP(nn.Module):
    def __init__(self, input_dim=CONFIG.N_MFCC, hidden_dims=[128, 256, 128], output_dim=CONFIG.N_CLASSES):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dims[0])
        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
        self.fc3 = nn.Linear(hidden_dims[1], hidden_dims[2])
        self.fc4 = nn.Linear(hidden_dims[2], output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return torch.sigmoid(x)

# 평가 함수 정의
def expected_calibration_error(y_true, y_prob, n_bins=10):
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy='uniform')
    bin_totals = np.histogram(y_prob, bins=np.linspace(0, 1, n_bins + 1), density=False)[0]
    non_empty_bins = bin_totals > 0
    bin_weights = bin_totals / len(y_prob)
    bin_weights = bin_weights[non_empty_bins]
    prob_true = prob_true[:len(bin_weights)]
    prob_pred = prob_pred[:len(bin_weights)]
    ece = np.sum(bin_weights * np.abs(prob_true - prob_pred))
    return ece

def auc_brier_ece(answer_df, submission_df):
    # Check for missing values in submission_df
    if submission_df.isnull().values.any():
        raise ValueError("The submission dataframe contains missing values.")

    # Check if the number and names of columns are the same in both dataframes
    if len(answer_df.columns) != len(submission_df.columns) or not all(answer_df.columns == submission_df.columns):
        raise ValueError("The columns of the answer and submission dataframes do not match.")
        
    submission_df = submission_df[submission_df.index.isin(answer_df.index)]
    submission_df.index = range(submission_df.shape[0])
    
    # Calculate AUC for each class
    auc_scores = []
    for column in answer_df.columns:
        y_true = answer_df[column]
        y_scores = submission_df[column]
        auc = roc_auc_score(y_true, y_scores)
        auc_scores.append(auc)

    # Calculate mean AUC
    mean_auc = np.mean(auc_scores)

    brier_scores = []
    ece_scores = []
    
    # Calculate Brier Score and ECE for each class
    for column in answer_df.columns:
        y_true = answer_df[column].values
        y_prob = submission_df[column].values
        
        # Brier Score
        brier = mean_squared_error(y_true, y_prob)
        brier_scores.append(brier)
        
        # ECE
        ece = expected_calibration_error(y_true, y_prob)
        ece_scores.append(ece)
    
    # Calculate mean Brier Score and mean ECE
    mean_brier = np.mean(brier_scores)
    mean_ece = np.mean(ece_scores)
    
    # Calculate combined score
    combined_score = 0.5 * (1 - mean_auc) + 0.25 * mean_brier + 0.25 * mean_ece
    
    return combined_score

# 모델 학습 함수 정의
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.BCELoss().to(device)
    
    best_val_score = 0
    best_model = None
    val_labels = y_val.cpu().numpy()
    
    for epoch in range(1, CONFIG.N_EPOCHS+1):
        model.train()
        train_loss = []
        for features, labels in tqdm(iter(train_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            optimizer.zero_grad()
            
            output = model(features)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score, val_outputs = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        
        combined_score = auc_brier_ece(pd.DataFrame(val_labels), pd.DataFrame(val_outputs))
        
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val AUC : [{_val_score:.5f}] Combined Score: [{combined_score:.5f}]')
            
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
    
    return best_model

def multiLabel_AUC(y_true, y_scores):
    auc_scores = []
    for i in range(y_true.shape[1]):
        auc = roc_auc_score(y_true[:, i], y_scores[:, i])
        auc_scores.append(auc)
    mean_auc_score = np.mean(auc_scores)
    return mean_auc_score
    
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss, all_labels, all_probs = [], [], []
    
    with torch.no_grad():
        for features, labels in tqdm(iter(val_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            probs = model(features)
            
            loss = criterion(probs, labels)

            val_loss.append(loss.item())

            all_labels.append(labels.cpu().numpy())
            all_probs.append(probs.cpu().numpy())
        
        _val_loss = np.mean(val_loss)

        all_labels = np.concatenate(all_labels, axis=0)
        all_probs = np.concatenate(all_probs, axis=0)
        
        auc_score = multiLabel_AUC(all_labels, all_probs)
    
    return _val_loss, auc_score, all_probs

model = MLP()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CONFIG.LR)

infer_model = train(model, optimizer, train_loader, val_loader, device)

# Unlabeled 데이터에 대해 pseudo-labeling
unlabeled_path = './unlabeled_data'
unlabeled_files = [os.path.join(unlabeled_path, f) for f in os.listdir(unlabeled_path) if f.endswith('.ogg')]

def get_mfcc_feature_from_files(file_paths):
    features = []
    for file_path in tqdm(file_paths):
        y, sr = librosa.load(file_path, sr=CONFIG.SR)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)
    return features

unlabeled_features = get_mfcc_feature_from_files(unlabeled_files)
unlabeled_features = np.array(unlabeled_features)
unlabeled_dataset = CustomDataset(unlabeled_features, torch.zeros((len(unlabeled_features), CONFIG.N_CLASSES)))
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

def pseudo_labeling(model, loader, device):
    model.to(device)
    model.eval()
    pseudo_labels = []
    with torch.no_grad():
        for features, _ in tqdm(iter(loader)):
            features = features.float().to(device)
            probs = model(features)
            pseudo_labels.append(probs.cpu().detach().numpy())
    return np.concatenate(pseudo_labels, axis=0)

pseudo_labels = pseudo_labeling(infer_model, unlabeled_loader, device)
pseudo_labels = (pseudo_labels > 0.5).astype(int)  # 임계값 0.5를 사용하여 pseudo-label 생성

# Pseudo-labeled 데이터를 학습 데이터에 추가
pseudo_labeled_dataset = CustomDataset(unlabeled_features, torch.tensor(pseudo_labels).float())
train_dataset_combined = torch.utils.data.ConcatDataset([train_dataset, pseudo_labeled_dataset])
train_loader_combined = DataLoader(train_dataset_combined, batch_size=CONFIG.BATCH_SIZE, shuffle=True)

# 모델을 pseudo-labeled 데이터를 포함하여 재학습
infer_model = train(model, optimizer, train_loader_combined, val_loader, device)

# 테스트 데이터 예측
test = pd.read_csv('./test.csv')
test_features, _ = get_mfcc_feature(test, False)
test_features = np.array(test_features)
test_dataset = CustomDataset(test_features, torch.zeros((len(test_features), CONFIG.N_CLASSES)))
test_loader = DataLoader(test_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for features, _ in tqdm(iter(test_loader)):
            features = features.float().to(device)
            probs = model(features)
            probs  = probs.cpu().detach().numpy()
            predictions += probs.tolist()
    return predictions

preds = inference(infer_model, test_loader, device)

submit = pd.read_csv('./sample_submission.csv')
submit.iloc[:, 1:] = preds
submit.head()

combined_score_before_test = auc_brier_ece(pd.DataFrame(y_val.cpu().numpy()), pd.DataFrame(preds))
print(f'Final Combined Score before test: {combined_score_before_test:.5f}')

submit.to_csv('./wy_origin2.csv', index=False)


label ['fake' 'real']


55438it [28:55, 31.95it/s]
100%|██████████| 696/696 [00:11<00:00, 62.21it/s]
100%|██████████| 174/174 [00:00<00:00, 274.90it/s]


Epoch [1], Train Loss : [0.87780] Val Loss : [0.65412] Val AUC : [0.74489] Combined Score: [0.21925]


100%|██████████| 696/696 [00:11<00:00, 58.07it/s]
100%|██████████| 174/174 [00:00<00:00, 320.93it/s]


Epoch [2], Train Loss : [0.63537] Val Loss : [0.56339] Val AUC : [0.81182] Combined Score: [0.16542]


100%|██████████| 696/696 [00:12<00:00, 57.16it/s]
100%|██████████| 174/174 [00:00<00:00, 285.29it/s]


Epoch [3], Train Loss : [0.55121] Val Loss : [0.44588] Val AUC : [0.88725] Combined Score: [0.10770]


100%|██████████| 696/696 [00:12<00:00, 56.73it/s]
100%|██████████| 174/174 [00:00<00:00, 299.55it/s]


Epoch [4], Train Loss : [0.46634] Val Loss : [0.37568] Val AUC : [0.92543] Combined Score: [0.08540]


100%|██████████| 696/696 [00:12<00:00, 56.30it/s]
100%|██████████| 174/174 [00:00<00:00, 290.09it/s]


Epoch [5], Train Loss : [0.41052] Val Loss : [0.33201] Val AUC : [0.94245] Combined Score: [0.07279]


100%|██████████| 696/696 [00:11<00:00, 59.04it/s]
100%|██████████| 174/174 [00:00<00:00, 303.99it/s]


Epoch [6], Train Loss : [0.37403] Val Loss : [0.30222] Val AUC : [0.95167] Combined Score: [0.06280]


100%|██████████| 696/696 [00:12<00:00, 57.01it/s]
100%|██████████| 174/174 [00:00<00:00, 283.82it/s]


Epoch [7], Train Loss : [0.34988] Val Loss : [0.28586] Val AUC : [0.95751] Combined Score: [0.05857]


100%|██████████| 696/696 [00:11<00:00, 59.21it/s]
100%|██████████| 174/174 [00:00<00:00, 306.28it/s]


Epoch [8], Train Loss : [0.32653] Val Loss : [0.25715] Val AUC : [0.96418] Combined Score: [0.04965]


100%|██████████| 696/696 [00:11<00:00, 58.83it/s]
100%|██████████| 174/174 [00:00<00:00, 265.25it/s]


Epoch [9], Train Loss : [0.31062] Val Loss : [0.24741] Val AUC : [0.96846] Combined Score: [0.04870]


100%|██████████| 696/696 [00:11<00:00, 58.82it/s]
100%|██████████| 174/174 [00:00<00:00, 338.73it/s]


Epoch [10], Train Loss : [0.29382] Val Loss : [0.23366] Val AUC : [0.97143] Combined Score: [0.04462]


100%|██████████| 1264/1264 [00:44<00:00, 28.68it/s]
100%|██████████| 20/20 [00:00<00:00, 322.53it/s]
100%|██████████| 716/716 [00:11<00:00, 59.69it/s]
100%|██████████| 174/174 [00:00<00:00, 317.87it/s]


Epoch [1], Train Loss : [0.27828] Val Loss : [0.22648] Val AUC : [0.97458] Combined Score: [0.04370]


100%|██████████| 716/716 [00:12<00:00, 58.04it/s]
100%|██████████| 174/174 [00:00<00:00, 320.97it/s]


Epoch [2], Train Loss : [0.26739] Val Loss : [0.22028] Val AUC : [0.97755] Combined Score: [0.04311]


100%|██████████| 716/716 [00:12<00:00, 59.29it/s]
100%|██████████| 174/174 [00:00<00:00, 328.66it/s]


Epoch [3], Train Loss : [0.25693] Val Loss : [0.20918] Val AUC : [0.97926] Combined Score: [0.04036]


100%|██████████| 716/716 [00:12<00:00, 59.33it/s]
100%|██████████| 174/174 [00:00<00:00, 288.27it/s]


Epoch [4], Train Loss : [0.24997] Val Loss : [0.20056] Val AUC : [0.98116] Combined Score: [0.03831]


100%|██████████| 716/716 [00:12<00:00, 58.76it/s]
100%|██████████| 174/174 [00:00<00:00, 294.57it/s]


Epoch [5], Train Loss : [0.24129] Val Loss : [0.19078] Val AUC : [0.98282] Combined Score: [0.03564]


100%|██████████| 716/716 [00:12<00:00, 58.57it/s]
100%|██████████| 174/174 [00:00<00:00, 261.85it/s]


Epoch [6], Train Loss : [0.23482] Val Loss : [0.18649] Val AUC : [0.98319] Combined Score: [0.03370]


100%|██████████| 716/716 [00:12<00:00, 57.23it/s]
100%|██████████| 174/174 [00:00<00:00, 284.96it/s]


Epoch [7], Train Loss : [0.22982] Val Loss : [0.18193] Val AUC : [0.98460] Combined Score: [0.03361]


100%|██████████| 716/716 [00:12<00:00, 59.22it/s]
100%|██████████| 174/174 [00:00<00:00, 287.63it/s]


Epoch [8], Train Loss : [0.22343] Val Loss : [0.17411] Val AUC : [0.98517] Combined Score: [0.03117]


100%|██████████| 716/716 [00:12<00:00, 59.50it/s]
100%|██████████| 174/174 [00:00<00:00, 315.23it/s]


Epoch [9], Train Loss : [0.21432] Val Loss : [0.17081] Val AUC : [0.98645] Combined Score: [0.03060]


100%|██████████| 716/716 [00:12<00:00, 59.04it/s]
100%|██████████| 174/174 [00:00<00:00, 326.12it/s]


Epoch [10], Train Loss : [0.21156] Val Loss : [0.17021] Val AUC : [0.98680] Combined Score: [0.03117]


50000it [26:21, 31.61it/s]
100%|██████████| 782/782 [00:02<00:00, 368.50it/s]


Final Combined Score before test: 0.43873


In [3]:
import numpy as np
import pandas as pd
import os
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import random
import warnings
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.calibration import calibration_curve

warnings.filterwarnings('ignore')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class Config:
    SR = 32000
    N_MFCC = 13
    ROOT_FOLDER = './'
    N_CLASSES = 2
    BATCH_SIZE = 64
    N_EPOCHS = 20
    LR = 3e-4
    SEED = 42
    DROPOUT_RATE = 0.5
    HIDDEN_DIMS = [128, 256, 128]
    INITIALIZATION = 'xavier'
    ACTIVATION_FUNCTION = 'relu'

CONFIG = Config()

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG.SEED)

# 데이터 로드 및 전처리
df = pd.read_csv('./train.csv')

def label_encoder(column):
    le = LabelEncoder().fit(column)
    print(column.name, le.classes_)
    return le.transform(column)

df['class'] = label_encoder(df['label'])

# MFCC 특징 추출
def get_mfcc_feature(df, train_mode=True):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows()):
        y, sr = librosa.load(row['path'], sr=CONFIG.SR)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)

        if train_mode:
            label = row['class']
            labels.append(label)

    return features, labels

features, labels = get_mfcc_feature(df, True)
feature_df = pd.DataFrame({'features': features, 'class': labels})

X = np.array(feature_df['features'].tolist())
y = np.array(feature_df['class'].tolist())

# 데이터 불균형 해결
ros = RandomOverSampler(random_state=CONFIG.SEED)
X_resampled, y_resampled = ros.fit_resample(X, y)
y_resampled = torch.tensor(y_resampled).long()  # 정수형으로 변환
y_resampled = torch.nn.functional.one_hot(y_resampled, num_classes=CONFIG.N_CLASSES).float()

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=CONFIG.SEED)

# PyTorch Dataset 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, mfcc, label):
        self.mfcc = mfcc
        self.label = label

    def __len__(self):
        return len(self.mfcc)

    def __getitem__(self, index):
        return self.mfcc[index], self.label[index]

train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

# MLP 모델 정의
class MLP(nn.Module):
    def __init__(self, input_dim=CONFIG.N_MFCC, hidden_dims=CONFIG.HIDDEN_DIMS, output_dim=CONFIG.N_CLASSES):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dims[0])
        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
        self.fc3 = nn.Linear(hidden_dims[1], hidden_dims[2])
        self.fc4 = nn.Linear(hidden_dims[2], output_dim)
        
        if CONFIG.INITIALIZATION == 'xavier':
            nn.init.xavier_uniform_(self.fc1.weight)
            nn.init.xavier_uniform_(self.fc2.weight)
            nn.init.xavier_uniform_(self.fc3.weight)
            nn.init.xavier_uniform_(self.fc4.weight)
        
        self.activation = nn.ReLU() if CONFIG.ACTIVATION_FUNCTION == 'relu' else nn.Tanh()
        self.dropout = nn.Dropout(CONFIG.DROPOUT_RATE)

    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.dropout(x)
        x = self.activation(self.fc2(x))
        x = self.dropout(x)
        x = self.activation(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return torch.sigmoid(x)

# 평가 함수 정의
def expected_calibration_error(y_true, y_prob, n_bins=10):
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy='uniform')
    bin_totals = np.histogram(y_prob, bins=np.linspace(0, 1, n_bins + 1), density=False)[0]
    non_empty_bins = bin_totals > 0
    bin_weights = bin_totals / len(y_prob)
    bin_weights = bin_weights[non_empty_bins]
    prob_true = prob_true[:len(bin_weights)]
    prob_pred = prob_pred[:len(bin_weights)]
    ece = np.sum(bin_weights * np.abs(prob_true - prob_pred))
    return ece

def auc_brier_ece(answer_df, submission_df):
    # Check for missing values in submission_df
    if submission_df.isnull().values.any():
        raise ValueError("The submission dataframe contains missing values.")

    # Check if the number and names of columns are the same in both dataframes
    if len(answer_df.columns) != len(submission_df.columns) or not all(answer_df.columns == submission_df.columns):
        raise ValueError("The columns of the answer and submission dataframes do not match.")
        
    submission_df = submission_df[submission_df.index.isin(answer_df.index)]
    submission_df.index = range(submission_df.shape[0])
    
    # Calculate AUC for each class
    auc_scores = []
    for column in answer_df.columns:
        y_true = answer_df[column]
        y_scores = submission_df[column]
        auc = roc_auc_score(y_true, y_scores)
        auc_scores.append(auc)

    # Calculate mean AUC
    mean_auc = np.mean(auc_scores)

    brier_scores = []
    ece_scores = []
    
    # Calculate Brier Score and ECE for each class
    for column in answer_df.columns:
        y_true = answer_df[column].values
        y_prob = submission_df[column].values
        
        # Brier Score
        brier = mean_squared_error(y_true, y_prob)
        brier_scores.append(brier)
        
        # ECE
        ece = expected_calibration_error(y_true, y_prob)
        ece_scores.append(ece)
    
    # Calculate mean Brier Score and mean ECE
    mean_brier = np.mean(brier_scores)
    mean_ece = np.mean(ece_scores)
    
    # Calculate combined score
    combined_score = 0.5 * (1 - mean_auc) + 0.25 * mean_brier + 0.25 * mean_ece
    
    return combined_score

# 모델 학습 함수 정의
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.BCELoss().to(device)
    
    best_val_score = 0
    best_model = None
    val_labels = y_val.cpu().numpy()
    
    for epoch in range(1, CONFIG.N_EPOCHS+1):
        model.train()
        train_loss = []
        for features, labels in tqdm(iter(train_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            optimizer.zero_grad()
            
            output = model(features)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score, val_outputs = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        
        combined_score = auc_brier_ece(pd.DataFrame(val_labels), pd.DataFrame(val_outputs))
        
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val AUC : [{_val_score:.5f}] Combined Score: [{combined_score:.5f}]')
            
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
    
    return best_model

def multiLabel_AUC(y_true, y_scores):
    auc_scores = []
    for i in range(y_true.shape[1]):
        auc = roc_auc_score(y_true[:, i], y_scores[:, i])
        auc_scores.append(auc)
    mean_auc_score = np.mean(auc_scores)
    return mean_auc_score
    
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss, all_labels, all_probs = [], [], []
    
    with torch.no_grad():
        for features, labels in tqdm(iter(val_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            probs = model(features)
            
            loss = criterion(probs, labels)

            val_loss.append(loss.item())

            all_labels.append(labels.cpu().numpy())
            all_probs.append(probs.cpu().numpy())
        
        _val_loss = np.mean(val_loss)

        all_labels = np.concatenate(all_labels, axis=0)
        all_probs = np.concatenate(all_probs, axis=0)
        
        auc_score = multiLabel_AUC(all_labels, all_probs)
    
    return _val_loss, auc_score, all_probs

model = MLP()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CONFIG.LR)

infer_model = train(model, optimizer, train_loader, val_loader, device)

# Unlabeled 데이터에 대해 pseudo-labeling
unlabeled_path = './unlabeled_data'
unlabeled_files = [os.path.join(unlabeled_path, f) for f in os.listdir(unlabeled_path) if f.endswith('.ogg')]

def get_mfcc_feature_from_files(file_paths):
    features = []
    for file_path in tqdm(file_paths):
        y, sr = librosa.load(file_path, sr=CONFIG.SR)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)
    return features

unlabeled_features = get_mfcc_feature_from_files(unlabeled_files)
unlabeled_features = np.array(unlabeled_features)
unlabeled_dataset = CustomDataset(unlabeled_features, torch.zeros((len(unlabeled_features), CONFIG.N_CLASSES)))
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

def pseudo_labeling(model, loader, device):
    model.to(device)
    model.eval()
    pseudo_labels = []
    with torch.no_grad():
        for features, _ in tqdm(iter(loader)):
            features = features.float().to(device)
            probs = model(features)
            pseudo_labels.append(probs.cpu().detach().numpy())
    return np.concatenate(pseudo_labels, axis=0)

pseudo_labels = pseudo_labeling(infer_model, unlabeled_loader, device)
pseudo_labels = (pseudo_labels > 0.5).astype(int)  # 임계값 0.5를 사용하여 pseudo-label 생성

# Pseudo-labeled 데이터를 학습 데이터에 추가
pseudo_labeled_dataset = CustomDataset(unlabeled_features, torch.tensor(pseudo_labels).float())
train_dataset_combined = torch.utils.data.ConcatDataset([train_dataset, pseudo_labeled_dataset])
train_loader_combined = DataLoader(train_dataset_combined, batch_size=CONFIG.BATCH_SIZE, shuffle=True)

# 모델을 pseudo-labeled 데이터를 포함하여 재학습
infer_model = train(model, optimizer, train_loader_combined, val_loader, device)

# 테스트 데이터 예측
test = pd.read_csv('./test.csv')
test_features, _ = get_mfcc_feature(test, False)
test_features = np.array(test_features)
test_dataset = CustomDataset(test_features, torch.zeros((len(test_features), CONFIG.N_CLASSES)))
test_loader = DataLoader(test_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for features, _ in tqdm(iter(test_loader)):
            features = features.float().to(device)
            probs = model(features)
            probs  = probs.cpu().detach().numpy()
            predictions += probs.tolist()
    return predictions

preds = inference(infer_model, test_loader, device)

submit = pd.read_csv('./sample_submission.csv')
submit.iloc[:, 1:] = preds
submit.head()

combined_score_before_test = auc_brier_ece(pd.DataFrame(y_val.cpu().numpy()), pd.DataFrame(preds))
print(f'Final Combined Score before test: {combined_score_before_test:.5f}')

submit.to_csv('./wy_origin3.csv', index=False)


label ['fake' 'real']


55438it [24:16, 38.07it/s]
100%|██████████| 696/696 [00:09<00:00, 70.02it/s]
100%|██████████| 174/174 [00:00<00:00, 334.88it/s]


Epoch [1], Train Loss : [13.76120] Val Loss : [0.68369] Val AUC : [0.68417] Combined Score: [0.24231]


100%|██████████| 696/696 [00:09<00:00, 70.22it/s]
100%|██████████| 174/174 [00:00<00:00, 382.34it/s]


Epoch [2], Train Loss : [0.76569] Val Loss : [0.67165] Val AUC : [0.77531] Combined Score: [0.22073]


100%|██████████| 696/696 [00:10<00:00, 65.38it/s]
100%|██████████| 174/174 [00:00<00:00, 296.04it/s]


Epoch [3], Train Loss : [0.65150] Val Loss : [0.58006] Val AUC : [0.81407] Combined Score: [0.17394]


100%|██████████| 696/696 [00:11<00:00, 60.57it/s]
100%|██████████| 174/174 [00:00<00:00, 379.42it/s]


Epoch [4], Train Loss : [0.58384] Val Loss : [0.47576] Val AUC : [0.88562] Combined Score: [0.12220]


100%|██████████| 696/696 [00:11<00:00, 62.40it/s]
100%|██████████| 174/174 [00:00<00:00, 339.01it/s]


Epoch [5], Train Loss : [0.49719] Val Loss : [0.38398] Val AUC : [0.92688] Combined Score: [0.08830]


100%|██████████| 696/696 [00:10<00:00, 63.88it/s]
100%|██████████| 174/174 [00:00<00:00, 307.75it/s]


Epoch [6], Train Loss : [0.43591] Val Loss : [0.34149] Val AUC : [0.94184] Combined Score: [0.07572]


100%|██████████| 696/696 [00:10<00:00, 63.76it/s]
100%|██████████| 174/174 [00:00<00:00, 352.20it/s]


Epoch [7], Train Loss : [0.39083] Val Loss : [0.31077] Val AUC : [0.94933] Combined Score: [0.06575]


100%|██████████| 696/696 [00:10<00:00, 64.92it/s]
100%|██████████| 174/174 [00:00<00:00, 348.29it/s]


Epoch [8], Train Loss : [0.36146] Val Loss : [0.29272] Val AUC : [0.95634] Combined Score: [0.06228]


100%|██████████| 696/696 [00:11<00:00, 60.24it/s]
100%|██████████| 174/174 [00:00<00:00, 280.22it/s]


Epoch [9], Train Loss : [0.34131] Val Loss : [0.28096] Val AUC : [0.95814] Combined Score: [0.05738]


100%|██████████| 696/696 [00:10<00:00, 64.20it/s]
100%|██████████| 174/174 [00:00<00:00, 319.81it/s]


Epoch [10], Train Loss : [0.32129] Val Loss : [0.26162] Val AUC : [0.96514] Combined Score: [0.05213]


100%|██████████| 696/696 [00:11<00:00, 62.94it/s]
100%|██████████| 174/174 [00:00<00:00, 346.28it/s]


Epoch [11], Train Loss : [0.30790] Val Loss : [0.24941] Val AUC : [0.96847] Combined Score: [0.04938]


100%|██████████| 696/696 [00:11<00:00, 60.11it/s]
100%|██████████| 174/174 [00:00<00:00, 381.21it/s]


Epoch [12], Train Loss : [0.29026] Val Loss : [0.24346] Val AUC : [0.97440] Combined Score: [0.05032]


100%|██████████| 696/696 [00:10<00:00, 64.03it/s]
100%|██████████| 174/174 [00:00<00:00, 390.49it/s]


Epoch [13], Train Loss : [0.28270] Val Loss : [0.23494] Val AUC : [0.97466] Combined Score: [0.04653]


100%|██████████| 696/696 [00:10<00:00, 64.39it/s]
100%|██████████| 174/174 [00:00<00:00, 306.65it/s]


Epoch [14], Train Loss : [0.26548] Val Loss : [0.22550] Val AUC : [0.97814] Combined Score: [0.04516]


100%|██████████| 696/696 [00:11<00:00, 61.73it/s]
100%|██████████| 174/174 [00:00<00:00, 356.08it/s]


Epoch [15], Train Loss : [0.25931] Val Loss : [0.21376] Val AUC : [0.98015] Combined Score: [0.04245]


100%|██████████| 696/696 [00:10<00:00, 64.41it/s]
100%|██████████| 174/174 [00:00<00:00, 353.20it/s]


Epoch [16], Train Loss : [0.24903] Val Loss : [0.21477] Val AUC : [0.98101] Combined Score: [0.04249]


100%|██████████| 696/696 [00:10<00:00, 65.69it/s]
100%|██████████| 174/174 [00:00<00:00, 357.63it/s]


Epoch [17], Train Loss : [0.24253] Val Loss : [0.21053] Val AUC : [0.98187] Combined Score: [0.04202]


100%|██████████| 696/696 [00:11<00:00, 62.47it/s]
100%|██████████| 174/174 [00:00<00:00, 388.18it/s]


Epoch [18], Train Loss : [0.23560] Val Loss : [0.21224] Val AUC : [0.98318] Combined Score: [0.04330]


100%|██████████| 696/696 [00:10<00:00, 65.99it/s]
100%|██████████| 174/174 [00:00<00:00, 350.91it/s]


Epoch [19], Train Loss : [0.23001] Val Loss : [0.20410] Val AUC : [0.98349] Combined Score: [0.03935]


100%|██████████| 696/696 [00:10<00:00, 65.49it/s]
100%|██████████| 174/174 [00:00<00:00, 368.10it/s]


Epoch [20], Train Loss : [0.22663] Val Loss : [0.18932] Val AUC : [0.98558] Combined Score: [0.03675]


100%|██████████| 1264/1264 [00:37<00:00, 33.74it/s]
100%|██████████| 20/20 [00:00<00:00, 333.33it/s]
100%|██████████| 716/716 [00:11<00:00, 61.62it/s]
100%|██████████| 174/174 [00:00<00:00, 350.40it/s]


Epoch [1], Train Loss : [0.21994] Val Loss : [0.19883] Val AUC : [0.98386] Combined Score: [0.03820]


100%|██████████| 716/716 [00:11<00:00, 63.71it/s]
100%|██████████| 174/174 [00:00<00:00, 358.15it/s]


Epoch [2], Train Loss : [0.21446] Val Loss : [0.18575] Val AUC : [0.98705] Combined Score: [0.03661]


100%|██████████| 716/716 [00:11<00:00, 60.29it/s]
100%|██████████| 174/174 [00:00<00:00, 348.83it/s]


Epoch [3], Train Loss : [0.21128] Val Loss : [0.18172] Val AUC : [0.98725] Combined Score: [0.03538]


100%|██████████| 716/716 [00:11<00:00, 63.26it/s]
100%|██████████| 174/174 [00:00<00:00, 358.78it/s]


Epoch [4], Train Loss : [0.20394] Val Loss : [0.19185] Val AUC : [0.98587] Combined Score: [0.03767]


100%|██████████| 716/716 [00:11<00:00, 62.08it/s]
100%|██████████| 174/174 [00:00<00:00, 370.14it/s]


Epoch [5], Train Loss : [0.20456] Val Loss : [0.18776] Val AUC : [0.98770] Combined Score: [0.03714]


100%|██████████| 716/716 [00:12<00:00, 58.93it/s]
100%|██████████| 174/174 [00:00<00:00, 390.05it/s]


Epoch [6], Train Loss : [0.20011] Val Loss : [0.17681] Val AUC : [0.98797] Combined Score: [0.03449]


100%|██████████| 716/716 [00:11<00:00, 62.88it/s]
100%|██████████| 174/174 [00:00<00:00, 287.21it/s]


Epoch [7], Train Loss : [0.19806] Val Loss : [0.17749] Val AUC : [0.98845] Combined Score: [0.03418]


100%|██████████| 716/716 [00:11<00:00, 62.37it/s]
100%|██████████| 174/174 [00:00<00:00, 323.28it/s]


Epoch [8], Train Loss : [0.19395] Val Loss : [0.17574] Val AUC : [0.98730] Combined Score: [0.03308]


100%|██████████| 716/716 [00:11<00:00, 60.88it/s]
100%|██████████| 174/174 [00:00<00:00, 387.69it/s]


Epoch [9], Train Loss : [0.18994] Val Loss : [0.16444] Val AUC : [0.98917] Combined Score: [0.03118]


100%|██████████| 716/716 [00:11<00:00, 62.78it/s]
100%|██████████| 174/174 [00:00<00:00, 388.94it/s]


Epoch [10], Train Loss : [0.19236] Val Loss : [0.17345] Val AUC : [0.98833] Combined Score: [0.03333]


100%|██████████| 716/716 [00:11<00:00, 62.61it/s]
100%|██████████| 174/174 [00:00<00:00, 331.93it/s]


Epoch [11], Train Loss : [0.18844] Val Loss : [0.16507] Val AUC : [0.98999] Combined Score: [0.03195]


100%|██████████| 716/716 [00:11<00:00, 61.75it/s]
100%|██████████| 174/174 [00:00<00:00, 320.39it/s]


Epoch [12], Train Loss : [0.18367] Val Loss : [0.15581] Val AUC : [0.98969] Combined Score: [0.02859]


100%|██████████| 716/716 [00:11<00:00, 63.76it/s]
100%|██████████| 174/174 [00:00<00:00, 371.34it/s]


Epoch [13], Train Loss : [0.18518] Val Loss : [0.15952] Val AUC : [0.98963] Combined Score: [0.03010]


100%|██████████| 716/716 [00:11<00:00, 61.59it/s]
100%|██████████| 174/174 [00:00<00:00, 305.74it/s]


Epoch [14], Train Loss : [0.18593] Val Loss : [0.15872] Val AUC : [0.98986] Combined Score: [0.03057]


100%|██████████| 716/716 [00:10<00:00, 65.17it/s]
100%|██████████| 174/174 [00:00<00:00, 351.86it/s]


Epoch [15], Train Loss : [0.17915] Val Loss : [0.15433] Val AUC : [0.99024] Combined Score: [0.02840]


100%|██████████| 716/716 [00:11<00:00, 64.90it/s]
100%|██████████| 174/174 [00:00<00:00, 368.82it/s]


Epoch [16], Train Loss : [0.17972] Val Loss : [0.15008] Val AUC : [0.99096] Combined Score: [0.02739]


100%|██████████| 716/716 [00:11<00:00, 61.88it/s]
100%|██████████| 174/174 [00:00<00:00, 377.30it/s]


Epoch [17], Train Loss : [0.17757] Val Loss : [0.14832] Val AUC : [0.99075] Combined Score: [0.02664]


100%|██████████| 716/716 [00:11<00:00, 64.69it/s]
100%|██████████| 174/174 [00:00<00:00, 368.58it/s]


Epoch [18], Train Loss : [0.17854] Val Loss : [0.15655] Val AUC : [0.99022] Combined Score: [0.02873]


100%|██████████| 716/716 [00:11<00:00, 63.84it/s]
100%|██████████| 174/174 [00:00<00:00, 347.65it/s]


Epoch [19], Train Loss : [0.17488] Val Loss : [0.14960] Val AUC : [0.99080] Combined Score: [0.02729]


100%|██████████| 716/716 [00:11<00:00, 62.14it/s]
100%|██████████| 174/174 [00:00<00:00, 379.10it/s]


Epoch [20], Train Loss : [0.17354] Val Loss : [0.15479] Val AUC : [0.99093] Combined Score: [0.02924]


50000it [24:46, 33.64it/s]
100%|██████████| 782/782 [00:02<00:00, 355.11it/s]


Final Combined Score before test: 0.43518


In [4]:
import numpy as np
import pandas as pd
import os
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import random
import warnings
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.calibration import calibration_curve

warnings.filterwarnings('ignore')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class Config:
    SR = 32000
    N_MFCC = 13
    ROOT_FOLDER = './'
    N_CLASSES = 2
    BATCH_SIZE = 64
    N_EPOCHS = 15
    LR = 3e-4
    SEED = 42
    DROPOUT_RATE = 0.5
    HIDDEN_DIMS = [128, 256, 128]
    INITIALIZATION = 'xavier'
    ACTIVATION_FUNCTION = 'relu'

CONFIG = Config()

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG.SEED)

# 데이터 로드 및 전처리
df = pd.read_csv('./train.csv')

def label_encoder(column):
    le = LabelEncoder().fit(column)
    print(column.name, le.classes_)
    return le.transform(column)

df['class'] = label_encoder(df['label'])

# MFCC 특징 추출
def get_mfcc_feature(df, train_mode=True):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows()):
        y, sr = librosa.load(row['path'], sr=CONFIG.SR)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)

        if train_mode:
            label = row['class']
            labels.append(label)

    return features, labels

features, labels = get_mfcc_feature(df, True)
feature_df = pd.DataFrame({'features': features, 'class': labels})

X = np.array(feature_df['features'].tolist())
y = np.array(feature_df['class'].tolist())

# 데이터 불균형 해결
ros = RandomOverSampler(random_state=CONFIG.SEED)
X_resampled, y_resampled = ros.fit_resample(X, y)
y_resampled = torch.tensor(y_resampled).long()  # 정수형으로 변환
y_resampled = torch.nn.functional.one_hot(y_resampled, num_classes=CONFIG.N_CLASSES).float()

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=CONFIG.SEED)

# PyTorch Dataset 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, mfcc, label):
        self.mfcc = mfcc
        self.label = label

    def __len__(self):
        return len(self.mfcc)

    def __getitem__(self, index):
        return self.mfcc[index], self.label[index]

train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

# MLP 모델 정의
class MLP(nn.Module):
    def __init__(self, input_dim=CONFIG.N_MFCC, hidden_dims=CONFIG.HIDDEN_DIMS, output_dim=CONFIG.N_CLASSES):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dims[0])
        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
        self.fc3 = nn.Linear(hidden_dims[1], hidden_dims[2])
        self.fc4 = nn.Linear(hidden_dims[2], output_dim)
        
        if CONFIG.INITIALIZATION == 'xavier':
            nn.init.xavier_uniform_(self.fc1.weight)
            nn.init.xavier_uniform_(self.fc2.weight)
            nn.init.xavier_uniform_(self.fc3.weight)
            nn.init.xavier_uniform_(self.fc4.weight)
        
        self.activation = nn.ReLU() if CONFIG.ACTIVATION_FUNCTION == 'relu' else nn.Tanh()
        self.dropout = nn.Dropout(CONFIG.DROPOUT_RATE)

    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.dropout(x)
        x = self.activation(self.fc2(x))
        x = self.dropout(x)
        x = self.activation(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return torch.sigmoid(x)

# 평가 함수 정의
def expected_calibration_error(y_true, y_prob, n_bins=10):
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy='uniform')
    bin_totals = np.histogram(y_prob, bins=np.linspace(0, 1, n_bins + 1), density=False)[0]
    non_empty_bins = bin_totals > 0
    bin_weights = bin_totals / len(y_prob)
    bin_weights = bin_weights[non_empty_bins]
    prob_true = prob_true[:len(bin_weights)]
    prob_pred = prob_pred[:len(bin_weights)]
    ece = np.sum(bin_weights * np.abs(prob_true - prob_pred))
    return ece

def auc_brier_ece(answer_df, submission_df):
    # Check for missing values in submission_df
    if submission_df.isnull().values.any():
        raise ValueError("The submission dataframe contains missing values.")

    # Check if the number and names of columns are the same in both dataframes
    if len(answer_df.columns) != len(submission_df.columns) or not all(answer_df.columns == submission_df.columns):
        raise ValueError("The columns of the answer and submission dataframes do not match.")
        
    submission_df = submission_df[submission_df.index.isin(answer_df.index)]
    submission_df.index = range(submission_df.shape[0])
    
    # Calculate AUC for each class
    auc_scores = []
    for column in answer_df.columns:
        y_true = answer_df[column]
        y_scores = submission_df[column]
        auc = roc_auc_score(y_true, y_scores)
        auc_scores.append(auc)

    # Calculate mean AUC
    mean_auc = np.mean(auc_scores)

    brier_scores = []
    ece_scores = []
    
    # Calculate Brier Score and ECE for each class
    for column in answer_df.columns:
        y_true = answer_df[column].values
        y_prob = submission_df[column].values
        
        # Brier Score
        brier = mean_squared_error(y_true, y_prob)
        brier_scores.append(brier)
        
        # ECE
        ece = expected_calibration_error(y_true, y_prob)
        ece_scores.append(ece)
    
    # Calculate mean Brier Score and mean ECE
    mean_brier = np.mean(brier_scores)
    mean_ece = np.mean(ece_scores)
    
    # Calculate combined score
    combined_score = 0.5 * (1 - mean_auc) + 0.25 * mean_brier + 0.25 * mean_ece
    
    return combined_score

# 모델 학습 함수 정의
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.BCELoss().to(device)
    
    best_val_score = 0
    best_model = None
    val_labels = y_val.cpu().numpy()
    
    for epoch in range(1, CONFIG.N_EPOCHS+1):
        model.train()
        train_loss = []
        for features, labels in tqdm(iter(train_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            optimizer.zero_grad()
            
            output = model(features)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score, val_outputs = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        
        combined_score = auc_brier_ece(pd.DataFrame(val_labels), pd.DataFrame(val_outputs))
        
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val AUC : [{_val_score:.5f}] Combined Score: [{combined_score:.5f}]')
            
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
    
    return best_model

def multiLabel_AUC(y_true, y_scores):
    auc_scores = []
    for i in range(y_true.shape[1]):
        auc = roc_auc_score(y_true[:, i], y_scores[:, i])
        auc_scores.append(auc)
    mean_auc_score = np.mean(auc_scores)
    return mean_auc_score
    
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss, all_labels, all_probs = [], [], []
    
    with torch.no_grad():
        for features, labels in tqdm(iter(val_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            probs = model(features)
            
            loss = criterion(probs, labels)

            val_loss.append(loss.item())

            all_labels.append(labels.cpu().numpy())
            all_probs.append(probs.cpu().numpy())
        
        _val_loss = np.mean(val_loss)

        all_labels = np.concatenate(all_labels, axis=0)
        all_probs = np.concatenate(all_probs, axis=0)
        
        auc_score = multiLabel_AUC(all_labels, all_probs)
    
    return _val_loss, auc_score, all_probs

model = MLP()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CONFIG.LR)

infer_model = train(model, optimizer, train_loader, val_loader, device)

# Unlabeled 데이터에 대해 pseudo-labeling
unlabeled_path = './unlabeled_data'
unlabeled_files = [os.path.join(unlabeled_path, f) for f in os.listdir(unlabeled_path) if f.endswith('.ogg')]

def get_mfcc_feature_from_files(file_paths):
    features = []
    for file_path in tqdm(file_paths):
        y, sr = librosa.load(file_path, sr=CONFIG.SR)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc = np.mean(mfcc.T, axis=0)
        features.append(mfcc)
    return features

unlabeled_features = get_mfcc_feature_from_files(unlabeled_files)
unlabeled_features = np.array(unlabeled_features)
unlabeled_dataset = CustomDataset(unlabeled_features, torch.zeros((len(unlabeled_features), CONFIG.N_CLASSES)))
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

def pseudo_labeling(model, loader, device):
    model.to(device)
    model.eval()
    pseudo_labels = []
    with torch.no_grad():
        for features, _ in tqdm(iter(loader)):
            features = features.float().to(device)
            probs = model(features)
            pseudo_labels.append(probs.cpu().detach().numpy())
    return np.concatenate(pseudo_labels, axis=0)

pseudo_labels = pseudo_labeling(infer_model, unlabeled_loader, device)
pseudo_labels = (pseudo_labels > 0.5).astype(int)  # 임계값 0.5를 사용하여 pseudo-label 생성

# Pseudo-labeled 데이터를 학습 데이터에 추가
pseudo_labeled_dataset = CustomDataset(unlabeled_features, torch.tensor(pseudo_labels).float())
train_dataset_combined = torch.utils.data.ConcatDataset([train_dataset, pseudo_labeled_dataset])
train_loader_combined = DataLoader(train_dataset_combined, batch_size=CONFIG.BATCH_SIZE, shuffle=True)

# 모델을 pseudo-labeled 데이터를 포함하여 재학습
infer_model = train(model, optimizer, train_loader_combined, val_loader, device)

# 테스트 데이터 예측
test = pd.read_csv('./test.csv')
test_features, _ = get_mfcc_feature(test, False)
test_features = np.array(test_features)
test_dataset = CustomDataset(test_features, torch.zeros((len(test_features), CONFIG.N_CLASSES)))
test_loader = DataLoader(test_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for features, _ in tqdm(iter(test_loader)):
            features = features.float().to(device)
            probs = model(features)
            probs  = probs.cpu().detach().numpy()
            predictions += probs.tolist()
    return predictions

preds = inference(infer_model, test_loader, device)

submit = pd.read_csv('./sample_submission.csv')
submit.iloc[:, 1:] = preds
submit.head()

combined_score_before_test = auc_brier_ece(pd.DataFrame(y_val.cpu().numpy()), pd.DataFrame(preds))
print(f'Final Combined Score before test: {combined_score_before_test:.5f}')

submit.to_csv('./wy_origin3.csv', index=False)


label ['fake' 'real']


55438it [26:19, 35.11it/s]
100%|██████████| 696/696 [00:12<00:00, 55.29it/s]
100%|██████████| 174/174 [00:00<00:00, 205.89it/s]


Epoch [1], Train Loss : [13.76120] Val Loss : [0.68369] Val AUC : [0.68417] Combined Score: [0.24231]


100%|██████████| 696/696 [00:12<00:00, 54.83it/s]
100%|██████████| 174/174 [00:00<00:00, 222.53it/s]


Epoch [2], Train Loss : [0.76569] Val Loss : [0.67165] Val AUC : [0.77531] Combined Score: [0.22073]


100%|██████████| 696/696 [00:13<00:00, 50.57it/s]
100%|██████████| 174/174 [00:00<00:00, 311.19it/s]


Epoch [3], Train Loss : [0.65150] Val Loss : [0.58006] Val AUC : [0.81407] Combined Score: [0.17394]


100%|██████████| 696/696 [00:14<00:00, 48.94it/s]
100%|██████████| 174/174 [00:00<00:00, 311.43it/s]


Epoch [4], Train Loss : [0.58384] Val Loss : [0.47576] Val AUC : [0.88562] Combined Score: [0.12220]


100%|██████████| 696/696 [00:13<00:00, 50.06it/s]
100%|██████████| 174/174 [00:00<00:00, 308.13it/s]


Epoch [5], Train Loss : [0.49719] Val Loss : [0.38398] Val AUC : [0.92688] Combined Score: [0.08830]


100%|██████████| 696/696 [00:13<00:00, 51.94it/s]
100%|██████████| 174/174 [00:00<00:00, 313.78it/s]


Epoch [6], Train Loss : [0.43591] Val Loss : [0.34149] Val AUC : [0.94184] Combined Score: [0.07572]


100%|██████████| 696/696 [00:13<00:00, 51.71it/s]
100%|██████████| 174/174 [00:00<00:00, 320.92it/s]


Epoch [7], Train Loss : [0.39083] Val Loss : [0.31077] Val AUC : [0.94933] Combined Score: [0.06575]


100%|██████████| 696/696 [00:14<00:00, 46.84it/s]
100%|██████████| 174/174 [00:00<00:00, 228.36it/s]


Epoch [8], Train Loss : [0.36146] Val Loss : [0.29272] Val AUC : [0.95634] Combined Score: [0.06228]


100%|██████████| 696/696 [00:14<00:00, 47.32it/s]
100%|██████████| 174/174 [00:00<00:00, 261.97it/s]


Epoch [9], Train Loss : [0.34131] Val Loss : [0.28096] Val AUC : [0.95814] Combined Score: [0.05738]


100%|██████████| 696/696 [00:13<00:00, 50.43it/s]
100%|██████████| 174/174 [00:00<00:00, 290.40it/s]


Epoch [10], Train Loss : [0.32129] Val Loss : [0.26162] Val AUC : [0.96514] Combined Score: [0.05213]


100%|██████████| 696/696 [00:13<00:00, 50.47it/s]
100%|██████████| 174/174 [00:00<00:00, 294.85it/s]


Epoch [11], Train Loss : [0.30790] Val Loss : [0.24941] Val AUC : [0.96847] Combined Score: [0.04938]


100%|██████████| 696/696 [00:13<00:00, 51.10it/s]
100%|██████████| 174/174 [00:00<00:00, 270.24it/s]


Epoch [12], Train Loss : [0.29026] Val Loss : [0.24346] Val AUC : [0.97440] Combined Score: [0.05032]


100%|██████████| 696/696 [00:13<00:00, 50.60it/s]
100%|██████████| 174/174 [00:00<00:00, 293.72it/s]


Epoch [13], Train Loss : [0.28270] Val Loss : [0.23494] Val AUC : [0.97466] Combined Score: [0.04653]


100%|██████████| 696/696 [00:13<00:00, 51.63it/s]
100%|██████████| 174/174 [00:00<00:00, 322.42it/s]


Epoch [14], Train Loss : [0.26548] Val Loss : [0.22550] Val AUC : [0.97814] Combined Score: [0.04516]


100%|██████████| 696/696 [00:13<00:00, 52.14it/s]
100%|██████████| 174/174 [00:00<00:00, 292.72it/s]


Epoch [15], Train Loss : [0.25931] Val Loss : [0.21376] Val AUC : [0.98015] Combined Score: [0.04245]


100%|██████████| 1264/1264 [00:44<00:00, 28.16it/s]
100%|██████████| 20/20 [00:00<00:00, 258.00it/s]
100%|██████████| 716/716 [00:13<00:00, 51.37it/s]
100%|██████████| 174/174 [00:00<00:00, 309.62it/s]


Epoch [1], Train Loss : [0.24764] Val Loss : [0.21982] Val AUC : [0.98047] Combined Score: [0.04470]


100%|██████████| 716/716 [00:14<00:00, 50.10it/s]
100%|██████████| 174/174 [00:00<00:00, 307.89it/s]


Epoch [2], Train Loss : [0.23730] Val Loss : [0.21682] Val AUC : [0.98137] Combined Score: [0.04413]


100%|██████████| 716/716 [00:14<00:00, 51.06it/s]
100%|██████████| 174/174 [00:00<00:00, 227.19it/s]


Epoch [3], Train Loss : [0.23084] Val Loss : [0.20547] Val AUC : [0.98305] Combined Score: [0.04102]


100%|██████████| 716/716 [00:14<00:00, 49.73it/s]
100%|██████████| 174/174 [00:00<00:00, 319.97it/s]


Epoch [4], Train Loss : [0.22646] Val Loss : [0.20406] Val AUC : [0.98373] Combined Score: [0.04137]


100%|██████████| 716/716 [00:13<00:00, 51.15it/s]
100%|██████████| 174/174 [00:00<00:00, 346.88it/s]


Epoch [5], Train Loss : [0.22016] Val Loss : [0.19438] Val AUC : [0.98326] Combined Score: [0.03664]


100%|██████████| 716/716 [00:14<00:00, 49.87it/s]
100%|██████████| 174/174 [00:00<00:00, 281.49it/s]


Epoch [6], Train Loss : [0.21939] Val Loss : [0.19570] Val AUC : [0.98457] Combined Score: [0.03802]


100%|██████████| 716/716 [00:14<00:00, 50.45it/s]
100%|██████████| 174/174 [00:00<00:00, 303.58it/s]


Epoch [7], Train Loss : [0.21552] Val Loss : [0.20258] Val AUC : [0.98421] Combined Score: [0.04117]


100%|██████████| 716/716 [00:14<00:00, 51.14it/s]
100%|██████████| 174/174 [00:00<00:00, 321.13it/s]


Epoch [8], Train Loss : [0.20859] Val Loss : [0.19084] Val AUC : [0.98494] Combined Score: [0.03673]


100%|██████████| 716/716 [00:14<00:00, 50.33it/s]
100%|██████████| 174/174 [00:00<00:00, 280.12it/s]


Epoch [9], Train Loss : [0.20639] Val Loss : [0.19010] Val AUC : [0.98662] Combined Score: [0.03803]


100%|██████████| 716/716 [00:14<00:00, 50.87it/s]
100%|██████████| 174/174 [00:00<00:00, 292.96it/s]


Epoch [10], Train Loss : [0.20302] Val Loss : [0.18138] Val AUC : [0.98674] Combined Score: [0.03489]


100%|██████████| 716/716 [00:11<00:00, 61.47it/s]
100%|██████████| 174/174 [00:00<00:00, 334.80it/s]


Epoch [11], Train Loss : [0.20021] Val Loss : [0.18377] Val AUC : [0.98687] Combined Score: [0.03554]


100%|██████████| 716/716 [00:11<00:00, 62.78it/s]
100%|██████████| 174/174 [00:00<00:00, 348.99it/s]


Epoch [12], Train Loss : [0.20069] Val Loss : [0.17884] Val AUC : [0.98803] Combined Score: [0.03513]


100%|██████████| 716/716 [00:11<00:00, 63.02it/s]
100%|██████████| 174/174 [00:00<00:00, 365.67it/s]


Epoch [13], Train Loss : [0.19265] Val Loss : [0.17748] Val AUC : [0.98808] Combined Score: [0.03413]


100%|██████████| 716/716 [00:11<00:00, 62.87it/s]
100%|██████████| 174/174 [00:00<00:00, 361.64it/s]


Epoch [14], Train Loss : [0.18810] Val Loss : [0.16575] Val AUC : [0.98888] Combined Score: [0.03114]


100%|██████████| 716/716 [00:11<00:00, 63.45it/s]
100%|██████████| 174/174 [00:00<00:00, 386.98it/s]


Epoch [15], Train Loss : [0.19060] Val Loss : [0.15917] Val AUC : [0.98934] Combined Score: [0.02912]


50000it [24:30, 34.00it/s]
100%|██████████| 782/782 [00:01<00:00, 409.81it/s]


Final Combined Score before test: 0.43702


In [1]:
import numpy as np
import pandas as pd
import os
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import random
import warnings
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.calibration import calibration_curve

warnings.filterwarnings('ignore')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class Config:
    SR = 32000
    N_MFCC = 13
    N_FEATURES = 64
    ROOT_FOLDER = './'
    N_CLASSES = 2
    BATCH_SIZE = 96
    N_EPOCHS = 5
    LR = 3e-4
    SEED = 42
    DROPOUT_RATE = 0.5
    HIDDEN_DIMS = [128, 256, 128]
    INITIALIZATION = 'xavier'
    ACTIVATION_FUNCTION = 'relu'

CONFIG = Config()

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG.SEED)

# 데이터 로드 및 전처리
df = pd.read_csv(os.path.join(CONFIG.ROOT_FOLDER, 'train.csv'))

def label_encoder(column):
    le = LabelEncoder().fit(column)
    print(column.name, le.classes_)
    return le.transform(column)

df['class'] = label_encoder(df['label'])

# 음성 특징 추출 함수
def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=CONFIG.SR)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
    
    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
    return features

def get_features_and_labels(df, train_mode=True):
    features = []
    labels = []
    for _, row in tqdm(df.iterrows()):
        file_path = os.path.join(CONFIG.ROOT_FOLDER, row['path'][2:])  # './train/' 또는 './test/' 제거
        if os.path.isfile(file_path):
            features.append(extract_features(file_path))
            if train_mode:
                labels.append(row['class'])
    if train_mode:
        return np.array(features), np.array(labels)
    return np.array(features)

# Train 데이터에서 특징 추출
X, y = get_features_and_labels(df, True)

# 데이터 불균형 해결
smote = SMOTE(random_state=CONFIG.SEED)
X_resampled, y_resampled = smote.fit_resample(X.reshape(len(X), -1), y)
X_resampled = X_resampled.reshape(len(X_resampled), CONFIG.N_FEATURES, -1)
y_resampled = torch.tensor(y_resampled).long()  # 정수형으로 변환
y_resampled = torch.nn.functional.one_hot(y_resampled, num_classes=CONFIG.N_CLASSES).float()

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=CONFIG.SEED)

# PyTorch Dataset 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return self.features[index], self.labels[index]

train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

# CNN+RNN 모델 정의
class CNNRNN(nn.Module):
    def __init__(self, input_dim=CONFIG.N_FEATURES, hidden_dims=CONFIG.HIDDEN_DIMS, output_dim=CONFIG.N_CLASSES):
        super(CNNRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(CONFIG.DROPOUT_RATE),
            
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(CONFIG.DROPOUT_RATE)
        )
        
        self.rnn = nn.LSTM(input_size=64*8, hidden_size=hidden_dims[0], num_layers=2, batch_first=True, dropout=CONFIG.DROPOUT_RATE, bidirectional=True)
        
        self.fc = nn.Sequential(
            nn.Linear(hidden_dims[0]*2, hidden_dims[1]),
            nn.ReLU(),
            nn.Dropout(CONFIG.DROPOUT_RATE),
            nn.Linear(hidden_dims[1], hidden_dims[2]),
            nn.ReLU(),
            nn.Dropout(CONFIG.DROPOUT_RATE),
            nn.Linear(hidden_dims[2], output_dim),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        batch_size = x.size(0)
        x = x.unsqueeze(1)  # (batch_size, 1, n_features, time_steps)
        x = self.cnn(x)
        x = x.view(batch_size, 64*8, -1).permute(0, 2, 1)  # (batch_size, time_steps, 64*8)
        x, _ = self.rnn(x)
        x = x[:, -1, :]  # 마지막 타임 스텝의 출력
        x = self.fc(x)
        return x

# 평가 함수 정의
def expected_calibration_error(y_true, y_prob, n_bins=10):
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy='uniform')
    bin_totals = np.histogram(y_prob, bins=np.linspace(0, 1, n_bins + 1), density=False)[0]
    non_empty_bins = bin_totals > 0
    bin_weights = bin_totals / len(y_prob)
    bin_weights = bin_weights[non_empty_bins]
    prob_true = prob_true[:len(bin_weights)]
    prob_pred = prob_pred[:len(bin_weights)]
    ece = np.sum(bin_weights * np.abs(prob_true - prob_pred))
    return ece

def auc_brier_ece(answer_df, submission_df):
    # Check for missing values in submission_df
    if submission_df.isnull().values.any():
        raise ValueError("The submission dataframe contains missing values.")

    # Check if the number and names of columns are the same in both dataframes
    if len(answer_df.columns) != len(submission_df.columns) or not all(answer_df.columns == submission_df.columns):
        raise ValueError("The columns of the answer and submission dataframes do not match.")
        
    submission_df = submission_df[submission_df.index.isin(answer_df.index)]
    submission_df.index = range(submission_df.shape[0])
    
    # Calculate AUC for each class
    auc_scores = []
    for column in answer_df.columns:
        y_true = answer_df[column]
        y_scores = submission_df[column]
        auc = roc_auc_score(y_true, y_scores)
        auc_scores.append(auc)

    # Calculate mean AUC
    mean_auc = np.mean(auc_scores)

    brier_scores = []
    ece_scores = []
    
    # Calculate Brier Score and ECE for each class
    for column in answer_df.columns:
        y_true = answer_df[column].values
        y_prob = submission_df[column].values
        
        # Brier Score
        brier = mean_squared_error(y_true, y_prob)
        brier_scores.append(brier)
        
        # ECE
        ece = expected_calibration_error(y_true, y_prob)
        ece_scores.append(ece)
    
    # Calculate mean Brier Score and mean ECE
    mean_brier = np.mean(brier_scores)
    mean_ece = np.mean(ece_scores)
    
    # Calculate combined score
    combined_score = 0.5 * (1 - mean_auc) + 0.25 * mean_brier + 0.25 * mean_ece
    
    return combined_score

# 모델 학습 함수 정의
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.BCELoss().to(device)
    
    best_val_score = 0
    best_model = None
    val_labels = y_val.cpu().numpy()
    
    for epoch in range(1, CONFIG.N_EPOCHS+1):
        model.train()
        train_loss = []
        for features, labels in tqdm(iter(train_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            optimizer.zero_grad()
            
            output = model(features)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score, val_outputs = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        
        combined_score = auc_brier_ece(pd.DataFrame(val_labels), pd.DataFrame(val_outputs))
        
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val AUC : [{_val_score:.5f}] Combined Score: [{combined_score:.5f}]')
            
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
    
    return best_model

def multiLabel_AUC(y_true, y_scores):
    auc_scores = []
    for i in range(y_true.shape[1]):
        auc = roc_auc_score(y_true[:, i], y_scores[:, i])
        auc_scores.append(auc)
    mean_auc_score = np.mean(auc_scores)
    return mean_auc_score
    
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss, all_labels, all_probs = [], [], []
    
    with torch.no_grad():
        for features, labels in tqdm(iter(val_loader)):
            features = features.float().to(device)
            labels = labels.float().to(device)
            
            probs = model(features)
            
            loss = criterion(probs, labels)

            val_loss.append(loss.item())

            all_labels.append(labels.cpu().numpy())
            all_probs.append(probs.cpu().numpy())
        
        _val_loss = np.mean(val_loss)

        all_labels = np.concatenate(all_labels, axis=0)
        all_probs = np.concatenate(all_probs, axis=0)
        
        auc_score = multiLabel_AUC(all_labels, all_probs)
    
    return _val_loss, auc_score, all_probs

model = CNNRNN()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CONFIG.LR)

infer_model = train(model, optimizer, train_loader, val_loader, device)

# Unlabeled 데이터에 대해 pseudo-labeling
unlabeled_path = os.path.join(CONFIG.ROOT_FOLDER, 'unlabeled_data')
unlabeled_files = [os.path.join(unlabeled_path, f) for f in os.listdir(unlabeled_path) if f.endswith('.ogg')]

unlabeled_features = get_features_and_labels(pd.DataFrame({'path': unlabeled_files}), False)
unlabeled_dataset = CustomDataset(unlabeled_features, torch.zeros((len(unlabeled_features), CONFIG.N_CLASSES)))
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

def pseudo_labeling(model, loader, device):
    model.to(device)
    model.eval()
    pseudo_labels = []
    with torch.no_grad():
        for features, _ in tqdm(iter(loader)):
            features = features.float().to(device)
            probs = model(features)
            pseudo_labels.append(probs.cpu().detach().numpy())
    return np.concatenate(pseudo_labels, axis=0)

pseudo_labels = pseudo_labeling(infer_model, unlabeled_loader, device)
pseudo_labels = (pseudo_labels > 0.5).astype(int)  # 임계값 0.5를 사용하여 pseudo-label 생성

# Pseudo-labeled 데이터를 학습 데이터에 추가
pseudo_labeled_dataset = CustomDataset(unlabeled_features, torch.tensor(pseudo_labels).float())
train_dataset_combined = torch.utils.data.ConcatDataset([train_dataset, pseudo_labeled_dataset])
train_loader_combined = DataLoader(train_dataset_combined, batch_size=CONFIG.BATCH_SIZE, shuffle=True)

# 모델을 pseudo-labeled 데이터를 포함하여 재학습
infer_model = train(model, optimizer, train_loader_combined, val_loader, device)

# 테스트 데이터 예측
test_df = pd.read_csv(os.path.join(CONFIG.ROOT_FOLDER, 'test.csv'))
test_features = get_features_and_labels(test_df, False)
test_dataset = CustomDataset(test_features, torch.zeros((len(test_features), CONFIG.N_CLASSES)))
test_loader = DataLoader(test_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for features, _ in tqdm(iter(test_loader)):
            features = features.float().to(device)
            probs = model(features)
            probs  = probs.cpu().detach().numpy()
            predictions += probs.tolist()
    return predictions

preds = inference(infer_model, test_loader, device)

submit = pd.read_csv(os.path.join(CONFIG.ROOT_FOLDER, 'sample_submission.csv'))
submit.iloc[:, 1:] = preds
submit.head()

combined_score_before_test = auc_brier_ece(pd.DataFrame(y_val.cpu().numpy()), pd.DataFrame(preds))
print(f'Final Combined Score before test: {combined_score_before_test:.5f}')

submit.to_csv('./wy_origin4.csv', index=False)


label ['fake' 'real']


25082it [4:25:57,  1.57it/s]


KeyboardInterrupt: 