<a href="https://colab.research.google.com/github/yoonju977/AudioFX-WaveNet/blob/main/Guiter_Effetor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import torch

# 현재 할당된 GPU 메모리 완전 초기화
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()

import torch
import gc

# 현재 사용 중인 모든 변수를 삭제
del model
del optimizer
del criterion
del dataloader
gc.collect()  # Python의 가비지 컬렉터 호출
torch.cuda.empty_cache()  # GPU 캐시 메모리 해제

print("GPU memory has been cleared and reset.")

NameError: name 'model' is not defined

In [13]:
import os
import librosa
import soundfile as sf
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint  # for gradient checkpointing
from tqdm import tqdm
import time

# PYTORCH_CUDA_ALLOC_CONF 설정
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# GPU 장치 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# SNR 계산 함수 정의
def calculate_snr(predicted, target):
    signal_power = torch.mean(target ** 2)
    noise_power = torch.mean((target - predicted) ** 2)
    snr = 10 * torch.log10(signal_power / (noise_power + 1e-8))  # dB 단위로 SNR 계산
    return snr.item()

class WaveNet(nn.Module):
    def __init__(
        self,
        layers=8,
        in_channels=1,
        dilation_channels=32,
        residual_channels=32,
        skip_channels=64,
        kernel_size=2,
    ):
        super(WaveNet, self).__init__()
        self.layers = layers
        self.dilated_convs = nn.ModuleList()
        self.residual_convs = nn.ModuleList()
        self.skip_convs = nn.ModuleList()

        for i in range(layers):
            dilation = 2**i
            padding = (kernel_size - 1) * dilation
            self.dilated_convs.append(
                nn.Conv1d(
                    in_channels if i == 0 else residual_channels,
                    dilation_channels,
                    kernel_size=kernel_size,
                    dilation=dilation,
                    padding=padding,
                )
            )
            self.residual_convs.append(
                nn.Conv1d(dilation_channels, residual_channels, kernel_size=1)
            )
            self.skip_convs.append(
                nn.Conv1d(dilation_channels, skip_channels, kernel_size=1)
            )

        self.output_layer = nn.Sequential(
            nn.ReLU(), nn.Conv1d(skip_channels, 1, kernel_size=1)
        )

    def forward(self, x):
        skip_connections = []
        out = x

        for i in range(self.layers):
            residual = out
            # gradient checkpointing 적용
            out = checkpoint(self.dilated_convs[i], out)
            out = torch.tanh(out)

            skip = self.skip_convs[i](out)
            if (
                len(skip_connections) > 0
                and skip.shape[2] != skip_connections[0].shape[2]
            ):
                min_size = min(skip.shape[2], skip_connections[0].shape[2])
                skip = skip[:, :, :min_size]
                skip_connections = [sc[:, :, :min_size] for sc in skip_connections]

            skip_connections.append(skip)

            if out.shape[2] != residual.shape[2]:
                min_size = min(out.shape[2], residual.shape[2])
                out = out[:, :, :min_size]
                residual = residual[:, :, :min_size]

            out = self.residual_convs[i](out) + residual

        out = sum(skip_connections)
        return self.output_layer(out)

class ToneDataset(Dataset):
    def __init__(self, clean_files, effect_files, sr=16000, chunk_size=None):
        self.clean_files = clean_files
        self.effect_files = effect_files
        self.sr = sr
        self.chunk_size = chunk_size  # 청크 크기 설정 (전체 파일 사용 시 None)

    def __len__(self):
        return len(self.clean_files)

    def __getitem__(self, idx):
        clean_tone, _ = librosa.load(self.clean_files[idx], sr=self.sr)
        effect_tone, _ = librosa.load(self.effect_files[idx], sr=self.sr)

        # 청크 크기가 지정된 경우, 청크로 나눔
        if self.chunk_size:
            clean_tone = clean_tone[:self.chunk_size]
            effect_tone = effect_tone[:self.chunk_size]

        return torch.tensor(clean_tone).unsqueeze(0).to(device), torch.tensor(effect_tone).unsqueeze(0).to(device)

# Google Drive 경로로 오디오 파일 경로 설정
short_audio_clean_files = [
    "/content/drive/MyDrive/dataset/fenderneckScNoEffect.wav",
    "/content/drive/MyDrive/dataset/ibanezStratCleanB+NnoEffect.wav",
    "/content/drive/MyDrive/dataset/ibanezStratCleanNeckNoEffect.wav",
    "/content/drive/MyDrive/dataset/ibanzeHuNoEffect.wav",
    "/content/drive/MyDrive/dataset/scChordsNoEffect.wav",
    "/content/drive/MyDrive/dataset/dataset3NoEffect.wav",
    "/content/drive/MyDrive/dataset/ibanzeHuChordsNoeffect.wav",
    ]

short_audio_effect_files = [
    "/content/drive/MyDrive/dataset/fenderneckScEffect.wav",
    "/content/drive/MyDrive/dataset/ibanezStratCleanB+NEffect.wav",
    "/content/drive/MyDrive/dataset/ibanezStratCleanNeckEffect.wav",
    "/content/drive/MyDrive/dataset/ibanzeHuEffect.wav",
    "/content/drive/MyDrive/dataset/scChordsEffect.wav",
    "/content/drive/MyDrive/dataset/dataset3Effect.wav",
    "/content/drive/MyDrive/dataset/ibanzeHuChordseffect.wav",
]

# 짧은 음원 데이터셋 및 DataLoader 생성
short_audio_dataset = ToneDataset(short_audio_clean_files, short_audio_effect_files)
short_audio_dataloader = DataLoader(short_audio_dataset, batch_size=1, shuffle=True)

# 모델, 손실 함수 및 옵티마이저 정의
model = WaveNet().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 짧은 음원으로 학습
print("Training on Short Audio Files")
for epoch in range(10):
    total_loss = 0
    for clean_tone, effect_tone in tqdm(short_audio_dataloader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        output = model(clean_tone)

        if output.shape != effect_tone.shape:
            min_size = min(output.shape[2], effect_tone.shape[2])
            output = output[:, :, :min_size]
            effect_tone = effect_tone[:, :, :min_size]

        loss = criterion(output, effect_tone)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(short_audio_dataloader)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

# 모델 저장
torch.save(model.state_dict(), "/content/drive/MyDrive/wavenet_model.pth")

# 1시간 음원 학습을 위한 모델 로드
print("Loading Model and Training on Long Audio File")
model.load_state_dict(torch.load("/content/drive/MyDrive/wavenet_model.pth"))
long_audio_clean_files = ["/content/drive/MyDrive/dataset/noEffect.wav"]
long_audio_effect_files = ["/content/drive/MyDrive/dataset/effect.wav"]

# 1시간 음원 데이터셋 및 DataLoader 생성 (청크 크기 설정)
long_audio_dataset = ToneDataset(long_audio_clean_files, long_audio_effect_files, chunk_size=10*16000)  # 예: 10초짜리 청크
long_audio_dataloader = DataLoader(long_audio_dataset, batch_size=1, shuffle=False)

# 1시간 음원을 청크 단위로 학습
for epoch in range(10):
    total_loss = 0
    for clean_tone, effect_tone in tqdm(long_audio_dataloader, desc=f"Epoch {epoch+1} (Long Audio)"):
        optimizer.zero_grad()
        output = model(clean_tone)

        if output.shape != effect_tone.shape:
            min_size = min(output.shape[2], effect_tone.shape[2])
            output = output[:, :, :min_size]
            effect_tone = effect_tone[:, :, :min_size]

        loss = criterion(output, effect_tone)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(long_audio_dataloader)
    print(f"Epoch {epoch+1} (Long Audio), Average Loss: {avg_loss:.4f}")

Device: cuda
Training on Short Audio Files


Epoch 1: 100%|██████████| 7/7 [00:06<00:00,  1.09it/s]


Epoch 1, Average Loss: 0.0326


Epoch 2: 100%|██████████| 7/7 [00:06<00:00,  1.10it/s]


Epoch 2, Average Loss: 0.0146


Epoch 3: 100%|██████████| 7/7 [00:06<00:00,  1.09it/s]


Epoch 3, Average Loss: 0.0115


Epoch 4: 100%|██████████| 7/7 [00:06<00:00,  1.09it/s]


Epoch 4, Average Loss: 0.0095


Epoch 5: 100%|██████████| 7/7 [00:06<00:00,  1.10it/s]


Epoch 5, Average Loss: 0.0080


Epoch 6: 100%|██████████| 7/7 [00:06<00:00,  1.10it/s]


Epoch 6, Average Loss: 0.0071


Epoch 7: 100%|██████████| 7/7 [00:06<00:00,  1.11it/s]


Epoch 7, Average Loss: 0.0065


Epoch 8: 100%|██████████| 7/7 [00:06<00:00,  1.10it/s]


Epoch 8, Average Loss: 0.0059


Epoch 9: 100%|██████████| 7/7 [00:06<00:00,  1.11it/s]


Epoch 9, Average Loss: 0.0055


Epoch 10: 100%|██████████| 7/7 [00:06<00:00,  1.09it/s]
  model.load_state_dict(torch.load("/content/drive/MyDrive/wavenet_model.pth"))


Epoch 10, Average Loss: 0.0053
Loading Model and Training on Long Audio File


Epoch 1 (Long Audio): 100%|██████████| 1/1 [00:16<00:00, 16.17s/it]


Epoch 1 (Long Audio), Average Loss: 0.0045


Epoch 2 (Long Audio): 100%|██████████| 1/1 [00:16<00:00, 16.02s/it]


Epoch 2 (Long Audio), Average Loss: 0.0045


Epoch 3 (Long Audio): 100%|██████████| 1/1 [00:16<00:00, 16.17s/it]


Epoch 3 (Long Audio), Average Loss: 0.0044


Epoch 4 (Long Audio): 100%|██████████| 1/1 [00:15<00:00, 15.97s/it]


Epoch 4 (Long Audio), Average Loss: 0.0044


Epoch 5 (Long Audio): 100%|██████████| 1/1 [00:15<00:00, 15.98s/it]


Epoch 5 (Long Audio), Average Loss: 0.0044


Epoch 6 (Long Audio): 100%|██████████| 1/1 [00:16<00:00, 16.22s/it]


Epoch 6 (Long Audio), Average Loss: 0.0043


Epoch 7 (Long Audio): 100%|██████████| 1/1 [00:16<00:00, 16.03s/it]


Epoch 7 (Long Audio), Average Loss: 0.0042


Epoch 8 (Long Audio): 100%|██████████| 1/1 [00:16<00:00, 16.00s/it]


Epoch 8 (Long Audio), Average Loss: 0.0041


Epoch 9 (Long Audio): 100%|██████████| 1/1 [00:16<00:00, 16.14s/it]


Epoch 9 (Long Audio), Average Loss: 0.0040


Epoch 10 (Long Audio): 100%|██████████| 1/1 [00:16<00:00, 16.07s/it]

Epoch 10 (Long Audio), Average Loss: 0.0039





In [15]:
import torch
import librosa

# 모델을 평가 모드로 설정
model.eval()

# 테스트할 클린 음원과 실제 이펙터 음원 파일 경로 설정
test_clean_file = "/content/drive/MyDrive/dataset/ts9_test1_in_FP32.wav"
test_effect_file = "/content/drive/MyDrive/dataset/ts9_test1_out_FP32.wav"

# 테스트 클린 음원 로드
clean_tone, _ = librosa.load(test_clean_file, sr=16000)
clean_tone_tensor = torch.tensor(clean_tone).unsqueeze(0).unsqueeze(0).to(device)

# 실제 이펙터 적용 음원 로드
effect_tone, _ = librosa.load(test_effect_file, sr=16000)
effect_tone_tensor = torch.tensor(effect_tone).unsqueeze(0).unsqueeze(0).to(device)

# 모델로 테스트 클린 음원에 대한 예측 생성
with torch.no_grad():
    predicted_effect = model(clean_tone_tensor)

# 출력 크기 맞추기
if predicted_effect.shape != effect_tone_tensor.shape:
    min_size = min(predicted_effect.shape[2], effect_tone_tensor.shape[2])
    predicted_effect = predicted_effect[:, :, :min_size]
    effect_tone_tensor = effect_tone_tensor[:, :, :min_size]

# 손실 및 SNR 계산
criterion = nn.MSELoss()
loss = criterion(predicted_effect, effect_tone_tensor).item()
snr_value = calculate_snr(predicted_effect, effect_tone_tensor)

print(f"Test Loss: {loss:.4f}")
print(f"Test SNR: {snr_value:.2f} dB")

Test Loss: 0.0160
Test SNR: -3.58 dB
