In [1]:
#from: https://github.com/iShkiper/DSP_24.M20_21/blob/main/%D0%9A%D0%BE%D0%B4/1.2%20STFT_on.ipynb
#dataset: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/YDEPUT/B3VNQW&version=2.0

In [2]:
import os
import glob
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# Audio and STFT parameters
# SR is the default sampling rate handed to librosa.load by load_data_and_labels.
# NOTE(review): 44000 Hz is an unusual rate (44100 Hz is the common audio rate) — confirm it is intentional.
SR = 44000
DURATION = 5.0          # fixed clip duration in seconds
MAX_LEN = int(SR * DURATION)  # maximum signal length in samples
# Default n_fft / hop_length used by compute_log_spectrogram
N_FFT = 1024
HOP_LENGTH = 256

# Path to the ESC-10 dataset folder
DATA_DIR = "./ESC-10"

In [3]:
# Load the audio files and return the raw signals together with their labels
def load_data_and_labels(data_dir, sr=SR, max_len=MAX_LEN):
    """Load every ``<data_dir>/<label>/*.ogg`` clip as a fixed-length signal.

    The label of a clip is the name of the directory that contains it.
    Each signal is zero-padded at the end, or truncated, to exactly
    ``max_len`` samples.

    Args:
        data_dir: Root folder holding one sub-folder per class.
        sr: Sampling rate passed to ``librosa.load``.
        max_len: Number of samples every returned signal will have.

    Returns:
        A shuffled ``pandas.DataFrame`` (fixed seed 42, fresh 0..n-1 index)
        with columns ``signal`` (array of length ``max_len``) and ``label``
        (class folder name).

    Raises:
        FileNotFoundError: If no ``.ogg`` file matches the expected layout.
    """
    # glob's result order depends on the filesystem; sort it so the seeded
    # shuffle below (and any later split) is identical on every machine.
    file_paths = sorted(glob.glob(os.path.join(data_dir, "*", "*.ogg")))
    if not file_paths:
        # An empty DataFrame has no 'label' column and would only fail later
        # with an unrelated-looking KeyError.
        raise FileNotFoundError(
            f"No .ogg files found under {data_dir!r} "
            "(expected layout: <data_dir>/<label>/*.ogg)"
        )

    data = []
    for path in file_paths:
        label = os.path.basename(os.path.dirname(path))
        # The returned sampling rate equals `sr`, so it is not needed here.
        signal, _ = librosa.load(path, sr=sr)
        if len(signal) < max_len:
            # Too short: append zeros up to max_len
            signal = np.pad(signal, (0, max_len - len(signal)), mode='constant')
        else:
            # Long enough: keep only the first max_len samples
            signal = signal[:max_len]
        data.append({"signal": signal, "label": label})

    df = pd.DataFrame(data).sample(frac=1, random_state=42).reset_index(drop=True)
    return df


# Load the raw fixed-length signals into a DataFrame
df = load_data_and_labels(DATA_DIR)

# Encode the string labels as integer indices (classes in sorted order)
classes = sorted(df['label'].unique())
class_to_idx = dict(zip(classes, range(len(classes))))
df['label_idx'] = df['label'].map(class_to_idx)

# Stratified 80/20 split into train and test frames
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label_idx'],
)

In [4]:
# STFT
def compute_log_spectrogram(signal, n_fft=N_FFT, hop_length=HOP_LENGTH):
    """Return ``log(1 + |STFT(signal)|)`` as a float32 array.

    Args:
        signal: Audio samples passed straight to ``librosa.stft``.
        n_fft: FFT window size forwarded to ``librosa.stft``.
        hop_length: Hop between frames forwarded to ``librosa.stft``.
    """
    stft_matrix = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
    magnitude = np.abs(stft_matrix)
    # log1p compresses the dynamic range and stays finite for silent bins
    return np.log1p(magnitude).astype(np.float32)

In [5]:
#plt.axis('off') # no axis
#librosa.display.specshow(librosa.amplitude_to_db(X_train_stft[0],ref=np.max))
#plt.colorbar(format='%+2.0f dB')
#plt.title('ОПФ')

In [6]:
# Dataset
class ESC10Dataset(Dataset):
    """Dataset that turns stored raw signals into log-spectrogram tensors on access.

    Built from a DataFrame with a ``signal`` column and a ``label_idx`` column.
    """

    def __init__(self, df):
        # Keep plain Python lists so items are looked up by position,
        # independent of the DataFrame's index.
        self.labels = df['label_idx'].tolist()
        self.signals = df['signal'].tolist()

    def __len__(self):
        return len(self.signals)

    def __getitem__(self, idx):
        """Return ``(spectrogram tensor with a leading channel axis, label)``."""
        log_spec = compute_log_spectrogram(self.signals[idx])
        # Prepend a singleton channel axis for the Conv2d input
        with_channel = log_spec[np.newaxis, ...]
        return torch.from_numpy(with_channel), self.labels[idx]

# Build the mini-batch loaders; only the training loader is shuffled
batch_size = 16
train_loader = DataLoader(
    dataset=ESC10Dataset(train_df),
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=ESC10Dataset(test_df),
    batch_size=batch_size,
    shuffle=False,
)

In [7]:
# Define the model
class ConvNet(nn.Module):
    """Three-stage convolutional classifier for single-channel 2-D inputs.

    The feature extractor ends in a global average pool, so the spatial size
    of the input does not have to be fixed. ``forward`` returns one row of
    ``num_classes`` raw scores per input sample.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Layer order (and therefore the Sequential indices) is kept flat so
        # parameter names stay features.0 / features.3 / features.6.
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, x):
        pooled = self.features(x)
        # Collapse (batch, 128, 1, 1) into (batch, 128) for the linear head
        flat = torch.flatten(pooled, start_dim=1)
        return self.classifier(flat)

In [8]:
# Initialisation
# Seed PyTorch's global RNG so the weight initialisation and the shuffled
# batch order drawn by train_loader repeat on every Restart & Run All
# (GPU kernels may still introduce small nondeterminism).
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# To force CPU execution use: device = torch.device("cpu")
model = ConvNet(num_classes=len(classes)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-3)

In [9]:
# Training and evaluation functions
def train_epoch(model, loader, loss_fn=None, opt=None, dev=None):
    """Run one optimisation pass over ``loader`` and report its metrics.

    Args:
        model: Network to train; switched to training mode.
        loader: Iterable yielding ``(inputs, targets)`` tensor batches.
        loss_fn: Loss callable ``loss_fn(outputs, targets)``. Defaults to the
            notebook-level ``criterion``. The returned loss is a per-sample
            mean only if this callable returns the batch mean.
        opt: Optimizer stepping ``model``'s parameters. Defaults to the
            notebook-level ``optimizer``.
        dev: Device the batches are moved to. Defaults to the notebook-level
            ``device``.

    Returns:
        Tuple ``(sample-weighted average loss, accuracy)`` over all batches.
    """
    # Fall back to the notebook globals only when nothing was passed in, so
    # the original two-argument call keeps working while the function can
    # also be used (and tested) without hidden state.
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt
    dev = device if dev is None else dev

    model.train()
    loss_sum, correct, total = 0, 0, 0
    for x, y in loader:
        x, y = x.to(dev), y.to(dev)
        opt.zero_grad()
        out = model(x)
        loss = loss_fn(out, y)
        loss.backward()
        opt.step()
        # Weight by batch size so a smaller final batch is not over-counted
        loss_sum += loss.item()*x.size(0)
        preds = out.argmax(dim=1)
        correct += (preds==y).sum().item()
        total += x.size(0)
    return loss_sum/total, correct/total


def eval_epoch(model, loader, loss_fn=None, dev=None):
    """Score ``model`` on ``loader`` without updating any parameters.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        loader: Iterable yielding ``(inputs, targets)`` tensor batches.
        loss_fn: Loss callable ``loss_fn(outputs, targets)``. Defaults to the
            notebook-level ``criterion``. The returned loss is a per-sample
            mean only if this callable returns the batch mean.
        dev: Device the batches are moved to. Defaults to the notebook-level
            ``device``.

    Returns:
        Tuple ``(sample-weighted average loss, accuracy)`` over all batches.
    """
    # Use the notebook globals only as a fallback, so the original
    # two-argument call keeps working without relying on hidden state.
    loss_fn = criterion if loss_fn is None else loss_fn
    dev = device if dev is None else dev

    model.eval()
    loss_sum, correct, total = 0, 0, 0
    # Gradients are not needed for scoring
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(dev), y.to(dev)
            out = model(x)
            loss = loss_fn(out, y)
            # Weight by batch size so a smaller final batch is not over-counted
            loss_sum += loss.item()*x.size(0)
            preds = out.argmax(dim=1)
            correct += (preds==y).sum().item()
            total += x.size(0)
    return loss_sum/total, correct/total

In [11]:
# Training loop
# Each epoch runs one optimisation pass over train_loader and then scores
# the model on test_loader.
# NOTE(review): the metrics printed as "val_*" are computed on the held-out
# test split; there is no separate validation set in this notebook.
epochs = 50
for epoch in range(1, epochs+1):
    tr_loss, tr_acc = train_epoch(model, train_loader)
    val_loss, val_acc = eval_epoch(model, test_loader)
    print(f"Epoch {epoch}/{epochs}: train_loss={tr_loss:.4f}, train_acc={tr_acc:.4f}, "
          f"val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")

Epoch 1/50: train_loss=1.5298, train_acc=0.3937, val_loss=1.4034, val_acc=0.4875
Epoch 2/50: train_loss=1.4312, train_acc=0.4594, val_loss=1.2792, val_acc=0.4875
Epoch 3/50: train_loss=1.3880, train_acc=0.4938, val_loss=1.2520, val_acc=0.4875
Epoch 4/50: train_loss=1.3477, train_acc=0.4750, val_loss=1.1735, val_acc=0.6125
Epoch 5/50: train_loss=1.2341, train_acc=0.5781, val_loss=1.1439, val_acc=0.5625
Epoch 6/50: train_loss=1.1556, train_acc=0.6031, val_loss=1.1159, val_acc=0.5375
Epoch 7/50: train_loss=1.2652, train_acc=0.5094, val_loss=1.1425, val_acc=0.5250
Epoch 8/50: train_loss=1.1804, train_acc=0.5500, val_loss=1.0555, val_acc=0.6625
Epoch 9/50: train_loss=1.1340, train_acc=0.5875, val_loss=0.9817, val_acc=0.5625
Epoch 10/50: train_loss=1.0495, train_acc=0.6125, val_loss=1.0410, val_acc=0.6750
Epoch 11/50: train_loss=1.0500, train_acc=0.6156, val_loss=0.9948, val_acc=0.5500
Epoch 12/50: train_loss=1.0324, train_acc=0.5969, val_loss=0.9791, val_acc=0.6500
Epoch 13/50: train_loss=0