In [19]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib as mpl
import matplotlib.pyplot as plt  # data visualization
import seaborn as sns  # statistical data visualization

In [34]:
import torch.utils.data

# Seed PyTorch's global random number generator so runs are repeatable.
SEED = 3407
torch.manual_seed(SEED)

# Clip metadata table; this notebook reads its `filename` and `category` columns.
META_CSV = "./ESC-50/meta/esc50.csv"
df = pd.read_csv(META_CSV)


# df.head()
# 选择每个类别的前 60% 数据
def select(group, start_percent, end_percent):
    """Return the rows of ``group`` lying between two fractional positions.

    Args:
        group: Object supporting ``len()`` and ``.iloc`` slicing
            (e.g. one category's rows of a DataFrame).
        start_percent: Fraction (0.0-1.0) of the length where the slice starts.
        end_percent: Fraction (0.0-1.0) of the length where the slice ends
            (exclusive).

    Returns:
        ``group.iloc[start:end]`` where both bounds are the fractions scaled by
        ``len(group)`` and truncated toward zero with ``int()``.
    """
    n_rows = len(group)
    window = slice(int(n_rows * start_percent), int(n_rows * end_percent))
    return group.iloc[window]


# df.groupby("category")
# Keep the leading 60% of each category's rows (in their existing order), then
# discard the index produced by the group-wise apply so rows are numbered 0..n-1.
sampled_df = (
    df.groupby('category')
    .apply(lambda rows: select(rows, 0, 0.6))
    .reset_index(drop=True)
)

1200

In [21]:
# Count the distinct labels in the metadata; the model's output layer is sized from this.
label_column = df["category"]
categories = label_column.nunique()
print(f"有{categories}种取值")

有50种取值


接下来，准备数据集

In [22]:
import torchaudio
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
from label import label_to_index
from torch.utils.data import Dataset, DataLoader


class SoundDataset(Dataset):
    """Map-style dataset yielding ``(spectrogram, label_index)`` pairs for audio clips.

    Each item is loaded from ``./ESC-50/audio/<filename>``, resampled to
    ``target_sample_rate`` when the file's own rate differs, and converted with
    ``torchaudio.transforms.Spectrogram`` using its default settings.

    Args:
        frame: Optional metadata table with ``filename`` and ``category``
            columns. When omitted, the notebook-level ``df`` is used (looked up
            on every access), so ``SoundDataset()`` behaves as it did before.
            Passing a subset (for example a per-category split) makes it
            possible to build separate train/validation datasets.
    """

    def __init__(self, frame=None):
        self.transformation = torchaudio.transforms.Spectrogram()
        self.target_sample_rate = 16_000
        self._frame = frame
        # Resample transforms keyed by source sample rate, so the transform is
        # built once per distinct rate instead of once per loaded clip.
        self._resamplers = {}

    def _rows(self):
        """Return the metadata table backing this dataset."""
        return df if self._frame is None else self._frame

    def __len__(self):
        """Number of clips, i.e. number of rows in the metadata table."""
        return len(self._rows())

    def __getitem__(self, item):
        """Load clip number ``item`` (positional) and return ``(spectrogram, label)``.

        The label is whatever ``label_to_index`` returns for the row's
        ``category`` value.
        """
        row = self._rows().iloc[item]

        file_path = "./ESC-50/audio/" + row["filename"]
        waveform, sample_rate = torchaudio.load(file_path)
        if sample_rate != self.target_sample_rate:
            resampler = self._resamplers.get(sample_rate)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=self.target_sample_rate
                )
                self._resamplers[sample_rate] = resampler
            waveform = resampler(waveform)

        # Alternative features tried earlier and left disabled:
        # MelSpectrogram(sample_rate) followed by AmplitudeToDB().
        spectrogram = self.transformation(waveform)

        return spectrogram, label_to_index(row["category"])


# NOTE(review): SoundDataset() with no arguments reads the full `df`; the 60%
# per-category `sampled_df` built earlier is not used here — confirm this is intended.
dataset = SoundDataset()

# Reshuffled every epoch, 128 clips per batch.
dataloader = DataLoader(dataset=dataset, batch_size=128, shuffle=True)

接下来编写模型

In [23]:
import torch.nn as nn
import torch.nn.functional as F


class SimpleCNN(nn.Module):
    """Small CNN: two conv + ReLU + 2x2 max-pool stages, then two linear layers.

    Args:
        num_classes: Size of the output layer. When omitted, the notebook-level
            ``categories`` value is used, which is what ``SimpleCNN()`` did
            before this parameter existed.
        flat_features: Number of values per sample after the second pooling
            stage is flattened, i.e. ``32 * (H // 4) * (W // 4)`` for a
            ``1 x H x W`` input. The default ``160_000`` equals
            ``32 * 50 * 100`` (e.g. a ``1 x 201 x 401`` input).
    """

    def __init__(self, num_classes=None, flat_features=160_000):
        super().__init__()
        if num_classes is None:
            # Fall back to the label count computed from the metadata table.
            num_classes = categories
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(flat_features, 64)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Map a ``(batch, 1, H, W)`` tensor to ``(batch, num_classes)`` raw scores.

        No softmax is applied; the scores are meant for a loss that works on
        unnormalised logits.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Keep the batch dimension, flatten channels and spatial dims together.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


# Pick the best available backend instead of assuming Apple Metal ("mps"),
# so the notebook also runs on CUDA machines and on plain CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = SimpleCNN()
# Module.to() moves parameters in place and returns the module, so the cell
# still displays the model summary.
model.to(device)


SimpleCNN(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=160000, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=50, bias=True)
)

In [24]:
import torch.optim as optim

# Train on whatever device the model's parameters already live on, rather than
# hard-coding "mps"; this keeps inputs, labels and weights on the same device.
device = next(model.parameters()).device

criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):  # several passes over the dataset
    for i, (inputs, labels) in enumerate(dataloader):
        labels = labels.to(device)
        inputs = inputs.to(device)
        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        print(f'Epoch: {epoch + 1}, Batch: {i + 1}, Loss: {loss.item()}')


暴力训练中
什么都没有我操了
Epoch: 1, Batch: 1, Loss: 7.473263740539551
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 2, Loss: 56.15425491333008
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 3, Loss: 57.1718864440918
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 4, Loss: 19.05189323425293
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 5, Loss: 22.901779174804688
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 6, Loss: 29.38726806640625
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 7, Loss: 14.476642608642578
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 8, Loss: 11.876070976257324
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 9, Loss: 9.965805053710938
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 10, Loss: 6.164563179016113
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 11, Loss: 6.350220203399658
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 12, Loss: 6.858315467834473
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 13, Loss: 5.705517292022705
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 14, Loss: 6.0186357498168945
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 15, Loss: 3.8954639434814453
暴力训练中
什么都没有我操了
Epoch: 1, Batch: 16, Loss: 5.233518600463867
暴力训练中
什么都没有我操了
Epoch: 2, Batch: 1, Loss: 3.73

KeyboardInterrupt: 