### 将音频转化为频谱图

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from tqdm import tqdm

def audio_to_melspectrogram(file_path, save_path):
    try:
        y, sr = librosa.load(file_path, sr=None)
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        S_DB = librosa.power_to_db(S, ref=np.max)

        plt.figure(figsize=(2.56, 2.56), dpi=100)
        librosa.display.specshow(S_DB, sr=sr, cmap='magma')
        plt.axis('off')
        plt.tight_layout(pad=0)
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
        plt.close()
    except Exception as e:
        print(f"⚠️ Error on {file_path}: {e}")

TRAIN_AUDIO_DIR = Path("train_audio")            
TRAIN_IMG_DIR   = Path("working/train_images")   
TRAIN_IMG_DIR.mkdir(parents=True, exist_ok=True) 

ogg_files = list(TRAIN_AUDIO_DIR.rglob("*.ogg"))

for audio_fp in tqdm(ogg_files, desc="Converting", unit="file"):
    # 相对路径（例如 21116/iNat296867.ogg）
    rel_fp   = audio_fp.relative_to(TRAIN_AUDIO_DIR)
    img_fp   = TRAIN_IMG_DIR / rel_fp.with_suffix(".png")  # 改后缀
    img_fp.parent.mkdir(parents=True, exist_ok=True)       # 递归建子目录

    try:
        audio_to_melspectrogram(str(audio_fp), str(img_fp))
    except Exception as e:
        print(f"[WARN] 处理 {audio_fp} 失败：{e}")


Converting:   6%|▌         | 1722/28564 [10:59<1:52:50,  3.96file/s] 

### 数据集

In [7]:
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
import pandas as pd
from pathlib import Path

class SpectrogramDataset(Dataset):
    def __init__(self, csv_path, image_root, transform=None):
        self.df = pd.read_csv(csv_path)
        self.image_root = Path(image_root)
        self.transform = transform

        # primary_label -> idx
        self.label2idx = {l: i for i, l in enumerate(sorted(self.df.primary_label.unique()))}
        self.num_labels = len(self.label2idx)

        self.samples = []
        for _, row in self.df.iterrows():
            png_path = self.image_root / Path(row.filename).with_suffix(".png")
            if not png_path.exists():
                continue

            # --------- 组装 multi‑label ---------
            labels = [row.primary_label]
            if pd.notna(row.secondary_labels):
                labels += eval(row.secondary_labels)  # 列表字符串 → list
            idxs = [self.label2idx[l] for l in labels if l in self.label2idx]

            target = torch.zeros(self.num_labels, dtype=torch.float32)
            target[idxs] = 1.0

            self.samples.append(
                dict(
                    image_path=png_path,
                    target=target,
                    primary_idx=self.label2idx[row.primary_label],
                    latitude=row.latitude if pd.notna(row.latitude) else 0.0,
                    longitude=row.longitude if pd.notna(row.longitude) else 0.0,
                )
            )

    # 供外部调用
    def __len_of_label__(self):
        return self.num_labels

    # -------- PyTorch API --------
    def __len__(self):  return len(self.samples)

    def __getitem__(self, idx):
        s = self.samples[idx]
        img = Image.open(s["image_path"]).convert("RGB")
        if self.transform: img = self.transform(img)

        # 归一化地理坐标到 [-1,1]
        lat = torch.tensor(s["latitude"] / 90.0, dtype=torch.float32)
        lon = torch.tensor(s["longitude"] / 180.0, dtype=torch.float32)

        return {
            "image": img,               # float tensor [3,224,224]
            "coords": torch.stack([lat, lon]),  # [2]
            "target": s["target"],      # multi‑hot [num_labels]
        }


transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# 建立数据集
dataset = SpectrogramDataset(
    csv_path="train.csv",
    image_root="working/train_images",
    transform=transform
)

sample = dataset[0]
print('样本总数:', len(dataset))  # 样本总数
print(sample)

样本总数: 2197
{'image': tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1176, 0.0275],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.4235, 0.0980],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.9882, 0.9294, 0.9804],
         [0.0000, 0.0000, 0.0000,  ..., 0.9216, 0.7804, 0.9451],
         [0.0000, 0.0000, 0.0000,  ..., 0.8941, 0.7216, 0.9294]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0353, 0.0078],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1255, 0.0275],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.5686, 0.3804, 0.7255],
         [0.0000, 0.0000, 0.0000,  ..., 0.4196, 0.2902, 0.5765],
         [0.0000, 0.0000, 0.0000,  ..., 0.3608, 0.2549, 0.5216]],

        [[0.0157, 0.0157, 0.0157,  ..., 0.0157, 0.0157, 0.0157],
         [0.0157, 0.0157, 0.0157,  ..., 0.0157, 0.1137, 0.0392],
         [0.0157, 0.0157, 0.0157,  ..

### CNN模型

In [12]:
import torch.nn as nn
import torchvision.models as models
import torch

class BirdNet(nn.Module):
    def __init__(self, num_labels):
        super().__init__()

        # 视觉分支：ResNet18 预训练，替换最后 fc
        backbone = models.resnet18(weights=None)
        state_dict = torch.load("resnet18-f37072fd.pth")
        backbone.load_state_dict(state_dict)
        
        self.backbone = nn.Sequential(*list(backbone.children())[:-1])  # 输出 512×1×1
        self.img_head = nn.Sequential(nn.Flatten())                     # 512

        # 经纬度分支
        self.coord_head = nn.Sequential(
            nn.Linear(2, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, 128),
            nn.ReLU(inplace=True),
        )

        # 分类器
        self.classifier = nn.Linear(512 + 128, num_labels)

    def forward(self, img, coords):
        x_img = self.backbone(img)          # [B,512,1,1]
        x_img = self.img_head(x_img)        # [B,512]

        x_geo = self.coord_head(coords)     # [B,128]

        x = torch.cat([x_img, x_geo], dim=1)  # [B,640]
        logits = self.classifier(x)           # [B,num_labels]
        return torch.sigmoid(logits)          # 概率 [0‑1]

### 训练

In [None]:
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from tqdm import tqdm

# ----- 数据 -----
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])
ds = SpectrogramDataset("train.csv", "working/train_images", transform)
train_ds, val_ds = random_split(ds, [0.9, 0.1])
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=4)

# ----- 模型 -----
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
model = BirdNet(num_labels=ds.__len_of_label__()).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

NUM_EPOCHS = 5

# ----- 训练循环 -----
for epoch in range(NUM_EPOCHS):  # 5个 epoch
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/5"):
        img   = batch["image"].to(device)
        coords = batch["coords"].to(device)
        target = batch["target"].to(device)

        optimizer.zero_grad()
        preds = model(img, coords)
        loss  = criterion(preds, target)
        loss.backward()
        optimizer.step()

    # ---- 简单验证 ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            img, coords, target = batch["image"].to(device), batch["coords"].to(device), batch["target"].to(device)
            val_loss += criterion(model(img, coords), target).item() * img.size(0)
    print(f"Epoch {epoch+1}: val BCE={val_loss/len(val_ds):.4f}")


Using device: cuda


  state_dict = torch.load("resnet18-f37072fd.pth")
Epoch 1/5:   0%|          | 0/62 [00:00<?, ?it/s]