In [1]:
import os
import librosa
import numpy as np
import torch
from tqdm import tqdm
from sklearn.metrics import silhouette_score
from concurrent.futures import ThreadPoolExecutor


# пути и параметры

paths = {
    "drones": "../data/drones",
    "not_drones": "../data/not_drones"
}

SAVE_DIR = "../embeddings"
SAVE_PATH = os.path.join(SAVE_DIR, "vggish.npz")

os.makedirs(SAVE_DIR, exist_ok=True)


In [2]:
# функция сбора файлов

def collect_files(path):
    out = []
    for root, _, fs in os.walk(path):
        for f in fs:
            if f.lower().endswith((".wav", ".mp3")):
                out.append(os.path.join(root, f))
    return sorted(out)



# собираем 50% файлов

drones_all = collect_files(paths["drones"])
not_drones_all = collect_files(paths["not_drones"])

drones_sel = drones_all[: len(drones_all) // 50]
not_drones_sel = not_drones_all[: len(not_drones_all) // 50]

files = drones_sel + not_drones_sel
labels = [1] * len(drones_sel) + [0] * len(not_drones_sel)

print("дронов:", len(drones_sel))
print("не дронов:", len(not_drones_sel))
print("всего:", len(files))

дронов: 1411
не дронов: 1304
всего: 2715


In [3]:
# загрузка аудио

def load_audio(path):
    try:
        audio, sr = librosa.load(path, sr=None, mono=True)
        return path, audio, sr
    except:
        return None


print("загрузка...")
loaded = []
with ThreadPoolExecutor(max_workers=8) as ex:
    for item in tqdm(ex.map(load_audio, files), total=len(files), desc="аудио"):
        if item is not None:
            loaded.append(item)

print("загружено:", len(loaded))


загрузка...


аудио: 100%|██████████| 2715/2715 [00:03<00:00, 717.31it/s] 

загружено: 2715





In [4]:
# загрузка VGGish
print("модель...")
model = torch.hub.load('harritaylor/torchvggish', 'vggish')
model.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("ok")

модель...


Using cache found in C:\Users\Samat/.cache\torch\hub\harritaylor_torchvggish_master


ok


In [5]:
# --------------------------
# эмбеддинги
# --------------------------
print("эмбеддинги...")
embeddings = []
processed = 0

for path, audio, sr in loaded:

    # vggish требует 16k
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

    audio = audio.astype(np.float32)

    # минимум 1 секунда
    if len(audio) < 16000:
        audio = np.pad(audio, (0, 16000 - len(audio)), mode="constant")

    emb = None

    # пробуем подать numpy → vggish
    try:
        out = model.forward(audio)
        if out is not None and len(out) > 0:
            emb = out
    except:
        pass

    # если numpy не сработал → пробуем путь
    if emb is None:
        try:
            out = model.forward(path)
            if out is not None and len(out) > 0:
                emb = out
        except:
            continue

    if emb is None:
        continue

    # усредняем
    e = emb.mean(axis=0)

    # приводим тензор → numpy
    if isinstance(e, torch.Tensor):
        e = e.detach().cpu().numpy()

    # проверяем форму
    if e.shape != (128,):
        continue

    embeddings.append(e)
    processed += 1

    if processed % 100 == 0:
        print("обработано:", processed)

print("эмбеддингов:", len(embeddings))

эмбеддинги...
обработано: 100
обработано: 200
обработано: 300
обработано: 400
обработано: 500
обработано: 600
обработано: 700
обработано: 800
обработано: 900
обработано: 1000
обработано: 1100
эмбеддингов: 1122


In [6]:
# --------------------------
# формируем X, y
# --------------------------
if len(embeddings) == 0:
    print("нет эмбеддингов — ошибка")
    raise SystemExit

X = np.vstack(embeddings)
y = np.array(labels[:len(X)])

print("X:", X.shape)
print("y:", y.shape)

# --------------------------
# silhouette
# --------------------------
score = silhouette_score(X, y)
print("sil:", score)

# --------------------------
# сохранение
# --------------------------
np.savez(SAVE_PATH, X=X, y=y, score=score)
print("сохранено:", SAVE_PATH)

X: (1122, 128)
y: (1122,)


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)