In [1]:
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_memory_growth(gpus[0], True)
print("gpu:", gpus)

gpu: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
import openl3
import librosa
import numpy as np
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import silhouette_score
import random

random.seed(42)

paths = {
    "drones": "../data/drones",
    "not_drones": "../data/not_drones"
}

def collect_files(path):
    fs = []
    for root, _, files in os.walk(path):
        for f in files:
            if f.lower().endswith((".wav", ".mp3")):
                fs.append(os.path.join(root, f))
    return sorted(fs)

drones_all = collect_files(paths["drones"])
not_drones_all = collect_files(paths["not_drones"])

drones_sel = drones_all[: len(drones_all) // 50]
not_drones_sel = not_drones_all[: len(not_drones_all) // 50]

files = drones_sel + not_drones_sel
labels = [1]*len(drones_sel) + [0]*len(not_drones_sel)

print("дронов:", len(drones_sel))
print("не дронов:", len(not_drones_sel))
print("всего:", len(files))


дронов: 1411
не дронов: 1304
всего: 2715


In [3]:
def load_audio(path):
    if not os.path.exists(path):
        return None
    try:
        audio, sr = librosa.load(path, sr=None, mono=True)
        return audio, sr
    except Exception:
        return None

print("загрузка...")
loaded = []
with ThreadPoolExecutor(max_workers=8) as ex:
    for item in tqdm(ex.map(load_audio, files), total=len(files), desc="аудио"):
        if item is not None:
            loaded.append(item)


загрузка...


аудио: 100%|██████████| 2715/2715 [00:03<00:00, 748.21it/s] 


In [4]:
import torch
from hear21passt.base import get_basic_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("модель...")
model = get_basic_model(mode="logits")  # эмбеддинги из финального линейного слоя
model = model.to(device)
model.eval()


  from .autonotebook import tqdm as notebook_tqdm


модель...


 Loading PASST TRAINED ON AUDISET 




100%|██████████| 329M/329M [00:29<00:00, 11.8MB/s] 


PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
     

PasstBasicWrapper(
  (mel): AugmentMelSTFT(
    winsize=800, hopsize=320
    (freqm): FrequencyMasking()
    (timem): TimeMasking()
  )
  (net): PaSST(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=

In [5]:
print("эмбеддинги (PaSST)...")
embeddings = []
batch_size = 16
processed = 0

for i in range(0, len(loaded), batch_size):
    batch = loaded[i:i+batch_size]
    if len(batch) == 0:
        continue

    batch_audio = []
    for audio, sr in batch:
        # PaSST всегда требует 32000 Hz
        if sr != 32000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=32000)

        batch_audio.append(audio)

    # выравниваем длину
    target_len = max(len(a) for a in batch_audio)
    batch_audio = [librosa.util.fix_length(a, size=target_len) for a in batch_audio]

    # в тензор
    audio_tensor = torch.tensor(batch_audio, dtype=torch.float32).to(device)

    with torch.no_grad():
        # модель сама делает mel → transformer → logits
        logits = model(audio_tensor)

    # logits shape = (batch, 527)
    for row in logits.cpu().numpy():
        embeddings.append(row)
        processed += 1
        if processed % 100 == 0:
            print(f"обработано: {processed}")


эмбеддинги (PaSST)...


  audio_tensor = torch.tensor(batch_audio, dtype=torch.float32).to(device)
Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\native\SpectralOps.cpp:880.)
  return _VF.stft(  # type: ignore[attr-defined]
  with torch.cuda.amp.autocast(enabled=False):


x torch.Size([16, 1, 128, 100])
self.norm(x) torch.Size([16, 768, 12, 9])
 patch_embed :  torch.Size([16, 768, 12, 9])
 self.time_new_pos_embed.shape torch.Size([1, 768, 1, 99])
 CUT time_new_pos_embed.shape torch.Size([1, 768, 1, 9])
 self.freq_new_pos_embed.shape torch.Size([1, 768, 12, 1])
X flattened torch.Size([16, 108, 768])
 self.new_pos_embed.shape torch.Size([1, 2, 768])
 self.cls_tokens.shape torch.Size([16, 1, 768])
 self.dist_token.shape torch.Size([16, 1, 768])
 final sequence x torch.Size([16, 110, 768])




 after 12 atten blocks x torch.Size([16, 110, 768])
forward_features torch.Size([16, 768])
head torch.Size([16, 527])
обработано: 100
обработано: 200
обработано: 300
обработано: 400
обработано: 500
обработано: 600
обработано: 700
обработано: 800
обработано: 900
обработано: 1000




обработано: 1100
обработано: 1200
обработано: 1300
обработано: 1400
обработано: 1500
обработано: 1600
обработано: 1700
обработано: 1800




обработано: 1900
обработано: 2000
обработано: 2100
обработано: 2200
обработано: 2300
обработано: 2400
обработано: 2500
обработано: 2600
обработано: 2700


In [6]:
print("эмбеддингов:", len(embeddings))

if len(embeddings) == 0:
    print("нет эмбеддингов — проверь данные")
else:
    X = np.vstack(embeddings)
    y = np.array(labels[: len(X)])
    print("X форма:", X.shape)
    print("y форма:", y.shape)


эмбеддингов: 2715
X форма: (2715, 527)
y форма: (2715,)


In [7]:
score = silhouette_score(X, y)
print("sil:", score)

save_dir = "../embeddings"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "passt.npz")

np.savez(save_path, X=X, y=y, score=score)
print("сохранено:", save_path)


sil: 0.5162537097930908
сохранено: ../embeddings\passt.npz
