# Audio Similarity Search

## Setup

In [2]:
from datasets import load_dataset

## Load Data

In [3]:
# Load the "train" split of the ESC-50 audio dataset (repo id "ashraq/esc50").
data = load_dataset(path="ashraq/esc50", split="train")

Repo card metadata block was not found. Setting CardData to empty.


In [4]:
# Show the dataset summary (feature names and number of rows).
data

Dataset({
    features: ['filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take', 'audio'],
    num_rows: 2000
})

In [5]:
# Class label of every clip, in dataset order.
# NOTE(review): this displays the entire column (one entry per row); slicing it,
# e.g. to the first 10 labels, would keep the cell output short.
data["category"]

['dog',
 'chirping_birds',
 'vacuum_cleaner',
 'vacuum_cleaner',
 'thunderstorm',
 'thunderstorm',
 'door_wood_knock',
 'can_opening',
 'crow',
 'door_wood_knock',
 'door_wood_knock',
 'clapping',
 'clapping',
 'clapping',
 'dog',
 'clapping',
 'thunderstorm',
 'fireworks',
 'fireworks',
 'fireworks',
 'fireworks',
 'clapping',
 'clapping',
 'clapping',
 'chainsaw',
 'airplane',
 'mouse_click',
 'pouring_water',
 'train',
 'sheep',
 'water_drops',
 'water_drops',
 'water_drops',
 'church_bells',
 'church_bells',
 'clock_alarm',
 'keyboard_typing',
 'wind',
 'clock_alarm',
 'footsteps',
 'footsteps',
 'footsteps',
 'footsteps',
 'footsteps',
 'footsteps',
 'frog',
 'frog',
 'fireworks',
 'fireworks',
 'cow',
 'water_drops',
 'brushing_teeth',
 'brushing_teeth',
 'car_horn',
 'crackling_fire',
 'helicopter',
 'helicopter',
 'helicopter',
 'helicopter',
 'helicopter',
 'helicopter',
 'drinking_sipping',
 'rain',
 'crackling_fire',
 'insects',
 'crackling_fire',
 'crackling_fire',
 'crackl

In [6]:
# 50 different categories, 40 samples per category (50 * 40)
len(data)

2000

## Displaying Audio

In [7]:
import numpy as np

# Gather the decoded waveform of every clip into a single NumPy array.
audios = np.asarray(
    [clip["array"] for clip in data["audio"]]
)

audios

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.18408203e-02, -1.03363037e-01, -1.41418457e-01, ...,
         6.98547363e-02,  4.04968262e-02,  2.74658203e-03],
       [-6.95800781e-03, -1.25122070e-02, -1.12609863e-02, ...,
         2.15270996e-01, -8.75854492e-03, -2.89031982e-01],
       ...,
       [-8.87145996e-02, -7.35473633e-02, -6.95800781e-02, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-5.01800537e-01, -2.22229004e-01,  5.27648926e-02, ...,
         6.01501465e-02,  6.01196289e-02,  5.78613281e-02],
       [-3.96728516e-04, -1.22070312e-04, -4.57763672e-04, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [8]:
from IPython.display import Audio, display

# Row index of the clip to preview; fetch its record once and reuse it.
sound_num = 1999
sample = data[sound_num]

sound_array = sample["audio"]["array"]
sr = sample["audio"]["sampling_rate"]

# Show the label, then render an inline player at the clip's own sampling rate.
print(f"Class: {sample['category']}")
display(Audio(sound_array, rate=sr))

Class: dog


## Audio Model Setup

In [9]:
import torch
from panns_inference import AudioTagging

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# checkpoint_path=None leaves the choice of weights file to the library.
model = AudioTagging(checkpoint_path=None, device=device)


Checkpoint path: /Users/viniciusfinger/panns_data/Cnn14_mAP=0.431.pth
Using CPU.


In [10]:
# Embed the single preview clip; np.newaxis adds the leading batch axis.
_, sound_tagged = model.inference(sound_array[np.newaxis, :])

# Width of one embedding vector; reused as the index dimension further down.
array_dimension = sound_tagged.shape[1]
array_dimension

2048

## Setup Pinecone

In [11]:
from pinecone import Pinecone, ServerlessSpec
import os

# The API key is read from the environment so no secret is stored in the notebook.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

idx_name = "audio"

spec = ServerlessSpec(
    cloud="aws",
    region="us-east-1"
)

# Start from a clean index, but only delete it when it actually exists:
# an unconditional delete_index() fails on the very first run of the notebook,
# when there is nothing to delete yet.
if idx_name in pc.list_indexes().names():
    pc.delete_index(idx_name)

# The index dimension must match the embedding width measured from the model.
pc.create_index(
    idx_name,
    dimension=array_dimension,
    metric="cosine",
    spec=spec
)


{
    "name": "audio",
    "metric": "cosine",
    "host": "audio-rdf9dfs.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 2048,
    "deletion_protection": "disabled",
    "tags": null
}

In [12]:
# Handle to the "audio" index, used below for upserts and queries.
idx = pc.Index(name=idx_name)

## Vector Upserting

In [13]:
from tqdm.auto import tqdm

batch_size = 64

# Read the label column once up front. Slicing rows with data[i:i_end] inside
# the loop returns every column of those rows (audio included) just to get
# at the category.
categories = list(data["category"])

for i in tqdm(range(0, len(audios), batch_size)):
    # The last batch may be shorter than batch_size.
    i_end = min(len(audios), i + batch_size)

    audios_batch = audios[i:i_end]

    # Embed the batch; the first value returned by inference() is not used here.
    _, embedded_audios = model.inference(audios_batch)

    # Ids are the row positions as strings, so a match can be mapped back
    # to its dataset row with data[int(id)].
    ids = [str(row) for row in range(i, i_end)]

    metadata = [{"category": category} for category in categories[i:i_end]]

    # Each record is an (id, values, metadata) tuple.
    vectors = list(zip(ids, embedded_audios.tolist(), metadata))

    idx.upsert(vectors=vectors)
    

  0%|          | 0/32 [00:00<?, ?it/s]

## Querying Data

In [17]:
# Use one clip from the dataset as the query.
query_audio = data[559]["audio"]["array"]
# inference() is fed a batch, so add a leading batch axis of size 1.
query_audio = query_audio[None, :]
_, query_embedding = model.inference(query_audio)

# query_embedding holds one row per input clip. Pass that single row as a flat
# list of floats (the documented form of `vector`) instead of a nested list.
response = idx.query(vector=query_embedding[0].tolist(), top_k=3, include_metadata=True)

response

{'matches': [{'id': '559',
              'metadata': {'category': 'sea_waves'},
              'score': 0.999990702,
              'values': []},
             {'id': '1091',
              'metadata': {'category': 'sea_waves'},
              'score': 0.970856786,
              'values': []},
             {'id': '1424',
              'metadata': {'category': 'sea_waves'},
              'score': 0.967775762,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [26]:
from collections import Counter

matches = response['matches']

# Majority vote over the categories of the retrieved neighbours.
category_counts = Counter(match['metadata']['category'] for match in matches)

if category_counts:
    # most_common(1) yields a single (category, count) pair.
    most_common_category = category_counts.most_common(1)[0]
    print(f"Most common category: {most_common_category[0]}")
else:
    # An empty result would otherwise raise IndexError on the [0] lookup.
    print("No matches returned.")

for match in matches:
    print(f"Audio ID: {match['id']}")
    # Ids were upserted as dataset row positions, so they index back into `data`.
    clip = data[int(match["id"])]["audio"]
    # Play at the clip's own sampling rate rather than a hard-coded 44100.
    display(Audio(clip["array"], rate=clip["sampling_rate"]))

Most common category: sea_waves
Audio ID: 559


Audio ID: 1091


Audio ID: 1424
