## Load the HouseX Dataset

In [1]:
import numpy as np 
import pandas as pd 
import os

# Recursively gather the full path of every file found below the Kaggle input directory.
all_housex_file_paths = [
    os.path.join(parent_dir, file_name)
    for parent_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]

In [2]:
# Preview only the first few paths; echoing the full list floods the cell output.
all_housex_file_paths[:10]

['/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/TV Noise - 808.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - Infinite (Extended Mix).wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Loopers - I_m Odd.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - Rewire (Extended Mix).wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - RUSH (Extended Mix).wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Loopers - Feel It.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Blinders _ SWACQ - Side 2 Side (SWACQ Extended Edit).wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Julian Jordan _ Siks - Juice.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Magnificence _ Seth Hills - Fire.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Julian Jordan - Tell Me The Truth.wav',
 '/ka

## Load an audio file into a NumPy array

In [3]:
from scipy.io import wavfile
import librosa

def load_audio_file(file_path, target_sr=48000):
    """Read a WAV file and return it as a mono NumPy array resampled to ``target_sr``.

    Parameters
    ----------
    file_path : str
        Path to a WAV file readable by ``scipy.io.wavfile.read``.
    target_sr : int, optional
        Sampling rate of the returned signal, in Hz. Defaults to 48000, the
        rate this notebook passes to the CLAP processor.

    Returns
    -------
    numpy.ndarray
        1-D array of samples with the same dtype as the samples stored in the file.
    """
    sr, origin_data = wavfile.read(file_path)
    origin_type = origin_data.dtype

    # wavfile yields (n_samples,) or (n_samples, n_channels); librosa expects
    # floating-point data with the channel axis first.
    channels_first = origin_data.T.astype('float')

    # Downmix first, then resample: averaging channels and resampling are both
    # linear, so the order does not change the signal, but resampling a single
    # channel avoids repeating the expensive step for every channel.
    mono = librosa.to_mono(channels_first)
    resampled = librosa.resample(mono, orig_sr=sr, target_sr=target_sr)

    # Resampling filters can overshoot the original peak values. Casting such
    # samples straight back to an integer PCM dtype would wrap around (e.g. a
    # loud positive peak turning negative), so clamp to the dtype range first.
    if np.issubdtype(origin_type, np.integer):
        limits = np.iinfo(origin_type)
        resampled = np.clip(resampled, limits.min, limits.max)

    # NOTE(review): for integer PCM files the returned samples keep the integer
    # amplitude scale (e.g. +/-32767 for int16). Presumably the CLAP feature
    # extractor expects float waveforms in [-1, 1] -- confirm before relying on
    # the absolute scale of the embeddings.
    return resampled.astype(origin_type)

In [4]:
# Sanity check: decode a single track and inspect the shape of the returned sample array.
sample_audio_path = '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - Infinite (Extended Mix).wav'
sample_audio_np = load_audio_file(sample_audio_path)
print(sample_audio_np.shape)

(9630000,)


## Set up CUDA

In [5]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


## Generating Audio Embeddings using `laion/clap-htsat-fused`

In [6]:
from datasets import load_dataset
from transformers import pipeline
from datasets import load_dataset
from transformers import ClapModel, ClapProcessor

2024-02-02 07:15:25.515899: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-02 07:15:25.515959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-02 07:15:25.517907: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
# The model weights and the matching pre-processor are loaded from the same pretrained checkpoint name.
clap_checkpoint = "laion/clap-htsat-fused"
processor = ClapProcessor.from_pretrained(clap_checkpoint)
model = ClapModel.from_pretrained(clap_checkpoint)
model = model.to(device)

  return self.fget.__get__(instance, owner)()


## Sample Run

In [8]:
# Smoke-test the pipeline on a single track before processing the whole dataset.
# The processed features go to the same `device` the model was moved to; a
# hard-coded CUDA index (0) breaks when the notebook falls back to the CPU.
inputs = processor(audios=sample_audio_np, sampling_rate=48000, return_tensors="pt").to(device)

# Inference only: skip autograd bookkeeping so no graph is built for the forward pass.
with torch.no_grad():
    audio_embed = model.get_audio_features(**inputs)
print(audio_embed.shape)

torch.Size([1, 512])


In [9]:
# Take the first (only) row of the batch, detach it from autograd and copy it to CPU memory for display.
audio_embed[0].detach().cpu()

tensor([-0.0401,  0.0051, -0.0358,  0.0607,  0.0674,  0.0887,  0.0251, -0.0313,
        -0.0392, -0.0640, -0.0332,  0.0794, -0.0490, -0.0159, -0.0108, -0.0118,
         0.0103, -0.0226,  0.0163, -0.0134, -0.0083,  0.0162,  0.0118, -0.0627,
         0.0371, -0.0049, -0.0395,  0.0911, -0.0186, -0.0123,  0.0943,  0.0052,
         0.0270,  0.0187,  0.0004,  0.0206, -0.0433,  0.0259, -0.0020,  0.0214,
        -0.0398,  0.0635,  0.0459,  0.0421, -0.0203,  0.0145, -0.0349, -0.0315,
         0.0127, -0.0041,  0.0315, -0.1003,  0.0016,  0.0869, -0.0550,  0.0782,
         0.0194,  0.0015, -0.0600, -0.0121,  0.0633,  0.1018,  0.0075, -0.0325,
        -0.0366, -0.0156,  0.0259, -0.0367,  0.0284,  0.0525, -0.0338,  0.0618,
         0.0121,  0.0304, -0.0839, -0.0604,  0.0035,  0.0099, -0.0134,  0.0470,
         0.0705, -0.0160,  0.0330,  0.0021,  0.0444, -0.0131,  0.0218,  0.0206,
         0.0045,  0.0197,  0.0156,  0.0606, -0.0305,  0.1190,  0.0166,  0.0115,
         0.0179,  0.0128,  0.0086, -0.03

## Collate and save embeddings for all songs

In [10]:
from tqdm import tqdm

# Maps genre name (spaces replaced by underscores) -> list of CLAP audio embeddings,
# one tensor per audio file, in the order the files appear in `all_housex_file_paths`.
all_housex_embeddings_dict = {}
N = len(all_housex_file_paths)

# Inference only: disabling autograd avoids building a computation graph for
# every track, which saves GPU memory and time over the full dataset.
with torch.no_grad():
    for audio_file_name in tqdm(all_housex_file_paths, total=N):
        # The genre is the name of the directory that directly contains the file.
        audio_genre = audio_file_name.split("/")[-2].replace(" ", "_")
        audio_inputs_np = load_audio_file(file_path=audio_file_name)

        # Construct inputs to the CLAP model on the same device as the model.
        clap_inputs = processor(audios=audio_inputs_np, sampling_rate=48000, return_tensors="pt").to(device)

        # Get embeddings from the CLAP model.
        clap_outputs = model.get_audio_features(**clap_inputs)

        # Keep only the single embedding row and copy it to CPU memory so the
        # stored tensors do not hold on to GPU memory.
        audio_embeds = clap_outputs[0].detach().cpu()

        # Create the per-genre list on first use, then append.
        all_housex_embeddings_dict.setdefault(audio_genre, []).append(audio_embeds)

100%|██████████| 160/160 [04:31<00:00,  1.70s/it]


In [12]:
import pickle

# Persist the genre -> embeddings mapping to the Kaggle working directory.
embeds_pickle_path = "/kaggle/working/clap_fused_housex_saved_embeds_dict.pkl"
with open(embeds_pickle_path, mode="wb") as pickle_file:
    pickle.dump(all_housex_embeddings_dict, pickle_file)