## Load the HouseX Dataset

In [3]:
import numpy as np 
import pandas as pd 
import os

all_housex_file_paths = []

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        all_housex_file_paths.append(os.path.join(dirname, filename))

In [31]:
all_housex_file_paths

['/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/TV Noise - 808.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - Infinite (Extended Mix).wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Loopers - I_m Odd.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - Rewire (Extended Mix).wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - RUSH (Extended Mix).wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Loopers - Feel It.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Blinders _ SWACQ - Side 2 Side (SWACQ Extended Edit).wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Julian Jordan _ Siks - Juice.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Magnificence _ Seth Hills - Fire.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Julian Jordan - Tell Me The Truth.wav',
 '/ka

## Load audio file into np array

In [23]:
from scipy.io import wavfile
import librosa

def load_audio_file(file_path):
    sr, origin_data = wavfile.read(file_path)
    origin_type = origin_data.dtype
    resampled_data = librosa.resample(origin_data.T.astype('float'), orig_sr = sr, target_sr = 48000) # transpose array to librosa shape and sampling rate of 48000
    resampled_data = librosa.to_mono(resampled_data)        
    resampled_data = resampled_data.T.astype(origin_type) # transpose back to scipy.io.wavfile shape
    data_np = np.array(resampled_data)
    return data_np

In [24]:
sample_audio_np = load_audio_file('/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - Infinite (Extended Mix).wav')
print(sample_audio_np.shape)

(9630000,)


## Set up CUDA

In [25]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


## Generating Audio Embeddings using `laion/clap-htsat-fused`

In [26]:
from datasets import load_dataset
from transformers import pipeline
from datasets import load_dataset
from transformers import ClapModel, ClapProcessor

In [27]:
model = ClapModel.from_pretrained("laion/clap-htsat-fused").to(device)
processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

## Sample Run

In [28]:
inputs = processor(audios=sample_audio_np, sampling_rate=48000, return_tensors="pt").to(0)
audio_embed = model.get_audio_features(**inputs)
print(audio_embed.shape)

torch.Size([1, 512])


In [30]:
audio_embed[0].detach().cpu()

tensor([-3.6928e-02,  3.7520e-03, -3.7167e-02,  6.1405e-02,  6.6014e-02,
         8.7653e-02,  2.4735e-02, -3.1322e-02, -3.8323e-02, -6.5347e-02,
        -3.3822e-02,  7.7712e-02, -4.8837e-02, -1.1541e-02, -9.3604e-03,
        -1.2527e-02,  7.7513e-03, -1.8887e-02,  1.9577e-02, -1.4694e-02,
        -6.3952e-03,  1.5949e-02,  5.5757e-03, -6.5732e-02,  3.8468e-02,
        -5.7942e-03, -3.4087e-02,  9.2308e-02, -1.8853e-02, -1.3881e-02,
         9.3275e-02,  4.7956e-03,  2.5781e-02,  1.9489e-02,  2.4513e-05,
         1.9998e-02, -4.4975e-02,  2.4285e-02,  1.3572e-03,  2.4152e-02,
        -4.2116e-02,  6.3750e-02,  4.3677e-02,  4.4852e-02, -2.4025e-02,
         1.9741e-02, -3.5951e-02, -2.8946e-02,  1.4477e-02, -7.0701e-03,
         2.7992e-02, -1.0161e-01, -2.1132e-04,  8.8176e-02, -5.5432e-02,
         8.1556e-02,  1.9538e-02,  1.1074e-04, -6.1081e-02, -1.1815e-02,
         6.1227e-02,  1.0070e-01,  9.2715e-03, -3.3668e-02, -3.6773e-02,
        -1.6872e-02,  2.6111e-02, -3.9955e-02,  2.9

## Collate and save embeddings for all songs

In [32]:
from tqdm import tqdm

all_housex_embeddings = []
N = len(all_housex_file_paths)

for i in tqdm(range(N)):
    # Get audio features
    audio_file_name = all_housex_file_paths[i]
    audio_inputs_np = load_audio_file(file_path = audio_file_name)
    
    # Construct inputs to CLAP model
    clap_inputs = processor(audios=audio_inputs_np, sampling_rate=48000, return_tensors="pt").to(device)
    
    # Get embeddings from CLAP model
    clap_outputs = model.get_audio_features(**clap_inputs)
    
    # Store the detached embeddings
    audio_embeds = clap_outputs[0].detach().cpu()
    all_housex_embeddings.append(audio_embeds)

assert len(all_housex_embeddings) == N

100%|██████████| 160/160 [04:06<00:00,  1.54s/it]


In [33]:
import pickle

with open("/kaggle/working/clap_fused_housex_saved_embeds.pkl", "wb") as f:
    pickle.dump(all_housex_embeddings, f)