## Load the HouseX Dataset

In [2]:
import numpy as np 
import pandas as pd 
import os

all_housex_file_paths = []

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        all_housex_file_paths.append(os.path.join(dirname, filename))

In [3]:
all_housex_file_paths[:5]

['/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/TV Noise - 808.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - Infinite (Extended Mix).wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Loopers - I_m Odd.wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - Rewire (Extended Mix).wav',
 '/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - RUSH (Extended Mix).wav']

## Load audio file into np array

In [4]:
from scipy.io import wavfile
import librosa

def load_audio_file(file_path):
    sr, origin_data = wavfile.read(file_path)
    origin_type = origin_data.dtype
    resampled_data = librosa.resample(origin_data.T.astype('float'), orig_sr = sr, target_sr = sr) # transpose array to librosa shape
    resampled_data = librosa.to_mono(resampled_data)        
    resampled_data = resampled_data.T.astype(origin_type) # transpose back to scipy.io.wavfile shape
    data_np = np.array(resampled_data)
    return data_np

In [5]:
sample_audio_np = load_audio_file('/kaggle/input/housex-dataset/consolidatedrawaudio/bass house/Seth Hills - Infinite (Extended Mix).wav')
print(sample_audio_np.shape)

(8847562,)


## Set up CUDA

In [6]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


## Zero Shot Classification: Sample Runs
Models
- laion/larger_clap_music_and_speech
- laion/clap-htsat-fused
- laion/clap-htsat-unfused

In [7]:
from datasets import load_dataset
from transformers import pipeline
from datasets import load_dataset
from transformers import ClapModel, ClapProcessor

In [8]:
audio_classifier_1 = pipeline(task="zero-shot-audio-classification", model="laion/larger_clap_music_and_speech", device = device)
output = audio_classifier_1(sample_audio_np, candidate_labels=["song with the genre bass house", "song with the genre future house", "song with the genre melodic house", "song with the genre progressive house"])

print(output[0]['label'])

config.json:   0%|          | 0.00/635 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/776M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

song with the genre future house


In [9]:
audio_classifier_2 = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-fused", device = device)
output = audio_classifier_2(sample_audio_np, candidate_labels=["song with the genre bass house", "song with the genre future house", "song with the genre melodic house", "song with the genre progressive house"])

print(output[0]['label'])

config.json:   0%|          | 0.00/5.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/615M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

song with the genre melodic house


In [10]:
audio_classifier_3 = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused", device = device)
output = audio_classifier_3(sample_audio_np, candidate_labels=["song with the genre bass house", "song with the genre future house", "song with the genre melodic house", "song with the genre progressive house"])

print(output[0]['label'])

config.json:   0%|          | 0.00/5.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/615M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

song with the genre future house


## Consolidated Runs for Zero Shot Classification

Prompting Strategy 1
- "song with the genre $\dots$"

Metrics
- Classwise accuracy
- Classwise F1 score

In [11]:
# Label mapping
CLASS_KEY = {
    "bass house" : 0,
    "future house" : 1,
    "melodic house" : 2,
    "progressive house" : 3
}

CLASS_PROMPTS = ["song with the genre bass house", "song with the genre future house", "song with the genre melodic house", "song with the genre progressive house"]

# Get ground truth label
def get_ground_truth_label(file_name):
    text_genre = file_name.split("/")[-2]
    return CLASS_KEY[text_genre]

# Get predicted label
def get_predicted_label(output, boilerplate_text = "song with the genre "):
    output.sort(key = lambda x: -int(list(x.values())[0]))
    l = output[0]
    chosen_prompt = l['label']
    text_genre = chosen_prompt.replace(boilerplate_text, "")
    return CLASS_KEY[text_genre]

Generating results for `laion/larger_clap_music_and_speech`

In [12]:
from tqdm import tqdm

run_1_gt = []
run_1_pred = []
N = len(all_housex_file_paths)

for i in tqdm(range(N)):
    file_path = all_housex_file_paths[i]
    audio_feats = np.array(load_audio_file(file_path))
    output = audio_classifier_1(audio_feats, candidate_labels = CLASS_PROMPTS)
    gt_label = get_ground_truth_label(file_path)
    pred_label = get_predicted_label(output)
    run_1_gt.append(gt_label)
    run_1_pred.append(pred_label)

100%|██████████| 160/160 [01:32<00:00,  1.74it/s]


In [13]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
target_names = list(CLASS_KEY.keys())

print(classification_report(run_1_gt, run_1_pred, target_names=target_names, digits=4))

                   precision    recall  f1-score   support

       bass house     0.5000    0.0250    0.0476        40
     future house     0.2484    0.9750    0.3959        40
    melodic house     0.0000    0.0000    0.0000        40
progressive house     0.0000    0.0000    0.0000        40

         accuracy                         0.2500       160
        macro avg     0.1871    0.2500    0.1109       160
     weighted avg     0.1871    0.2500    0.1109       160



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Generating results for `laion/clap-htsat-fused`

In [14]:
from tqdm import tqdm

run_2_gt = []
run_2_pred = []
N = len(all_housex_file_paths)

for i in tqdm(range(N)):
    file_path = all_housex_file_paths[i]
    audio_feats = np.array(load_audio_file(file_path))
    output = audio_classifier_2(audio_feats, candidate_labels = CLASS_PROMPTS)
    gt_label = get_ground_truth_label(file_path)
    pred_label = get_predicted_label(output)
    run_2_gt.append(gt_label)
    run_2_pred.append(pred_label)

100%|██████████| 160/160 [02:25<00:00,  1.10it/s]


In [15]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
target_names = list(CLASS_KEY.keys())

print(classification_report(run_2_gt, run_2_pred, target_names=target_names, digits=4))

                   precision    recall  f1-score   support

       bass house     0.0000    0.0000    0.0000        40
     future house     0.0000    0.0000    0.0000        40
    melodic house     0.2500    1.0000    0.4000        40
progressive house     0.0000    0.0000    0.0000        40

         accuracy                         0.2500       160
        macro avg     0.0625    0.2500    0.1000       160
     weighted avg     0.0625    0.2500    0.1000       160



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Generating results for `laion/clap-htsat-unfused`

In [16]:
from tqdm import tqdm

run_3_gt = []
run_3_pred = []
N = len(all_housex_file_paths)

for i in tqdm(range(N)):
    file_path = all_housex_file_paths[i]
    audio_feats = np.array(load_audio_file(file_path))
    output = audio_classifier_3(audio_feats, candidate_labels = CLASS_PROMPTS)
    gt_label = get_ground_truth_label(file_path)
    pred_label = get_predicted_label(output)
    run_3_gt.append(gt_label)
    run_3_pred.append(pred_label)

100%|██████████| 160/160 [01:00<00:00,  2.66it/s]


In [17]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
target_names = list(CLASS_KEY.keys())

print(classification_report(run_3_gt, run_3_pred, target_names=target_names, digits=4))

                   precision    recall  f1-score   support

       bass house     0.0000    0.0000    0.0000        40
     future house     0.2500    1.0000    0.4000        40
    melodic house     0.0000    0.0000    0.0000        40
progressive house     0.0000    0.0000    0.0000        40

         accuracy                         0.2500       160
        macro avg     0.0625    0.2500    0.1000       160
     weighted avg     0.0625    0.2500    0.1000       160



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Consolidated Runs for Zero Shot Classification

Prompting Strategy 2
- "$\dots$ song"

Metrics
- Classwise accuracy
- Classwise F1 score

In [22]:
# Label mapping
CLASS_KEY = {
    "bass house" : 0,
    "future house" : 1,
    "melodic house" : 2,
    "progressive house" : 3
}

CLASS_PROMPTS = ["bass house song", "future house song", "melodic house song", "progressive house song"]

# Get ground truth label
def get_ground_truth_label(file_name):
    text_genre = file_name.split("/")[-2]
    return CLASS_KEY[text_genre]

# Get predicted label
def get_predicted_label(output, boilerplate_text = "song with the genre "):
    output.sort(key = lambda x: -int(list(x.values())[0]))
    l = output[0]
    chosen_prompt = l['label']
    text_genre = chosen_prompt.replace(boilerplate_text, "")
    return CLASS_KEY[text_genre]

Generating results for `laion/larger_clap_music_and_speech`

In [23]:
from tqdm import tqdm

run_1_gt = []
run_1_pred = []
N = len(all_housex_file_paths)

for i in tqdm(range(N)):
    file_path = all_housex_file_paths[i]
    audio_feats = np.array(load_audio_file(file_path))
    output = audio_classifier_1(audio_feats, candidate_labels = CLASS_PROMPTS)
    gt_label = get_ground_truth_label(file_path)
    pred_label = get_predicted_label(output, boilerplate_text = " song")
    run_1_gt.append(gt_label)
    run_1_pred.append(pred_label)

100%|██████████| 160/160 [01:02<00:00,  2.56it/s]


In [24]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
target_names = list(CLASS_KEY.keys())

print(classification_report(run_1_gt, run_1_pred, target_names=target_names, digits=4))

                   precision    recall  f1-score   support

       bass house     0.3696    0.4250    0.3953        40
     future house     0.1574    0.4250    0.2297        40
    melodic house     0.1667    0.0250    0.0435        40
progressive house     0.0000    0.0000    0.0000        40

         accuracy                         0.2188       160
        macro avg     0.1734    0.2188    0.1671       160
     weighted avg     0.1734    0.2188    0.1671       160



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Generating results for `laion/clap-htsat-fused`

In [25]:
from tqdm import tqdm

run_2_gt = []
run_2_pred = []
N = len(all_housex_file_paths)

for i in tqdm(range(N)):
    file_path = all_housex_file_paths[i]
    audio_feats = np.array(load_audio_file(file_path))
    output = audio_classifier_2(audio_feats, candidate_labels = CLASS_PROMPTS)
    gt_label = get_ground_truth_label(file_path)
    pred_label = get_predicted_label(output, boilerplate_text = " song")
    run_2_gt.append(gt_label)
    run_2_pred.append(pred_label)

100%|██████████| 160/160 [02:24<00:00,  1.10it/s]


In [26]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
target_names = list(CLASS_KEY.keys())

print(classification_report(run_2_gt, run_2_pred, target_names=target_names, digits=4))

                   precision    recall  f1-score   support

       bass house     0.7714    0.6750    0.7200        40
     future house     0.0000    0.0000    0.0000        40
    melodic house     0.3120    0.9750    0.4727        40
progressive house     0.0000    0.0000    0.0000        40

         accuracy                         0.4125       160
        macro avg     0.2709    0.4125    0.2982       160
     weighted avg     0.2709    0.4125    0.2982       160



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Generating results for `laion/clap-htsat-unfused`

In [27]:
from tqdm import tqdm

run_3_gt = []
run_3_pred = []
N = len(all_housex_file_paths)

for i in tqdm(range(N)):
    file_path = all_housex_file_paths[i]
    audio_feats = np.array(load_audio_file(file_path))
    output = audio_classifier_3(audio_feats, candidate_labels = CLASS_PROMPTS)
    gt_label = get_ground_truth_label(file_path)
    pred_label = get_predicted_label(output, boilerplate_text = " song")
    run_3_gt.append(gt_label)
    run_3_pred.append(pred_label)

100%|██████████| 160/160 [01:00<00:00,  2.64it/s]


In [28]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
target_names = list(CLASS_KEY.keys())

print(classification_report(run_3_gt, run_3_pred, target_names=target_names, digits=4))

                   precision    recall  f1-score   support

       bass house     0.1667    0.0250    0.0435        40
     future house     0.2468    0.9500    0.3918        40
    melodic house     0.0000    0.0000    0.0000        40
progressive house     0.0000    0.0000    0.0000        40

         accuracy                         0.2437       160
        macro avg     0.1034    0.2437    0.1088       160
     weighted avg     0.1034    0.2437    0.1088       160



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
