In [2]:
import torch
from utils.Tokenizers import TokenizersConfig, Tokenizers

# Restore the pre-trained tokenizer: the checkpoint holds both the config
# ('cfg') and the weights ('model').
checkpoint = torch.load('audio/beats/Tokenizer_iter3_plus_AS2M.pt')
cfg = TokenizersConfig(checkpoint['cfg'])

BEATs_tokenizer = Tokenizers(cfg)
BEATs_tokenizer.load_state_dict(checkpoint['model'])
BEATs_tokenizer.eval()

# Random stand-in waveform (batch of 1, 10000 samples) with an all-False
# padding mask of the same shape.
audio_input_16khz = torch.randn(1, 10000)
padding_mask = torch.zeros_like(audio_input_16khz, dtype=torch.bool)

# Tokenize the waveform into labels.
labels = BEATs_tokenizer.extract_labels(
    audio_input_16khz,
    padding_mask=padding_mask,
)



In [3]:
import torch
from utils.BEATs import BEATs, BEATsConfig

# Checkpoint to restore; later cells reuse `model_path`.
model_path = 'audio/beats/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt'
checkpoint = torch.load(model_path)

# Rebuild the model from the stored config, then load its weights.
cfg = BEATsConfig(checkpoint['cfg'])
BEATs_model = BEATs(cfg)
BEATs_model.load_state_dict(checkpoint['model'])
BEATs_model.eval()

# Random stand-in waveform (batch of 1, 10000 samples) with an all-False
# padding mask of the same shape.
audio_input_16khz = torch.randn(1, 10000)
padding_mask = torch.zeros_like(audio_input_16khz, dtype=torch.bool)

# extract_features returns a sequence; its first element is kept as the
# audio representation.
representation = BEATs_model.extract_features(
    audio_input_16khz,
    padding_mask=padding_mask,
)[0]

In [4]:
# Sanity check: display the shape of the dummy waveform tensor.
audio_input_16khz.shape

torch.Size([1, 10000])

In [7]:
import torch
import torchaudio
import librosa

# Example clip used for the single-file experiments below.
filename = 'audio/1-9886-A-49.wav'

# Decode the clip; no `sr` is passed, so the rate is whatever librosa reports.
audio, sample_rate = librosa.load(filename)

# Bring the waveform to 16 kHz unless it is already at that rate.
audio = (
    audio
    if sample_rate == 16000
    else librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
)

# Wrap the NumPy waveform as a torch tensor (no batch dimension is added here).
audio_input_16khz = torch.from_numpy(audio)

In [8]:
# Reload the clip without an explicit `sr`, rebinding both `audio` and
# `sample_rate` (this replaces any resampled waveform held in `audio`).
audio, sample_rate = librosa.load(filename)

In [9]:
# Add a leading dimension to the waveform and display the resulting shape.
torch.from_numpy(audio).unsqueeze(0).shape

torch.Size([1, 110250])

In [10]:
import json
import os
import random
import librosa
import torch
import numpy as np
import sounddevice as sd
import pandas as pd

# Clip metadata; later cells look up the "category" of a clip by its "filename".
data = pd.read_csv('meta/esc50.csv')

# Ontology entries, each providing an 'id' and a human-readable 'name'.
with open('meta/ontology.json', 'r') as f:
    ontology = json.load(f)

# Map ontology ids to their display names.
label_dict = {label['id']: label['name'] for label in ontology}


# Directory holding the .wav clips
audio_dir = 'audio'

# Number of clips to draw at random
num_audios = 3  # Or any other number you prefer

# Candidate clips: every .wav file directly inside audio_dir
all_filenames = [f for f in os.listdir(audio_dir) if f.endswith('.wav')]

# Random subset (unseeded, so it differs between runs)
selected_filenames = random.sample(all_filenames, num_audios)

# Target sample rate and clip length used when loading
target_sample_rate = 16000
duration_in_seconds = 5  # Assuming each file is 5 seconds long

# Load, audition and collect the clips
audio_tensors = []
for filename in selected_filenames:
    file_path = os.path.join(audio_dir, filename)
    audio, _ = librosa.load(file_path, sr=target_sample_rate, duration=duration_in_seconds)
    # Play back at the rate the clip was just loaded at. The previous code used
    # `sample_rate`, a leftover from an earlier cell that does not describe this
    # 16 kHz waveform, so playback speed and pitch were wrong.
    sd.play(audio, target_sample_rate)
    # Block until playback has finished before moving to the next clip
    sd.wait()
    audio_tensors.append(torch.from_numpy(audio))

# Stack into a single tensor for batch processing.
# torch.stack needs every clip to have the same number of samples, which holds
# as long as each file is at least duration_in_seconds long.
audio_batch = torch.stack(audio_tensors)

# audio_batch now has shape (num_audios, target_sample_rate * duration_in_seconds)


In [11]:
import torch
from utils.BEATs import BEATs, BEATsConfig

# Load the fine-tuned checkpoint; besides 'cfg' and 'model' it also provides
# 'label_dict', which maps output indices to label ids.
checkpoint = torch.load(model_path)

cfg = BEATsConfig(checkpoint['cfg'])
BEATs_model = BEATs(cfg)
BEATs_model.load_state_dict(checkpoint['model'])
BEATs_model.eval()

# All-False padding mask sized from the batch itself, so the cell keeps working
# when the number of selected clips is not 3.
padding_mask = torch.zeros_like(audio_batch, dtype=torch.bool)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    probs = BEATs_model.extract_features(audio_batch, padding_mask=padding_mask)[0]

# Per-file predictions, keyed by filename, for use in later cells.
predictions = {}
for i, (top5_label_prob, top5_label_idx) in enumerate(zip(*probs.topk(k=5))):
    # Output index -> label id (from the checkpoint) -> readable name (from the ontology)
    top5_label = [checkpoint['label_dict'][label_idx.item()] for label_idx in top5_label_idx]
    top5_label = [label_dict[label] for label in top5_label]

    predictions[selected_filenames[i]] = {'top5_label': top5_label, 'top5_label_prob': top5_label_prob}

    # Report the three strongest predictions next to the real class from the metadata
    real_class = data.loc[data["filename"] == selected_filenames[i], "category"].values[0]
    print(f'Audio {i+1}: Real class: {real_class}')
    print(f'Top 3 predictions for audio {i+1}:')
    for j in range(3):
        print(f'Prediction {j+1}: {top5_label[j]} | prob {top5_label_prob[j].item():.2f}')
    print('------------------------')



Audio 1: Real class: rain
Top 3 predictions for audio 1:
Prediction 1: Rain | prob 0.74
Prediction 2: Rain on surface | prob 0.68
Prediction 3: Raindrop | prob 0.31
------------------------
Audio 2: Real class: dog
Top 3 predictions for audio 2:
Prediction 1: Dog | prob 0.84
Prediction 2: Animal | prob 0.81
Prediction 3: Domestic animals, pets | prob 0.70
------------------------
Audio 3: Real class: sea_waves
Top 3 predictions for audio 3:
Prediction 1: Ocean | prob 0.43
Prediction 2: Waves, surf | prob 0.35
Prediction 3: Wind | prob 0.20
------------------------


In [12]:
import shutil
import os

# Source and destination directories
source_dir = 'audio'
destination_dir = 'audio/selected'

# Start from an empty destination directory: remove any previous copy first.
if os.path.exists(destination_dir):
    shutil.rmtree(destination_dir)
os.mkdir(destination_dir)

# Copy each selected clip, renamed to "<top prediction>_<real class>.wav"
for filename in selected_filenames:
    source_path = os.path.join(source_dir, filename)
    real_class = data.loc[data["filename"] == filename, "category"].values[0]
    top_label = predictions[filename]['top5_label'][0].replace(' ', '')
    destination_path = os.path.join(destination_dir, f"{top_label}_{real_class}.wav")
    if os.path.exists(destination_path):
        # Another clip already produced this name; append the source stem so the
        # earlier copy is not silently overwritten.
        stem = os.path.splitext(filename)[0]
        destination_path = os.path.join(destination_dir, f"{top_label}_{real_class}_{stem}.wav")
    shutil.copy(source_path, destination_path)

In [1]:
# NOTE(review): this cell carries execution count 1 but sits after In [12];
# it is setup and presumably belongs at the top of the notebook.
# Load (or reload) the autoreload extension.
%reload_ext autoreload
# Mode 2: edits to imported modules are picked up without restarting the kernel.
%autoreload 2

In [13]:
from prediction import extract_features

In [16]:
# Clip to analyse and the fine-tuned BEATs checkpoint to run it through.
filename = 'audio/1-9886-A-49.wav'
model_path = 'audio/beats/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt'
# extract_features is imported from the project's `prediction` module;
# its internals are not visible in this notebook.
features = extract_features(audio_path=filename, model_path=model_path)

In [17]:
# Display the shape of the output returned by extract_features.
features.shape

torch.Size([1, 527])