In [34]:
import data
import torch
from models import imagebind_model
from models.imagebind_model import ModalityType

import pickle, os, logging
import numpy as np

## Precompute embeddings from targets

#### The embeddings are stored in a two-level dict of shape:
data_split(train/val/test) -> file_name -> tensor(enc_dim), i.e. one embedding tensor per audio file

### Select project and data path

In [36]:
# Root directory that `data_path` is joined onto when the audio folders are located.
# NOTE: absolute, machine-specific path; adjust it when running elsewhere.
project_path = "/scratch/IOSZ/waveformer/multimod-sound-separation/multimod-waveformer"

# Dataset folder, expressed relative to `project_path`.
_DATA_PATH_PARTS = ("data", "CVSoundScapes", "cv-files")
data_path = os.path.join(*_DATA_PATH_PARTS)

### Define methods

In [37]:
def get_files_in_folder(directory):
    '''
    List the files located directly inside `directory` (non-recursive).

    Sub-directories and anything else that is not a file are skipped. The result
    is sorted by file name: `os.listdir` gives no ordering guarantee, so sorting
    keeps the batches built from this list identical between runs and machines.

    Args:
        directory: path of the folder to list.

    Returns:
        List of `os.path.join(directory, name)` for every file, sorted by name.
        Empty list when the folder contains no files.

    Raises:
        OSError: propagated from `os.listdir` (e.g. the folder does not exist).
    '''
    # Build every candidate path once instead of joining twice per entry.
    candidates = (os.path.join(directory, name) for name in sorted(os.listdir(directory)))
    return [path for path in candidates if os.path.isfile(path)]
        

def compute_embeddings(embedding_dict, data_split, project_path, targets, batch_size=10):
    '''
    Compute audio embeddings for every file of one data split and store them per file.

    Files are looked up in `<project_path>/<data_path>/<data_split>/<target>/`,
    i.e. one folder per target label. Results are written into
    `embedding_dict[data_split][file_name]`, keyed by the file's base name, so
    files with the same name in different target folders overwrite each other
    (a message is printed when that happens).

    Relies on notebook-level names defined in other cells: `data_path`,
    `device`, `model` and `get_files_in_folder`.

    Args:
        embedding_dict: dict updated in place; maps data split -> {file name -> tensor}.
        data_split: name of the split sub-folder (e.g. 'train', 'val', 'test').
        project_path: root directory that `data_path` is joined onto.
        targets: iterable of target labels; each one is converted with `str()`
            and used as a folder name.
        batch_size: number of files sent through the model at once.

    Returns:
        None. `embedding_dict` is modified in place.
    '''
    # Create the split entry once, up front; it therefore exists (possibly
    # empty) even when no file is found for this split.
    split_embeddings = embedding_dict.setdefault(data_split, {})

    for target in targets:
        target_folder_path = os.path.join(project_path, data_path, data_split, str(target))
        file_paths = get_files_in_folder(target_folder_path)

        # Walk over the files in consecutive chunks of `batch_size`.
        for start in range(0, len(file_paths), batch_size):
            batch = file_paths[start:start + batch_size]

            # Use the same ModalityType.AUDIO key for the input and for reading
            # the output, instead of depending on a separate notebook global.
            inputs = {
                ModalityType.AUDIO: data.load_and_transform_audio_data(batch, device)
            }

            # Inference only: no gradients are needed.
            with torch.no_grad():
                embedding = model(inputs)

            audio_embeddings = embedding[ModalityType.AUDIO].cpu()

            for file_path, emb in zip(batch, audio_embeddings):
                file_name = os.path.basename(file_path)

                if file_name in split_embeddings:
                    print('File overriden: ', file_name)

                # `emb` is a view into the batch tensor. Clone it so every stored
                # embedding owns its own storage; otherwise each entry keeps the
                # whole batch alive and serialises the full batch storage.
                split_embeddings[file_name] = emb.clone()

            # Drop the batch-sized tensors before loading the next batch.
            del inputs, embedding, audio_embeddings


### Load imagebind model

In [38]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Build the model with pretrained weights, switch it to evaluation mode and
# move it to the selected device.
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# Show which device was selected.
print(device)

cuda:0


### Select target labels

In [39]:
# Sound-class labels written with underscores instead of spaces.
TARGETS = [
    "Acoustic_guitar", "Applause", "Bark", "Bass_drum",
    "Burping_or_eructation", "Bus", "Cello", "Chime",
    "Clarinet", "Computer_keyboard", "Cough", "Cowbell",
    "Double_bass", "Drawer_open_or_close", "Electric_piano", "Fart",
    "Finger_snapping", "Fireworks", "Flute", "Glockenspiel",
    "Gong", "Gunshot_or_gunfire", "Harmonica", "Hi-hat",
    "Keys_jangling", "Knock", "Laughter", "Meow",
    "Microwave_oven", "Oboe", "Saxophone", "Scissors",
    "Shatter", "Snare_drum", "Squeak", "Tambourine",
    "Tearing", "Telephone", "Trumpet", "Violin_or_fiddle",
    "Writing",
]

# Human-readable variant of TARGETS: same labels in the same order, with every
# underscore replaced by a space (hyphens such as in "Hi-hat" are kept).
TARGETS_CLEAN = [label.replace("_", " ") for label in TARGETS]

# Target labels passed to compute_embeddings in the run cells of this notebook.
TARGETS_CV = ["female", "male"]

### Run model

In [24]:
# Root logger (no name given); its level is adjusted in the run cells.
logger = logging.getLogger(name=None)

In [40]:
# Container filled in place by compute_embeddings:
# data split -> file name -> embedding tensor. It starts empty.
embeddings_dict = dict()

# Name of the modality processed in this notebook.
modality = "audio"

In [41]:
# train data split
logger.setLevel(logging.ERROR)
compute_embeddings(embeddings_dict, data_split='train', project_path=project_path, targets=TARGETS_CV)  # pass by reference

In [42]:
# val data split
logger.setLevel(logging.ERROR)
compute_embeddings(embeddings_dict, data_split='val', project_path=project_path, targets=TARGETS_CV)  # pass by reference

In [43]:
# test data split
logger.setLevel(logging.ERROR)
compute_embeddings(embeddings_dict, data_split='test', project_path=project_path, targets=TARGETS_CV)  # pass by reference

In [32]:
# embeddings_dict['train']

### Save the processed embeddings as a dict on disk

In [44]:
# Serialise the complete embeddings dict into a pickle file (relative path, so
# it lands in the current working directory), using the newest pickle protocol.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute code.
with open('emb-imagebind-audio-cv-13-14-split-file-tens.pickle', mode='wb') as handle:
    pickle.dump(embeddings_dict, handle, pickle.HIGHEST_PROTOCOL)

## Testing area

In [12]:
# Scratch cell: try out stacking and averaging on two small float32 vectors.
tensor_list = [
    torch.tensor(values, dtype=torch.float32)
    for values in ([1, 2, 3], [2, 3, 4])
]

# Stack the vectors along a new leading dimension (result is not kept).
torch.stack(tensor_list, dim=0)

# A 2x3 float32 matrix, used to inspect `.shape`.
tensor = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)
tensor.shape

# Element-wise mean over the stacked vectors; as the last expression of the
# cell this is the value the notebook displays.
torch.stack(tensor_list, dim=0).mean(dim=0)

torch.Size([2, 3])