In [1]:
import torch
import torch.nn as nn
import yaml
import os
import pandas as pd
import matplotlib.pyplot as plt
import torchaudio

In [2]:
from pathlib import Path
import time

In [3]:
from panns_models import *

In [4]:
from re import A, S
import sys
import librosa
import numpy as np
import argparse
import h5py
import math
import time
import logging
import pickle
import random
from datetime import datetime

import torch
import torch.optim as optim
from torch.utils.data import DataLoader, sampler, Subset

from utils import create_folder, dump_config, process_idc, prepprocess_audio, init_hier_head

import config
from sed_model import SEDWrapper, Ensemble_SEDWrapper
from models import Cnn14_DecisionLevelMax
from data_generator import SEDDataset, DESED_Dataset, ESC_Dataset, SCV2_Dataset


from model.htsat import HTSAT_Swin_Transformer
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
import warnings

from datasets import SEDDataset

warnings.filterwarnings("ignore")

2023-08-08 01:14:25.022028: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-08 01:14:26.384912: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-08 01:14:26.393749: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
from torch.utils.data import Dataset

In [6]:
with open("../confs/default.yaml", "r") as f:
        configs = yaml.safe_load(f)

In [7]:
logging.basicConfig(level=logging.INFO) 
pl.utilities.seed.seed_everything(seed = config.random_seed)

Global seed set to 970131


970131

In [8]:
SAMPLE_RATE = configs["data"]["fs"]
N_FFT = configs["feats"]["n_window"]
WIN_LENGTH = configs["feats"]["n_window"]
HOP_LENGTH = configs["feats"]["hop_length"]
F_MIN = configs["feats"]["f_min"]
F_MAX = configs["feats"]["f_max"]
N_MELS = configs["feats"]["n_mels"]
WINDOW_FN = torch.hamming_window
WKWARGS = {"periodic": False}
POWER = 1
NUM_SAMPLES = SAMPLE_RATE

LEARNING_RATE = configs["opt"]["lr"]
epochs = 5
BATCH_SIZE = 8

    #frame_length_in_seconds
frame_length_sec = HOP_LENGTH / SAMPLE_RATE
device = "cuda" if torch.cuda.is_available() else "cpu"

In [9]:
device

'cpu'

In [10]:
train_dataset = SEDDataset(annotations_file = "../" + configs["data"]["weak_tsv"], 
                                          audio_dir = "../" + configs["data"]["weak_folder"], 
                                          transformation = None, 
                                          target_sample_rate = SAMPLE_RATE,
                                          num_samples = NUM_SAMPLES,
                                          label_column = "event_labels",
                                          device = device)

INFO:root:total dataset size: 1578
INFO:root:class num: 527


527


In [11]:
eval_dataset = SEDDataset(annotations_file = "../" + configs["data"]["val_tsv"],
                                          audio_dir = "../" + configs["data"]["val_folder"],
                                          transformation = None, 
                                          target_sample_rate = SAMPLE_RATE,
                                          num_samples = NUM_SAMPLES,
                                          label_column = "event_label",
                                          device = device)

INFO:root:total dataset size: 4239
INFO:root:class num: 527


527


In [19]:
percentage_train = int(np.ceil(0.95*len(train_dataset)))
list_train_indices = [num for num in range(percentage_train)]
list_val_indices = [num for num in range(percentage_train, len(train_dataset))]
val_dataset = Subset(train_dataset, list_val_indices)
train_dataset = Subset(train_dataset, list_train_indices)


In [17]:
class data_prep(pl.LightningDataModule):
    def __init__(self, train_dataset, eval_dataset, test_dataset):
        super().__init__()
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.test_dataset = test_dataset
        #self.device_num = device_num

    def train_dataloader(self):
        train_sampler = None
        train_loader = DataLoader(
            dataset = self.train_dataset,
            num_workers = config.num_workers,
            batch_size = config.batch_size,
            shuffle = False,
            sampler = train_sampler
        )
        return train_loader
    
    def val_dataloader(self):
        eval_sampler = None
        eval_loader = DataLoader(
            dataset = self.eval_dataset,
            num_workers = config.num_workers,
            batch_size = config.batch_size,
            shuffle = False,
            sampler = eval_sampler
        )
        return eval_loader
    
    def test_dataloader(self):
        test_sampler = None
        test_loader = DataLoader(
            dataset = self.test_dataset,
            num_workers = config.num_workers,
            batch_size = config.batch_size,
            shuffle = False,
            sampler = test_sampler
        )
        return test_loader

In [71]:
config.batch_size

2

In [23]:
train_dataset = Subset(train_dataset, np.arange(20))
val_dataset = Subset(val_dataset, np.arange(5))
eval_dataset = Subset(eval_dataset, np.arange(5))

In [74]:
tb_logger = pl.loggers.TensorBoardLogger(save_dir='./logs/', name='panns_pred')

In [None]:
#train_dataset = ReducedDataset(train_dataset, 100)

In [None]:
#val_dataset = ReducedDataset(val_dataset, 50)

In [20]:
audioset_data = data_prep(train_dataset, val_dataset, eval_dataset)

In [77]:
"""for idx, batch in enumerate(audioset_data.train_dataloader()):
    print(idx)
    audio_data, labels = batch["waveform"], batch["target"]
    if not audio_data.size() == torch.Size([2, 160000]):
        print(audio_data.size())"""

'for idx, batch in enumerate(audioset_data.train_dataloader()):\n    print(idx)\n    audio_data, labels = batch["waveform"], batch["target"]\n    if not audio_data.size() == torch.Size([2, 160000]):\n        print(audio_data.size())'

In [78]:
checkpoint_callback = ModelCheckpoint(
            monitor = "mAP",
            filename='l-{epoch:d}-{mAP:.3f}-{mAUC:.3f}',
            save_top_k = 20,
            mode = "max"
        )

In [79]:
trainer = pl.Trainer(
        deterministic=False,
        gpus = None, 
        max_epochs = 5,
        auto_lr_find = True,    
        sync_batchnorm = True,
        num_sanity_val_steps = 5,
        # resume_from_checkpoint = config.resume_checkpoint,
        gradient_clip_val=1.0,
        logger=tb_logger,
    )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [80]:
"""sed_model = HTSAT_Swin_Transformer(
        spec_size=config.htsat_spec_size,
        patch_size=config.htsat_patch_size,
        in_chans=1,
        num_classes=config.classes_num,
        window_size=config.htsat_window_size,
        config = config,
        depths = config.htsat_depth,
        embed_dim = config.htsat_dim,
        patch_stride=config.htsat_stride,
        num_heads=config.htsat_num_head
    )"""

'sed_model = HTSAT_Swin_Transformer(\n        spec_size=config.htsat_spec_size,\n        patch_size=config.htsat_patch_size,\n        in_chans=1,\n        num_classes=config.classes_num,\n        window_size=config.htsat_window_size,\n        config = config,\n        depths = config.htsat_depth,\n        embed_dim = config.htsat_dim,\n        patch_stride=config.htsat_stride,\n        num_heads=config.htsat_num_head\n    )'

In [81]:
"""model = SEDWrapper(
        sed_model = sed_model, 
        config = config,
        dataset = train_dataset
    )"""

'model = SEDWrapper(\n        sed_model = sed_model, \n        config = config,\n        dataset = train_dataset\n    )'

In [82]:
# model
model_config = {
    "sample_rate": 16000,
    "window_size": 1024,
    "hop_size": 320,
    "mel_bins": 64,
    "fmin": 50,
    "fmax": 14000,
    "classes_num": 10
}
model_config["classes_num"] = 527
model = PANNsCNN14Att(**model_config)
#weights = torch.load("Cnn14_DecisionLevelAtt_mAP0.425.pth", map_location = "cpu")
# Fixed in V3
#model.load_state_dict(weights["model"])
model.att_block = AttBlock(2048, 527, activation='sigmoid')

In [83]:
model_panns = SEDWrapper(
        sed_model = model, 
        config = config,
        dataset = train_dataset
    )

In [84]:
# Get the first batch from the DataLoader
"""first_batch = next(iter(audioset_data.train_dataloader()))

# Extract the input and target data from the batch
input_data = first_batch['waveform']
target_data = first_batch['target']

# Print the size of the input and target data
print("Input Data Size:", input_data.size())
print("Target Data Size:", target_data.size())"""

'first_batch = next(iter(audioset_data.train_dataloader()))\n\n# Extract the input and target data from the batch\ninput_data = first_batch[\'waveform\']\ntarget_data = first_batch[\'target\']\n\n# Print the size of the input and target data\nprint("Input Data Size:", input_data.size())\nprint("Target Data Size:", target_data.size())'

In [85]:
model_panns

SEDWrapper(
  (sed_model): PANNsCNN14Att(
    (spectrogram_extractor): Spectrogram(
      (stft): STFT(
        (conv_real): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
        (conv_imag): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
      )
    )
    (logmel_extractor): LogmelFilterBank()
    (spec_augmenter): SpecAugmentation(
      (time_dropper): DropStripes()
      (freq_dropper): DropStripes()
    )
    (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv_block1): ConvBlock(
      (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (conv_block2): ConvBlock(
      (conv1): Conv2d(64,

In [86]:
trainer.fit(model_panns, audioset_data.train_dataloader())

Missing logger folder: ./logs/panns_pred

  | Name      | Type          | Params
--------------------------------------------
0 | sed_model | PANNsCNN14Att | 82.9 M
--------------------------------------------
81.8 M    Trainable params
1.1 M     Non-trainable params
82.9 M    Total params
331.672   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [251]:
def get_model(config: dict, weights_path: str, device):
    model = PANNsCNN14Att(**config)
    checkpoint = torch.load(weights_path, map_location = "cpu")
    prefix_to_remove = "sed_model."
    new_checkpoint = {k.replace(prefix_to_remove, ""): v for k, v in checkpoint["state_dict"].items()}
    model.load_state_dict(new_checkpoint)
    model.to(device)
    model.eval()
    return model

In [300]:
def prediction(test_df: pd.DataFrame,
               test_audio: Path,
               model_config: dict,
               weights_path: str,
               SR,
               threshold=0.5):
    model = get_model(model_config, weights_path, device)
    unique_audio_id = test_df.filename.unique()

    warnings.filterwarnings("ignore")
    prediction_dfs = []
    for audio_id in unique_audio_id:
        #with timer(f"Loading {audio_id}"):
        clip, _ = librosa.load(os.path.join(test_audio, audio_id),
                                   sr=SR,
                                   mono=True,
                                   res_type="kaiser_fast")
        
        test_df_for_audio_id = test_df.query(
            f"audio_id == '{audio_id}'").reset_index(drop=True)
        #with timer(f"Prediction on {audio_id}"):
        prediction_df = prediction_for_clip(test_df_for_audio_id,
                                                clip=clip,
                                                model=model,
                                                threshold=threshold)

        prediction_dfs.append(prediction_df)
    
    prediction_df = pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True)
    return prediction_df

In [287]:
def prediction_for_clip(test_df: pd.DataFrame,
                        clip: np.ndarray, 
                        model: PANNsCNN14Att,
                        threshold=0.5):
    SR = SAMPLE_RATE
    PERIOD = 30
    audios = []
    y = clip.astype(np.float32)
    len_y = len(y)
    start = 0
    end = PERIOD * SR
    while True:
        y_batch = y[start:end].astype(np.float32)
        if len(y_batch) != PERIOD * SR:
            y_pad = np.zeros(PERIOD * SR, dtype=np.float32)
            y_pad[:len(y_batch)] = y_batch
            audios.append(y_pad)
            break
        start = end
        end += PERIOD * SR
        audios.append(y_batch)
        
    array = np.asarray(audios)
    tensors = torch.from_numpy(array)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.eval()
    estimated_event_list = []
    global_time = 0.0
    file_name = test_df["file_name"].values[0]
    for image in progress_bar(tensors):
        image = image.view(1, image.size(0))
        image = image.to(device)

        with torch.no_grad():
            prediction = model(image)
            framewise_outputs = prediction["framewise_output"].detach(
                ).cpu().numpy()[0]
                
        thresholded = framewise_outputs >= threshold

        for target_idx in range(thresholded.shape[1]):
            if thresholded[:, target_idx].mean() == 0:
                pass
            else:
                detected = np.argwhere(thresholded[:, target_idx]).reshape(-1)
                head_idx = 0
                tail_idx = 0
                while True:
                    if (tail_idx + 1 == len(detected)) or (
                            detected[tail_idx + 1] - 
                            detected[tail_idx] != 1):
                        onset = 0.01 * detected[
                            head_idx] + global_time
                        offset = 0.01 * detected[
                            tail_idx] + global_time
                        onset_idx = detected[head_idx]
                        offset_idx = detected[tail_idx]
                        max_confidence = framewise_outputs[
                            onset_idx:offset_idx, target_idx].max()
                        mean_confidence = framewise_outputs[
                            onset_idx:offset_idx, target_idx].mean()
                        estimated_event = {
                            "file_name": file_name,
                            "event_label_pred": config.id2classes[target_idx],
                            "onset_pred": onset,
                            "offset_pred": offset,
                            "max_confidence": max_confidence,
                            "mean_confidence": mean_confidence
                        }
                        estimated_event_list.append(estimated_event)
                        head_idx = tail_idx + 1
                        tail_idx = tail_idx + 1
                        if head_idx >= len(detected):
                            break
                    else:
                        tail_idx += 1
        global_time += PERIOD
        
    prediction_df = pd.DataFrame(estimated_event_list)
    return prediction_df

In [288]:
with open("../confs/default.yaml", "r") as f:
        configs = yaml.safe_load(f)

In [289]:
SAMPLE_RATE = configs["data"]["fs"]
N_FFT = configs["feats"]["n_window"]
WIN_LENGTH = configs["feats"]["n_window"]
HOP_LENGTH = configs["feats"]["hop_length"]
F_MIN = configs["feats"]["f_min"]
F_MAX = configs["feats"]["f_max"]
N_MELS = configs["feats"]["n_mels"]
WINDOW_FN = torch.hamming_window
WKWARGS = {"periodic": False}
POWER = 1
NUM_SAMPLES = SAMPLE_RATE
LEARNING_RATE = configs["opt"]["lr"]
BATCH_SIZE = 8

In [290]:
#frame_length_in_seconds
frame_length_sec = HOP_LENGTH / SAMPLE_RATE

In [291]:
test_df = pd.read_csv("../" + configs["data"]["val_tsv"], sep = "\t")

In [292]:
test_df

Unnamed: 0,filename,onset,offset,event_label
0,Y00pbt6aJV8Y_350.000_360.000.wav,0.000,9.971,Vacuum_cleaner
1,Y00pK0GMmE9s_70.000_80.000.wav,0.000,10.000,Vacuum_cleaner
2,Y02sD1KJeoGA_50.000_60.000.wav,0.000,10.000,Frying
3,Y0bjUq9XMMmQ_30.000_40.000.wav,0.000,10.000,Frying
4,Y0cH_NlhhMAs_30.000_40.000.wav,1.710,6.005,Cat
...,...,...,...,...
4234,Yb8GxUkjLSUY_628.000_638.000.wav,4.772,5.228,Speech
4235,Yb8GxUkjLSUY_628.000_638.000.wav,5.606,6.360,Speech
4236,Yb8GxUkjLSUY_628.000_638.000.wav,7.644,8.220,Speech
4237,Yb8GxUkjLSUY_628.000_638.000.wav,8.524,9.391,Speech


In [13]:
trainer = pl.Trainer(
        deterministic=False,
        gpus = 0, 
        max_epochs = config.max_epoch,   
        sync_batchnorm = True,
        num_sanity_val_steps = 0,
        # resume_from_checkpoint = config.resume_checkpoint,
        gradient_clip_val=1.0,
    )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
sed_model = HTSAT_Swin_Transformer(
        spec_size=config.htsat_spec_size,
        patch_size=config.htsat_patch_size,
        in_chans=1,
        num_classes=config.classes_num,
        window_size=config.htsat_window_size,
        config = config,
        depths = config.htsat_depth,
        embed_dim = config.htsat_dim,
        patch_stride=config.htsat_stride,
        num_heads=config.htsat_num_head
    )

In [15]:
model = SEDWrapper(
        sed_model = sed_model, 
        config = config,
        dataset = eval_dataset
    )

In [25]:
len(eval_dataset)

5

In [26]:
audioset_data = data_prep(train_dataset, val_dataset, eval_dataset)

In [27]:
trainer.test(model, audioset_data.test_dataloader())

Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         dprime                     0.0
           f1                       0.0
           mAP             0.0043327008222643895
          mAUC                      0.0
        precision                   0.0
         recall                     0.0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'mAP': 0.0043327008222643895,
  'mAUC': 0.0,
  'dprime': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0}]

In [293]:
test_df.drop(test_df.columns[1:3], axis=1, inplace=True)

In [294]:
test_df = test_df.dropna()

In [295]:
reduced_test_df = test_df.sample(34)

In [296]:
TEST_AUDIO_DIR = "../" + configs["data"]["val_folder"]

In [297]:
weights_path = "./logs/panns_pred/version_0/checkpoints/epoch=4-step=50.ckpt"

In [298]:
os.path.exists(weights_path)

True

In [301]:
prediction_df = prediction(test_df=reduced_test_df,
                           test_audio=TEST_AUDIO_DIR,
                           model_config=model_config,
                           weights_path=weights_path,
                           threshold=0.5, SR = SAMPLE_RATE)

UndefinedVariableError: name 'audio_id' is not defined

In [None]:
a = torch.load(weights_path, map_location = "cpu")

In [122]:
a["state_dict"] 

OrderedDict([('sed_model.spectrogram_extractor.stft.conv_real.weight',
              tensor([[[ 0.0000e+00,  9.4124e-06,  3.7649e-05,  ...,  8.4709e-05,
                         3.7649e-05,  9.4124e-06]],
              
                      [[ 0.0000e+00,  9.4122e-06,  3.7646e-05,  ...,  8.4695e-05,
                         3.7646e-05,  9.4122e-06]],
              
                      [[ 0.0000e+00,  9.4117e-06,  3.7638e-05,  ...,  8.4652e-05,
                         3.7638e-05,  9.4117e-06]],
              
                      ...,
              
                      [[ 0.0000e+00, -9.4117e-06,  3.7638e-05,  ..., -8.4652e-05,
                         3.7638e-05, -9.4117e-06]],
              
                      [[ 0.0000e+00, -9.4122e-06,  3.7646e-05,  ..., -8.4695e-05,
                         3.7646e-05, -9.4122e-06]],
              
                      [[ 0.0000e+00, -9.4124e-06,  3.7649e-05,  ..., -8.4709e-05,
                         3.7649e-05, -9.4124e-06]]])),
     

In [123]:
prefix_to_remove = "sed_model."

# Create a new dictionary with renamed keys
renamed_dict = {k.replace(prefix_to_remove, ""): v for k, v in a["state_dict"].items()}


In [125]:
renamed_dict.keys()

dict_keys(['spectrogram_extractor.stft.conv_real.weight', 'spectrogram_extractor.stft.conv_imag.weight', 'logmel_extractor.melW', 'bn0.weight', 'bn0.bias', 'bn0.running_mean', 'bn0.running_var', 'bn0.num_batches_tracked', 'conv_block1.conv1.weight', 'conv_block1.conv2.weight', 'conv_block1.bn1.weight', 'conv_block1.bn1.bias', 'conv_block1.bn1.running_mean', 'conv_block1.bn1.running_var', 'conv_block1.bn1.num_batches_tracked', 'conv_block1.bn2.weight', 'conv_block1.bn2.bias', 'conv_block1.bn2.running_mean', 'conv_block1.bn2.running_var', 'conv_block1.bn2.num_batches_tracked', 'conv_block2.conv1.weight', 'conv_block2.conv2.weight', 'conv_block2.bn1.weight', 'conv_block2.bn1.bias', 'conv_block2.bn1.running_mean', 'conv_block2.bn1.running_var', 'conv_block2.bn1.num_batches_tracked', 'conv_block2.bn2.weight', 'conv_block2.bn2.bias', 'conv_block2.bn2.running_mean', 'conv_block2.bn2.running_var', 'conv_block2.bn2.num_batches_tracked', 'conv_block3.conv1.weight', 'conv_block3.conv2.weight', 'c

In [53]:
trainer = pl.Trainer(
        deterministic=False,
        gpus = 0, 
        max_epochs = 1,   
        sync_batchnorm = True,
        num_sanity_val_steps = 0,
        # resume_from_checkpoint = config.resume_checkpoint,
        gradient_clip_val=1.0,
        logger=tb_logger,

    )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [54]:
sed_model = HTSAT_Swin_Transformer(
        spec_size=config.htsat_spec_size,
        patch_size=config.htsat_patch_size,
        in_chans=1,
        num_classes=config.classes_num,
        window_size=config.htsat_window_size,
        config = config,
        depths = config.htsat_depth,
        embed_dim = config.htsat_dim,
        patch_stride=config.htsat_stride,
        num_heads=config.htsat_num_head
    )

In [55]:
model = SEDWrapper(
        sed_model = sed_model, 
        config = config,
        dataset = eval_dataset
    )

In [57]:
trainer.test(model_panns, audioset_data.test_dataloader())

Testing: 0it [00:00, ?it/s]

RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

In [None]:
trainer.fit(model, audioset_data)

In [None]:
trainer.validate(model=model, dataloaders=audioset_data.val_dataloader(), ckpt_path=None, verbose=True, datamodule=None)

In [None]:
!tensorboard --logdir=./logs


NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

E0802 02:29:14.676294 140266275452736 application.py:125] Failed to load plugin WhatIfToolPluginLoader.load; ignoring it.
Traceback (most recent call last):
  File "/home/unegi/anaconda3/envs/dcase2023/lib/python3.8/site-packages/tensorboard/backend/application.py", line 123, in TensorBoardWSGIApp
    plugin = loader.load(context)
  File "/home/unegi/anaconda3/envs/dcase2023/lib/python3.8/site-packages/tensorboard_plugin_wit/wit_plugin_loader.py", line 57, in load
    from tensorboard_plugin_wit.wit_plugin import WhatIfToolPlugin
  File "/home/unegi/anaconda3/envs/dcase2023/lib/python3.8/site-packages/tensorboard_plugin_wit/wit_plugin.py", line 40, in <module>
    from tensorboard_plugin_wit._utils import common_utils
  File "/home/unegi/anaconda3/envs/dcase2023/lib/python3.8/site-packages/tenso

In [None]:
first_batch = next(iter(audioset_data.train_dataloader()))

# 'first_batch' will be a tuple containing the data and labels
# You can access the individual elements as follows
audio_data, labels =  batch["waveform"], batch["target"]

# Do whatever you want with the audio_data and labels
print("Audio data shape:", audio_data.shape)
print("Labels:", labels.shape)

Audio data shape: torch.Size([2, 160000])
Labels: torch.Size([2, 527])


In [None]:
audioset_data.train_dataloader()

<torch.utils.data.dataloader.DataLoader at 0x7fb508ca98b0>

0


RuntimeError: stack expects each tensor to be equal size, but got [62045] at entry 0 and [43840] at entry 1

In [None]:
ckpt = torch.load(config.panns_pretrain_path, map_location="cpu")

In [None]:
ckpt

{'iteration': 780000,
 'model': OrderedDict([('spectrogram_extractor.stft.conv_real.weight',
               tensor([[[ 0.0000e+00,  9.4124e-06,  3.7649e-05,  ...,  8.4709e-05,
                          3.7649e-05,  9.4124e-06]],
               
                       [[ 0.0000e+00,  9.4122e-06,  3.7646e-05,  ...,  8.4695e-05,
                          3.7646e-05,  9.4122e-06]],
               
                       [[ 0.0000e+00,  9.4117e-06,  3.7638e-05,  ...,  8.4652e-05,
                          3.7638e-05,  9.4117e-06]],
               
                       ...,
               
                       [[ 0.0000e+00, -9.4117e-06,  3.7638e-05,  ..., -8.4652e-05,
                          3.7638e-05, -9.4117e-06]],
               
                       [[ 0.0000e+00, -9.4122e-06,  3.7646e-05,  ..., -8.4695e-05,
                          3.7646e-05, -9.4122e-06]],
               
                       [[ 0.0000e+00, -9.4124e-06,  3.7649e-05,  ..., -8.4709e-05,
                    

In [None]:
load_state_dict(ckpt["model"])

NameError: name 'load_state_dict' is not defined

In [None]:
for keys in ckpt["model"]:
    print(keys)

spectrogram_extractor.stft.conv_real.weight
spectrogram_extractor.stft.conv_imag.weight
logmel_extractor.melW
bn0.weight
bn0.bias
bn0.running_mean
bn0.running_var
bn0.num_batches_tracked
conv_block1.conv1.weight
conv_block1.conv2.weight
conv_block1.bn1.weight
conv_block1.bn1.bias
conv_block1.bn1.running_mean
conv_block1.bn1.running_var
conv_block1.bn1.num_batches_tracked
conv_block1.bn2.weight
conv_block1.bn2.bias
conv_block1.bn2.running_mean
conv_block1.bn2.running_var
conv_block1.bn2.num_batches_tracked
conv_block2.conv1.weight
conv_block2.conv2.weight
conv_block2.bn1.weight
conv_block2.bn1.bias
conv_block2.bn1.running_mean
conv_block2.bn1.running_var
conv_block2.bn1.num_batches_tracked
conv_block2.bn2.weight
conv_block2.bn2.bias
conv_block2.bn2.running_mean
conv_block2.bn2.running_var
conv_block2.bn2.num_batches_tracked
conv_block3.conv1.weight
conv_block3.conv2.weight
conv_block3.bn1.weight
conv_block3.bn1.bias
conv_block3.bn1.running_mean
conv_block3.bn1.running_var
conv_block3.bn