In [1]:
from transformers import AutoProcessor, ASTConfig, ASTModel, AutoFeatureExtractor, ASTForAudioClassification, TrainingArguments, Trainer

import torch
import torch.nn as nn
from datasets import load_dataset
import yaml
import torchaudio
import os

from transformers import AdamW
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader
from tqdm import tqdm
import yaml
import numpy as np

In [2]:
model_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"


In [3]:
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint, device_map='cpu')

In [4]:
from collections import OrderedDict


labels2id = OrderedDict(
    {
        "Alarm_bell_ringing": 0,
        "Blender": 1,
        "Cat": 2,
        "Dishes": 3,
        "Dog": 4,
        "Electric_shaver_toothbrush": 5,
        "Frying": 6,
        "Running_water": 7,
        "Speech": 8,
        "Vacuum_cleaner": 9,
    }
)

id2labels = {value: key for key, value in labels2id.items()}

In [5]:
#model.config.labels2id

In [6]:
with open("./confs/default.yaml", "r") as f:
        configs = yaml.safe_load(f)

In [7]:


SAMPLE_RATE = configs["data"]["fs"]
N_FFT = configs["feats"]["n_window"]
WIN_LENGTH = configs["feats"]["n_window"]
HOP_LENGTH = configs["feats"]["hop_length"]
F_MIN = configs["feats"]["f_min"]
F_MAX = configs["feats"]["f_max"]
N_MELS = configs["feats"]["n_mels"]
WINDOW_FN = torch.hamming_window
WKWARGS = {"periodic": False}
POWER = 1
NUM_SAMPLES = SAMPLE_RATE

config = ASTConfig(
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act='gelu',
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    patch_size=16,
    qkv_bias=True,
    frequency_stride=10,
    time_stride=10,
    max_length=1024,
    num_labels=len(labels2id)  # Number of sound event classes
)

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate = SAMPLE_RATE,
        n_fft = N_FFT,
        hop_length= HOP_LENGTH,
        n_mels = N_MELS,
        win_length= WIN_LENGTH,
        f_min=F_MIN,
        f_max=F_MAX,
        window_fn=WINDOW_FN,
        wkwargs=WKWARGS,
        power=POWER
    )

config.num_labels = 11

In [8]:
def process_data(example, audio_file_dir, num_samples, transformation, labels2id, target_sample_rate=16000):
    # This function will do what your __getitem__ method does
    
    # Load audio
    signal, sr = torchaudio.load(os.path.join(audio_file_dir, example['filename']))
    label = example["event_label"]
    label_int = labels2id[label]
    # Resample if necessary
    resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
    signal = resampler(signal)
    
    # Mix down if necessary
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim = 0, keepdim = True)
    
    # Cut if necessary
    onset_frame = int(example['onset'] * target_sample_rate)
    offset_frame = int(example['offset'] * target_sample_rate)
    signal = signal[:, onset_frame:offset_frame]
    if signal.shape[1] > num_samples:
        signal = signal[:, :num_samples]
    
    # Pad if necessary
    length_signal = signal.shape[1]
    if length_signal < num_samples:
        num_missing_samples = num_samples - length_signal
        last_dim_padding = (0, num_missing_samples)
        signal = nn.functional.pad(signal, last_dim_padding)

    signal = signal.view(-1)
    #print(signal.shape)
    #print("#################")

    times = torch.tensor([onset_frame, offset_frame], dtype=torch.float32)
    # Apply transformation
    #signal = transformation(signal)
    #print(signal.shape)

    # Reshape and permute the input tensor
    #signal = signal.squeeze(0).to_dense().permute(2, 0, 1)
    
    example['input_values'] = np.array(signal)
    #example['input_values'] = torch.tensor(example['input_values'])
    example['label'] = torch.tensor(label_int)
    #example['label_int'] = torch.tensor(example['label_int'])
    #example['times'] = times
    #example['times'] = torch.tensor(example['times'])
    return example

In [9]:
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")

dataset = dataset.sort("id")

sampling_rate = dataset.features["audio"].sampling_rate

Found cached dataset librispeech_asr_demo (/home/unegi/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


In [10]:
dataset[0]["audio"]["array"].shape

(93680,)

In [11]:
data_files = {"train": configs["data"]["synth_tsv"], "test": configs["data"]["synth_val_tsv"]}
trial_dataset = load_dataset("csv", data_files=data_files, sep = "\t")

Found cached dataset csv (/home/unegi/.cache/huggingface/datasets/csv/default-03475b778f293dce/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
dataset[0]["audio"]["array"].shape

(93680,)

In [13]:
dataset[0]["audio"]

{'path': '/home/unegi/.cache/huggingface/datasets/downloads/extracted/b1e597323d8b9a7257310b4aaaba1bc74facde05d42c9048752c990cbbd1d77b/dev_clean/1272/128104/1272-128104-0000.flac',
 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 'sampling_rate': 16000}

In [14]:
train_dataset = trial_dataset["train"].map(process_data, 
                                           fn_kwargs={'audio_file_dir': configs["data"]["synth_folder"], 
                                                      'num_samples': NUM_SAMPLES,
                                                       'transformation': mel_spectrogram,
                                                        'labels2id': labels2id,
                                                        'target_sample_rate':16000, 
                                                        })

columns_to_remove = ["onset", "offset", "event_label"]

train_dataset = train_dataset.remove_columns(columns_to_remove) 

Loading cached processed dataset at /home/unegi/.cache/huggingface/datasets/csv/default-03475b778f293dce/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-6048fb7f002aacf1.arrow


In [15]:
test_dataset = trial_dataset["test"].map(process_data, 
                                           fn_kwargs={'audio_file_dir': configs["data"]["synth_val_folder"], 
                                                      'num_samples': NUM_SAMPLES,
                                                       'transformation': mel_spectrogram,
                                                        'labels2id': labels2id,
                                                        'target_sample_rate':16000, 
                                                        })
test_dataset = test_dataset.remove_columns(columns_to_remove) 

Loading cached processed dataset at /home/unegi/.cache/huggingface/datasets/csv/default-03475b778f293dce/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-305c56cf965f6ddf.arrow


In [16]:
use_partial = True

In [17]:
#to check if everything works
if use_partial:
    partial_dataset = train_dataset.select(range(2))
    partial_eval_dataset = test_dataset.select(range(1))

In [18]:
def feature_function(examples):
    return feature_extractor(examples["input_values"],  sampling_rate=16000, return_tensors="pt")

tokenized_train_datasets = partial_dataset.map(feature_function, batched=True)
tokenized_val_datasets = partial_eval_dataset.map(feature_function, batched=True)

Loading cached processed dataset at /home/unegi/.cache/huggingface/datasets/csv/default-03475b778f293dce/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0197f77b7d74d9a4.arrow
Loading cached processed dataset at /home/unegi/.cache/huggingface/datasets/csv/default-03475b778f293dce/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-54c309f33e818951.arrow


In [19]:
#tokenized_train_datasets = tokenized_train_datasets.remove_columns(["filename"]) 

In [20]:
#tokenized_val_datasets = tokenized_val_datasets.remove_columns(["filename"])

In [21]:
tokenized_train_datasets

Dataset({
    features: ['filename', 'input_values', 'label'],
    num_rows: 2
})

In [22]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [23]:
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [24]:
train_dataset = tokenized_train_datasets.rename_column("label", "labels")
valid_dataset = tokenized_val_datasets.rename_column("label", "labels")

In [25]:
train_dataset

Dataset({
    features: ['filename', 'input_values', 'labels'],
    num_rows: 2
})

In [26]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=4,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch"
)

In [27]:
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=valid_dataset, compute_metrics=compute_metrics
)

[codecarbon INFO @ 16:19:33] [setup] RAM Tracking...
[codecarbon INFO @ 16:19:33] [setup] GPU Tracking...
[codecarbon INFO @ 16:19:33] No GPU found.
[codecarbon INFO @ 16:19:33] [setup] CPU Tracking...
[codecarbon ERROR @ 16:19:33] Unable to read Intel RAPL files for CPU power, we will use a constant for your CPU power. Please view https://github.com/mlco2/codecarbon/issues/244 for workarounds : [Errno 13] Permission denied: '/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj'
[codecarbon INFO @ 16:19:33] Tracking Intel CPU via RAPL interface
[codecarbon ERROR @ 16:19:35] Unable to read Intel RAPL files for CPU power, we will use a constant for your CPU power. Please view https://github.com/mlco2/codecarbon/issues/244 for workarounds : [Errno 13] Permission denied: '/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj'
[codecarbon INFO @ 16:19:35] >>> Tracker's metadata:
[codecarbon INFO @ 16:19:35]   Platform system: Linux-5.19.0-43-generic-x86_64-with-glibc2.10
[codecarbon INFO @

In [28]:
trainer.train()



  0%|          | 0/2 [00:00<?, ?it/s]

[codecarbon INFO @ 16:19:53] Energy consumed for RAM : 0.000012 kWh. RAM Power : 2.858745574951172 W
[codecarbon INFO @ 16:19:54] Energy consumed for all CPUs : 0.000000 kWh. All CPUs Power : 0.0 W
[codecarbon INFO @ 16:19:54] 0.000012 kWh of electricity used since the begining.
[codecarbon INFO @ 16:20:08] Energy consumed for RAM : 0.000024 kWh. RAM Power : 2.858745574951172 W
[codecarbon INFO @ 16:20:08] Energy consumed for all CPUs : 0.000000 kWh. All CPUs Power : 0.0 W
[codecarbon INFO @ 16:20:08] 0.000024 kWh of electricity used since the begining.


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.696477890014648, 'eval_accuracy': 0.0, 'eval_runtime': 3.82, 'eval_samples_per_second': 0.262, 'eval_steps_per_second': 0.262, 'epoch': 1.0}


[codecarbon INFO @ 16:20:23] Energy consumed for RAM : 0.000036 kWh. RAM Power : 2.858745574951172 W
[codecarbon INFO @ 16:20:23] Energy consumed for all CPUs : 0.000000 kWh. All CPUs Power : 0.0 W
[codecarbon INFO @ 16:20:23] 0.000036 kWh of electricity used since the begining.
[codecarbon INFO @ 16:20:38] Energy consumed for RAM : 0.000047 kWh. RAM Power : 2.858745574951172 W
[codecarbon INFO @ 16:20:39] Energy consumed for all CPUs : 0.000000 kWh. All CPUs Power : 0.0 W
[codecarbon INFO @ 16:20:39] 0.000047 kWh of electricity used since the begining.
[codecarbon INFO @ 16:20:53] Energy consumed for RAM : 0.000059 kWh. RAM Power : 2.858745574951172 W
[codecarbon INFO @ 16:20:53] Energy consumed for all CPUs : 0.000000 kWh. All CPUs Power : 0.0 W
[codecarbon INFO @ 16:20:53] 0.000059 kWh of electricity used since the begining.
[codecarbon INFO @ 16:21:08] Energy consumed for RAM : 0.000071 kWh. RAM Power : 2.858745574951172 W
[codecarbon INFO @ 16:21:08] Energy consumed for all CPUs :

  0%|          | 0/1 [00:00<?, ?it/s]

[codecarbon INFO @ 16:21:12] Energy consumed for RAM : 0.000074 kWh. RAM Power : 2.858745574951172 W
[codecarbon INFO @ 16:21:12] Energy consumed for all CPUs : 0.000000 kWh. All CPUs Power : 0.0 W
[codecarbon INFO @ 16:21:12] 0.000074 kWh of electricity used since the begining.


{'eval_loss': 6.6900858879089355, 'eval_accuracy': 0.0, 'eval_runtime': 3.0676, 'eval_samples_per_second': 0.326, 'eval_steps_per_second': 0.326, 'epoch': 2.0}
{'train_runtime': 93.5875, 'train_samples_per_second': 0.043, 'train_steps_per_second': 0.021, 'train_loss': 9.947755813598633, 'epoch': 2.0}


TrainOutput(global_step=2, training_loss=9.947755813598633, metrics={'train_runtime': 93.5875, 'train_samples_per_second': 0.043, 'train_steps_per_second': 0.021, 'train_loss': 9.947755813598633, 'epoch': 2.0})

In [29]:
trainer.evaluate()

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.6900858879089355,
 'eval_accuracy': 0.0,
 'eval_runtime': 3.1344,
 'eval_samples_per_second': 0.319,
 'eval_steps_per_second': 0.319,
 'epoch': 2.0}

In [30]:
# Get the training dataset
training_dataset = trainer.train_dataset

# Check the column names
column_names = training_dataset.column_names
print(column_names)

['filename', 'input_values', 'labels']
