<a href="https://colab.research.google.com/github/tummalapallimurali/GenAI/blob/main/Fine_Tuning_using_AST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[torch] datasets[audio] audiomentations evaluate



## 1. Load Your Data in the Correct Format

### Pre-process the audio

In [2]:
from datasets import load_dataset, Dataset, Audio, ClassLabel, Features

# Fetch the "train" split of the ESC-50 corpus from the Hugging Face Hub.
esc50_dataset = load_dataset("ashraq/esc50", split="train")

# Schema of a tiny demo dataset: one audio column plus an integer class column
# stored under the key "labels".
class_labels = ClassLabel(names=["dog", "chirping_birds"])
features = Features({"audio": Audio(), "labels": class_labels})

# Reuse the first two ESC-50 clips as the demo examples and label them 0 and 1.
dataset = Dataset.from_dict(
    {
        "audio": [esc50_dataset[row]["audio"] for row in range(2)],
        "labels": [0, 1],
    },
    features=features,
)

print(dataset[0])

Repo card metadata block was not found. Setting CardData to empty.


{'audio': {'path': None, 'array': array([0., 0., 0., ..., 0., 0., 0.]), 'sampling_rate': 44100}, 'labels': 0}


In [3]:
# Display one raw ESC-50 record to see the columns that come with each clip.
esc50_dataset[1]

{'filename': '1-100038-A-14.wav',
 'fold': 1,
 'target': 14,
 'category': 'chirping_birds',
 'esc10': False,
 'src_file': 100038,
 'take': 'A',
 'audio': {'path': None,
  'array': array([-0.01184082, -0.10336304, -0.14141846, ...,  0.06985474,
          0.04049683,  0.00274658]),
  'sampling_rate': 44100}}

In [4]:
import numpy as np
from datasets import Audio, ClassLabel

# Build the list of class names ordered by integer target id: locate the first
# row at which each distinct target occurs and read that row's category.
df = esc50_dataset.select_columns(["target", "category"]).to_pandas()
first_occurrence = np.unique(df["target"], return_index=True)[1]
class_names = df.iloc[first_occurrence]["category"].to_list()

# Turn "target" into a ClassLabel, decode the audio at 16 kHz, and expose the
# target column under the name "labels".
esc50_dataset = (
    esc50_dataset
    .cast_column("target", ClassLabel(names=class_names))
    .cast_column("audio", Audio(sampling_rate=16000))
    .rename_column("target", "labels")
)
num_labels = len(np.unique(esc50_dataset["labels"]))

## 2. AST Model Input

In [5]:
from transformers import ASTFeatureExtractor

# Checkpoint that provides the feature extractor (and is reused when the model is loaded).
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model)

# Remember the extractor's sampling rate and the name of the input it produces;
# the preprocessing functions rely on both.
SAMPLING_RATE = feature_extractor.sampling_rate
model_input_name = feature_extractor.model_input_names[0]  # key -> 'input_values'

In [6]:
# Print every example of the demo dataset together with its position.
for position in range(len(dataset)):
    print(position, dataset[position])

0 {'audio': {'path': None, 'array': array([0., 0., 0., ..., 0., 0., 0.]), 'sampling_rate': 44100}, 'labels': 0}
1 {'audio': {'path': None, 'array': array([-0.01184082, -0.10336304, -0.14141846, ...,  0.06985474,
        0.04049683,  0.00274658]), 'sampling_rate': 44100}, 'labels': 1}


In [7]:
# List the columns of the demo dataset.
print(dataset.column_names)


['audio', 'labels']


In [9]:
import torch

# Preprocessing function
def preprocess_audio(batch):
    """Convert a batch of decoded audio clips into model inputs.

    Args:
        batch: Mapping with an "input_values" entry holding decoded audio
            dicts (each exposing an "array") and, optionally, a "labels" entry.

    Returns:
        dict: The feature extractor output stored under ``model_input_name``,
        plus "labels" as a list when the batch carried labels.
    """
    labels = batch["labels"] if "labels" in batch else None

    waveforms = []
    for clip in batch["input_values"]:
        waveforms.append(clip["array"])

    # The waveforms are declared to the extractor as sampled at SAMPLING_RATE.
    features = feature_extractor(waveforms, sampling_rate=SAMPLING_RATE, return_tensors="pt")
    processed = {model_input_name: features.get(model_input_name)}

    # Labels are optional; forward them only when the batch has them.
    if labels is None:
        return processed
    processed["labels"] = list(labels)
    return processed

# Resample the clips to the rate the feature extractor expects, then expose them
# under the column name read by preprocess_audio. Without the cast the clips
# keep their original rate while being passed to the extractor as SAMPLING_RATE.
# The guard keeps the cell re-runnable: once renamed, no "audio" column is left.
if "audio" in dataset.column_names:
    dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
    dataset = dataset.rename_column("audio", "input_values")
dataset.set_transform(preprocess_audio, output_all_columns=True)

# Estimate the statistics used for normalization. Normalization is switched off
# while measuring so the values describe the un-normalized extractor output.
feature_extractor.do_normalize = False
mean = []
std = []

for sample in dataset:
    spectrogram = sample[model_input_name]
    # .item() keeps plain Python floats in the lists instead of 0-d tensors.
    mean.append(torch.mean(spectrogram).item())
    std.append(torch.std(spectrogram).item())

# Store the averaged per-example statistics as plain floats and re-enable normalization.
feature_extractor.mean = float(np.mean(mean))
feature_extractor.std = float(np.mean(std))
feature_extractor.do_normalize = True


In [None]:
# Display the second example of the dataset.
dataset[1]

In [10]:
# Hold out 20% of the examples as a test split; shuffling uses a fixed seed (0).
dataset = dataset.train_test_split(test_size= 0.2, shuffle=True, seed=0)

In [11]:
# add audioaugmentations

from audiomentations import Compose, AddGaussianSNR, GainTransition, Gain, ClippingDistortion, TimeStretch, PitchShift

# Augmentation pipeline used by the training-set preprocessing: noise, gain,
# gain transition, clipping, time-stretch and pitch-shift transforms.
# NOTE(review): Compose is built with p=0.8 and shuffle=True — presumably the
# probability of applying the pipeline and a random transform order; confirm
# against the audiomentations documentation.
audio_augmentations = Compose([
    AddGaussianSNR(min_snr_db=10, max_snr_db=20),
    Gain(min_gain_db=-6, max_gain_db=6),
    GainTransition(min_gain_db=-6, max_gain_db=6, min_duration=0.01, max_duration=0.3, duration_unit="fraction"),
    # The only transform given an explicit probability of its own.
    ClippingDistortion(min_percentile_threshold=0, max_percentile_threshold=30, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.2),
    PitchShift(min_semitones=-4, max_semitones=4),
], p=0.8, shuffle=True)

In [12]:
import torch

# Preprocessing function
def preprocess_audio_with_transforms(batch):
    """Augment a batch of decoded audio clips and convert them into model inputs.

    Args:
        batch: Mapping with an "input_values" entry holding decoded audio
            dicts (each exposing an "array") and, optionally, a "labels" entry.

    Returns:
        dict: The feature extractor output stored under ``model_input_name``,
        plus "labels" as a list when the batch carried labels.
    """
    labels = batch["labels"] if "labels" in batch else None

    def augment(clip):
        # Run one waveform through the augmentation pipeline.
        return audio_augmentations(clip["array"], sample_rate=SAMPLING_RATE)

    augmented = list(map(augment, batch["input_values"]))
    encoded = feature_extractor(augmented, sampling_rate=SAMPLING_RATE, return_tensors="pt")

    result = {model_input_name: encoded.get(model_input_name)}
    if labels is not None:
        # Labels are optional; forward them only when the batch has them.
        result["labels"] = list(labels)
    return result

# The raw audio lives in "input_values" (the "audio" column was renamed before
# the split), so that is the column to resample to the extractor's rate.
dataset = dataset.cast_column("input_values", Audio(sampling_rate=feature_extractor.sampling_rate))

# with augmentations on the training set
dataset["train"].set_transform(preprocess_audio_with_transforms, output_all_columns=False)
# w/o augmentations on the test set
dataset["test"].set_transform(preprocess_audio, output_all_columns=False)


In [13]:
from transformers import ASTConfig, ASTForAudioClassification

# Start from the checkpoint's configuration and point it at our label set.
config = ASTConfig.from_pretrained(pretrained_model)
label2id = dict(zip(class_names, range(len(class_names))))
config.num_labels = num_labels
config.label2id = label2id
config.id2label = {index: name for name, index in label2id.items()}

# ignore_mismatched_sizes=True lets the checkpoint load even though its
# classifier layer has a different output size than the one configured here.
model = ASTForAudioClassification.from_pretrained(
    pretrained_model,
    config=config,
    ignore_mismatched_sizes=True,
)
model.init_weights()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([50]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([50, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Configure the training run. Evaluation and checkpointing both happen once per
# epoch (the two strategies must match for load_best_model_at_end), so no
# step-based eval/save intervals are set: they would be ignored.
training_args = TrainingArguments(
    output_dir="./runs/ast_classifier",
    logging_dir="./logs/ast_classifier",
    report_to="tensorboard",
    learning_rate=5e-5,  # Learning rate
    push_to_hub=False,
    num_train_epochs=10,  # Number of epochs
    per_device_train_batch_size=8,  # Batch size per device
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Checkpoint at the end of every epoch
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    logging_strategy="steps",
    logging_steps=20,
)

# define evaluation metrics

import evaluate
import numpy as np

# Load the four classification metrics and bundle them into one combined evaluator.
accuracy, recall, precision, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "recall", "precision", "f1")
)
metric = evaluate.combine([accuracy, recall, precision, f1])

# Averaging mode: "binary" for up to two labels, "macro" beyond that.
AVERAGE = "binary" if config.num_labels <= 2 else "macro"


def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy, recall, precision and F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict: Metric name -> value, with the keys "accuracy", "recall",
        "precision" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Accuracy takes no averaging mode, while recall/precision/F1 fall back to
    # average="binary" unless told otherwise, which fails for multi-class
    # targets. Computing the metrics one by one passes AVERAGE only to the
    # metrics that accept it.
    scores = accuracy.compute(predictions=predictions, references=labels)
    for averaged_metric in (recall, precision, f1):
        scores.update(
            averaged_metric.compute(predictions=predictions, references=labels, average=AVERAGE)
        )
    return scores

from transformers import Trainer

# Wire the model, the training arguments, both dataset splits and the metric
# callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

# Launch fine-tuning.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1,No log,4.445645,0.0,0.0,0.0,0.0
2,No log,5.137488,0.0,0.0,0.0,0.0
3,No log,5.931888,0.0,0.0,0.0,0.0
4,No log,6.838593,0.0,0.0,0.0,0.0
5,No log,7.609181,0.0,0.0,0.0,0.0
6,No log,8.184801,0.0,0.0,0.0,0.0
7,No log,8.591196,0.0,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Display the configuration object that was used to build the model.
config