<a href="https://colab.research.google.com/github/zubairr1/Voice-Recognition-Using-Transformers/blob/main/Voice_Rec_Using_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install pydub
!pip install pydub

# Ensure ffmpeg is available (only necessary on some platforms like Google Colab)
!apt-get install -y ffmpeg

from google.colab import drive
drive.mount('/content/drive')

!pip install accelerate -U
!pip install datasets

import os
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Mounted at /content/drive
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_c

In [None]:
import torch
import torchaudio
import os
import numpy as np
from datasets import Dataset, load_metric
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import random
from collections import Counter
from sklearn.metrics import classification_report, accuracy_score
!pip install audiomentations
from audiomentations import Compose, AddGaussianNoise, PitchShift, Shift




In [None]:
def load_audio_files(audio_path, target_sr=16000, max_duration=20, max_files_per_singer=20):
    audio_data = []
    labels = []

    singer_folders = os.listdir(audio_path)

    for singer in singer_folders:
        singer_path = os.path.join(audio_path, singer)
        audio_files = os.listdir(singer_path)

        # Shuffle and limit the number of files per singer
        random.shuffle(audio_files)
        audio_files = audio_files[:max_files_per_singer]

        for audio_file in audio_files:
            file_path = os.path.join(singer_path, audio_file)

            # Load and resample audio using torchaudio
            waveform, sample_rate = torchaudio.load(file_path)
            if sample_rate != target_sr:
                waveform = torchaudio.functional.resample(waveform, sample_rate, target_sr)

            # Convert to mono if stereo
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            # Trim or pad to max_duration
            if waveform.shape[1] > max_duration * target_sr:
                waveform = waveform[:, :max_duration * target_sr]
            else:
                waveform = torch.nn.functional.pad(waveform, (0, max_duration * target_sr - waveform.shape[1]))

            # Normalize
            waveform = waveform / torch.max(torch.abs(waveform))

            # Convert to NumPy array and append
            audio_data.append(waveform.squeeze().numpy()) # Convert to NumPy array here
            labels.append(singer)

    # Stack the NumPy arrays into a single array
    return np.stack(audio_data), np.array(labels) # Stack the NumPy arrays

In [None]:
!pip install audiomentations

import audiomentations
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch

def augment_audio(waveform, sr):
    augment = Compose([
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        # Use TimeStretch instead of Shift, and adjust parameters accordingly
        TimeStretch(min_rate=0.9, max_rate=1.1, p=0.5),
    ])
    augmented = augment(samples=waveform, sample_rate=sr)
    return augmented



In [None]:
# Load and preprocess data
audio_path = '/content/drive/MyDrive/DATASET/'
audio_data, labels = load_audio_files(audio_path, max_duration=20, max_files_per_singer=20)

unique_labels = np.unique(labels)
print("Unique labels (singer names):", unique_labels)

# Create a mapping of labels to integers
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for label, i in label_to_id.items()}

# Convert string labels to integer ids
label_ids = np.array([label_to_id[label] for label in labels])

# Split the data into training and validation sets
train_data, val_data, train_labels, val_labels = train_test_split(
    audio_data, label_ids, test_size=0.2, stratify=label_ids, random_state=42
)

Unique labels (singer names): ['SG' 'SPB' 'SRM']


In [None]:
# Create datasets with augmentation
def create_dataset_with_augmentation(data, labels):
    augmented_data = []
    augmented_labels = []
    for waveform, label in zip(data, labels):
        augmented_data.append(waveform)
        augmented_labels.append(label)
        augmented_waveform = augment_audio(waveform, 16000)
        augmented_data.append(augmented_waveform)
        augmented_labels.append(label)
    return Dataset.from_dict({"input_values": augmented_data, "label": augmented_labels})


In [None]:
import numpy as np
from datasets import Dataset

def create_dataset_with_augmentation(train_data, train_labels, sample_rate=16000):
    augmented_data = []
    augmented_labels = []

    for audio, label in zip(train_data, train_labels):
        # Convert audio to float32 if it's not already
        audio_float = audio.astype(np.float32)

        # Normalize the audio to the range [-1, 1]
        audio_normalized = audio_float / np.max(np.abs(audio_float))

        # Apply augmentation using the augment_audio function
        augmented_audio = augment_audio(audio_normalized, sample_rate)

        # Add both original and augmented audio to the dataset
        augmented_data.append(audio_normalized)
        augmented_labels.append(label)
        augmented_data.append(augmented_audio)
        augmented_labels.append(label)

    return Dataset.from_dict({"input_values": augmented_data, "label": augmented_labels})

# Use the modified function
train_dataset = create_dataset_with_augmentation(train_data, train_labels)

# The validation dataset creation remains the same
val_dataset = Dataset.from_dict({"input_values": val_data.tolist(), "label": val_labels.tolist()})

# Initialize feature extractor and model for audio classification
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
classification_model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(unique_labels),
    ignore_mismatched_sizes=True
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train_dataset = create_dataset_with_augmentation(train_data, train_labels)
val_dataset = Dataset.from_dict({"input_values": val_data.tolist(), "label": val_labels.tolist()})

# Initialize feature extractor and model for audio classification
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
classification_model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(unique_labels),
    ignore_mismatched_sizes=True
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def data_collator(features):
    input_features = [feature["input_values"] for feature in features]
    label_ids = [feature["label"] for feature in features]

    inputs = feature_extractor(
        input_features,
        sampling_rate=16000,
        padding="max_length",
        max_length=int(16000 * 20),  # 20 seconds max length
        truncation=True,
        return_tensors="pt"
    )

    inputs["labels"] = torch.tensor(label_ids)
    return inputs

In [None]:
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": load_metric("f1").compute(predictions=predictions, references=labels, average="weighted")["f1"],
    }



In [None]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=True,
    dataloader_num_workers=2,
    max_grad_norm=0.5,
    gradient_checkpointing=True,
    save_total_limit=2,
    push_to_hub=False,
)



In [None]:
# Create Trainer
trainer = Trainer(
    model=classification_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


  self.pid = os.fork()


Step,Training Loss,Validation Loss,Accuracy,F1
100,No log,0.914062,0.833333,0.822222


  "f1": load_metric("f1").compute(predictions=predictions, references=labels, average="weighted")["f1"],


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

  self.pid = os.fork()
  self.pid = os.fork()


Step,Training Loss,Validation Loss,Accuracy,F1
100,No log,0.914062,0.833333,0.822222
200,No log,0.250417,1.0,1.0
300,No log,0.051832,1.0,1.0


  self.pid = os.fork()
  self.pid = os.fork()


TrainOutput(global_step=300, training_loss=0.6019712320963542, metrics={'train_runtime': 1923.4577, 'train_samples_per_second': 2.496, 'train_steps_per_second': 0.156, 'total_flos': 8.71551101952e+17, 'train_loss': 0.6019712320963542, 'epoch': 50.0})

In [None]:
def test_multiple_samples(model, feature_extractor, audio_path, id_to_label, num_samples=100, max_duration=20):
    predictions = []
    true_labels = []

    singers = os.listdir(audio_path)
    samples_per_singer = num_samples // len(singers)

    for singer in singers:
        singer_path = os.path.join(audio_path, singer)
        audio_files = os.listdir(singer_path)

        for _ in range(samples_per_singer):
            audio_file = random.choice(audio_files)
            file_path = os.path.join(singer_path, audio_file)

            waveform, sample_rate = torchaudio.load(file_path)
            if sample_rate != 16000:
                waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            # Trim or pad to max_duration
            if waveform.shape[1] > max_duration * 16000:
                waveform = waveform[:, :max_duration * 16000]
            else:
                waveform = torch.nn.functional.pad(waveform, (0, max_duration * 16000 - waveform.shape[1]))

            waveform = waveform.squeeze().numpy()

            inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding="max_length", max_length=int(16000 * max_duration), truncation=True)
            inputs = {k: v.to('cuda').half() for k, v in inputs.items()}

            with torch.no_grad():
                logits = model(**inputs).logits
            predicted_class_id = logits.argmax().item()
            predicted_label = id_to_label[predicted_class_id]

            predictions.append(predicted_label)
            true_labels.append(singer)

    accuracy = accuracy_score(true_labels, predictions)
    return predictions, true_labels, accuracy

In [None]:
from transformers import TrainerCallback

class LoggingCallback(TrainerCallback):
    def __init__(self, log_path):
        self.log_path = log_path

    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero:
            with open(self.log_path, 'a') as f:
                f.write(json.dumps(logs) + '\n')

In [None]:
import json

# Create the callback
logging_callback = LoggingCallback("training_log.jsonl")

# Add the callback to the trainer
trainer.add_callback(logging_callback)

# Train the model
train_result = trainer.train()

# Print training and validation losses
with open("training_log.jsonl", 'r') as f:
    logs = [json.loads(line) for line in f]

train_losses = [log['loss'] for log in logs if 'loss' in log]
eval_losses = [log['eval_loss'] for log in logs if 'eval_loss' in log]

print("Training Losses:", train_losses)
print("Validation Losses:", eval_losses)

# Plot the losses
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Training Loss')


plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training and Validation Losses')
plt.legend()
plt.savefig('loss_plot.png')
plt.close()

# Clear CUDA cache
torch.cuda.empty_cache()

# Save the trained model
trainer.save_model("./trained_audio_classifier")

# Move model to GPU and convert to half precision
classification_model = classification_model.to('cuda').half()

# Run multi-sample test
predictions, true_labels, accuracy = test_multiple_samples(classification_model, feature_extractor, audio_path, id_to_label, num_samples=100, max_duration=20)

print(f"Test Accuracy: {accuracy}")
print("Prediction distribution:", Counter(predictions))
print("True label distribution:", Counter(true_labels))
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(true_labels, predictions))

You are adding a <class '__main__.LoggingCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
NotebookProgressCallback
LoggingCallback
LoggingCallback


Step,Training Loss,Validation Loss,Accuracy,F1
100,No log,0.116491,1.0,1.0
200,No log,0.055099,1.0,1.0
300,No log,0.022189,1.0,1.0


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Training Losses: []
Validation Losses: [0.1676432341337204, 0.0795186385512352, 0.028695425018668175, 0.1164906844496727, 0.1164906844496727, 0.1164906844496727, 0.0550994873046875, 0.0550994873046875, 0.0550994873046875, 0.0221888218075037, 0.0221888218075037, 0.0221888218075037]
Test Accuracy: 0.9797979797979798
Prediction distribution: Counter({'SRM': 35, 'SG': 33, 'SPB': 31})
True label distribution: Counter({'SPB': 33, 'SRM': 33, 'SG': 33})
Accuracy: 0.9797979797979798

Classification Report:
              precision    recall  f1-score   support

          SG       1.00      1.00      1.00        33
         SPB       1.00      0.94      0.97        33
         SRM       0.94      1.00      0.97        33

    accuracy                           0.98        99
   macro avg       0.98      0.98      0.98        99
weighted avg       0.98      0.98      0.98        99



In [None]:
import os
import torch
import torchaudio
import numpy as np

# Test the model on a single sample
def test_single_sample(model, feature_extractor, audio_path, id_to_label, unique_labels):
    # Randomly select a singer and a song
    test_singer = random.choice(unique_labels)
    test_singer_path = os.path.join(audio_path, test_singer)
    test_audio_file = random.choice(os.listdir(test_singer_path))
    test_audio_path = os.path.join(test_singer_path, test_audio_file)

    print(f"\nTesting on a single sample:")
    print(f"Selected singer: {test_singer}")
    print(f"Selected audio file: {test_audio_file}")

    # Load and preprocess the audio
    waveform, sample_rate = torchaudio.load(test_audio_path)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Trim or pad to 20 seconds
    if waveform.shape[1] > 20 * 16000:
        waveform = waveform[:, :20 * 16000]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, 20 * 16000 - waveform.shape[1]))

    waveform = waveform.squeeze().numpy()

    # Prepare input for the model
    test_input = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding="max_length", max_length=int(16000 * 20), truncation=True)
    test_input = {k: v.to('cuda').half() for k, v in test_input.items()}  # Move input to GPU and convert to half precision

    # Get model prediction
    with torch.no_grad():
        output = model(**test_input)
        logits = output.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Get predicted class and probability
    predicted_class_id = logits.argmax().item()
    predicted_label = id_to_label[predicted_class_id]
    predicted_probability = probabilities[0][predicted_class_id].item()

    print(f"\nPredicted singer: {predicted_label}")
    print(f"Confidence: {predicted_probability:.2%}")
    print(f"Actual singer: {test_singer}")
    print("\nTop 3 predictions:")

    # Get top 3 predictions
    top3_prob, top3_indices = torch.topk(probabilities, 3)
    for i in range(3):
        print(f"{id_to_label[top3_indices[0][i].item()]}: {top3_prob[0][i].item():.2%}")

    return predicted_label, test_singer

# Add this to your main code after training and before the multi-sample test
print("\n--- Single Sample Test ---")
predicted_label, actual_label = test_single_sample(classification_model, feature_extractor, audio_path, id_to_label, unique_labels)


--- Single Sample Test ---

Testing on a single sample:
Selected singer: SG
Selected audio file: CLIP-8.mp3

Predicted singer: SG
Confidence: 90.68%
Actual singer: SG

Top 3 predictions:
SG: 90.68%
SRM: 4.72%
SPB: 4.60%


In [None]:
 import json

# Save the label mapping
label_map_path = os.path.join("./trained_audio_classifier", "label_map.json")
with open(label_map_path, "w") as f:
    json.dump(id_to_label, f)

In [None]:
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
import json
import os

# Load your trained model and feature extractor
model_name = "./trained_audio_classifier"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModelForAudioClassification.from_pretrained(model_name)

# Load the label mapping
label_map_path = os.path.join(model_name, "label_map.json")
with open(label_map_path, "r") as f:
    id_to_label = json.load(f)

# Function to preprocess and predict
def predict_singer(audio_file_path):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(audio_file_path)

    # Resample if necessary (assuming your model expects 16kHz)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
        waveform = resampler(waveform)

    # Convert to mono if stereo
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Preprocess the audio
    inputs = feature_extractor(waveform.numpy()[0], sampling_rate=16000, return_tensors="pt")

    # Make prediction
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get the predicted class
    predicted_class_id = logits.argmax().item()
    predicted_singer = id_to_label[str(predicted_class_id)]

    return predicted_singer

# Directory containing the input MP3 files
input_directory = "/content/drive/MyDrive/INPUT"

# Get all MP3 files in the input directory
mp3_files = [f for f in os.listdir(input_directory) if f.endswith('.mp3')]

# Process each MP3 file
for mp3_file in mp3_files:
    audio_file_path = os.path.join(input_directory, mp3_file)
    try:
        predicted_singer = predict_singer(audio_file_path)
        print(f"The predicted singer for {mp3_file} is: {predicted_singer}")
    except Exception as e:
        print(f"Error processing {mp3_file}: {str(e)}")

print("Classification complete.")

The predicted singer for SG.mp3 is: SG
The predicted singer for SPB.mp3 is: SG
The predicted singer for SM.mp3 is: SRM
Classification complete.


In [None]:
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
import json
import os

# Load your trained model and feature extractor
model_name = "./trained_audio_classifier"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModelForAudioClassification.from_pretrained(model_name)

# Load the label mapping
label_map_path = os.path.join(model_name, "label_map.json")
with open(label_map_path, "r") as f:
    id_to_label = json.load(f)

# Function to preprocess and predict
def predict_singer(audio_file_path):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(audio_file_path)

    # Resample if necessary (assuming your model expects 16kHz)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
        waveform = resampler(waveform)

    # Convert to mono if stereo
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Preprocess the audio
    inputs = feature_extractor(waveform.numpy()[0], sampling_rate=16000, return_tensors="pt")

    # Make prediction
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get the predicted class
    predicted_class_id = logits.argmax().item()
    predicted_singer = id_to_label[str(predicted_class_id)]

    return predicted_singer

# Directory containing the input MP3 files
input_directory = "/content/drive/MyDrive/INPUT"

# Get all MP3 files in the input directory
mp3_files = [f for f in os.listdir(input_directory) if f.endswith('.mp3')]

# Ensure we have exactly 9 MP3 files
if len(mp3_files) != 9:
    print(f"Warning: Expected 9 MP3 files, but found {len(mp3_files)} files.")

# Process each MP3 file
for mp3_file in mp3_files:
    audio_file_path = os.path.join(input_directory, mp3_file)
    try:
        predicted_singer = predict_singer(audio_file_path)
        print(f"The predicted singer for {mp3_file} is: {predicted_singer}")
    except Exception as e:
        print(f"Error processing {mp3_file}: {str(e)}")

print("Classification complete.")

The predicted singer for SG.mp3 is: SG
The predicted singer for SPB.mp3 is: SG
The predicted singer for SM.mp3 is: SRM
Classification complete.


In [None]:
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# Load your trained model and feature extractor
model_name = "./trained_audio_classifier"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModelForAudioClassification.from_pretrained(model_name)

# Function to preprocess and predict
def predict_singer(audio_file_path):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(audio_file_path)

    # Resample if necessary (assuming your model expects 16kHz)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
        waveform = resampler(waveform)

    # Convert to mono if stereo
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Preprocess the audio
    inputs = feature_extractor(waveform.numpy()[0], sampling_rate=16000, return_tensors="pt")

    # Make prediction
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get the predicted class
    predicted_class_id = logits.argmax().item()
# Map the class ID to singer name (you'll need to create this mapping)
    singer_names = {0:"SP BALASUBRAMANYUM", 1:"SHANKAR MAHADEVAN", 2:"SHREYA GHOSHAL"}  # Replace with your actual singer names
    predicted_singer = singer_names[predicted_class_id]

    return predicted_singer

# Use the function
audio_file_path = "/content/drive/MyDrive/INPUT/sg3.mp3"
predicted_singer = predict_singer(audio_file_path)
print(f"The predicted singer is: {predicted_singer}")

RuntimeError: Failed to open the input "/content/drive/MyDrive/INPUT/sg3.mp3" (No such file or directory).
Exception raised from get_input_format_context at /__w/audio/audio/pytorch/audio/src/libtorio/ffmpeg/stream_reader/stream_reader.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7b99abecf897 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7b99abe7fb25 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #2: <unknown function> + 0x42334 (0x7b99aaacb334 in /usr/local/lib/python3.10/dist-packages/torio/lib/libtorio_ffmpeg4.so)
frame #3: torio::io::StreamingMediaDecoder::StreamingMediaDecoder(std::string const&, std::optional<std::string> const&, std::optional<std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > > const&) + 0x14 (0x7b99aaacdd34 in /usr/local/lib/python3.10/dist-packages/torio/lib/libtorio_ffmpeg4.so)
frame #4: <unknown function> + 0x3aa4e (0x7b98e2c83a4e in /usr/local/lib/python3.10/dist-packages/torio/lib/_torio_ffmpeg4.so)
frame #5: <unknown function> + 0x32617 (0x7b98e2c7b617 in /usr/local/lib/python3.10/dist-packages/torio/lib/_torio_ffmpeg4.so)
frame #6: <unknown function> + 0x15ac9e (0x59ae9ffb6c9e in /usr/bin/python3)
frame #7: _PyObject_MakeTpCall + 0x25b (0x59ae9ffad3cb in /usr/bin/python3)
frame #8: <unknown function> + 0x169540 (0x59ae9ffc5540 in /usr/bin/python3)
frame #9: <unknown function> + 0x165c87 (0x59ae9ffc1c87 in /usr/bin/python3)
frame #10: <unknown function> + 0x15177b (0x59ae9ffad77b in /usr/bin/python3)
frame #11: <unknown function> + 0xf6cb (0x7b99cc0296cb in /usr/local/lib/python3.10/dist-packages/torchaudio/lib/_torchaudio.so)
frame #12: _PyObject_MakeTpCall + 0x25b (0x59ae9ffad3cb in /usr/bin/python3)
frame #13: _PyEval_EvalFrameDefault + 0x6e5b (0x59ae9ffa5fab in /usr/bin/python3)
frame #14: _PyObject_FastCallDictTstate + 0xc4 (0x59ae9ffac564 in /usr/bin/python3)
frame #15: <unknown function> + 0x165664 (0x59ae9ffc1664 in /usr/bin/python3)
frame #16: _PyObject_MakeTpCall + 0x1fc (0x59ae9ffad36c in /usr/bin/python3)
frame #17: _PyEval_EvalFrameDefault + 0x6e5b (0x59ae9ffa5fab in /usr/bin/python3)
frame #18: _PyFunction_Vectorcall + 0x7c (0x59ae9ffb759c in /usr/bin/python3)
frame #19: _PyEval_EvalFrameDefault + 0x6d7 (0x59ae9ff9f827 in /usr/bin/python3)
frame #20: _PyFunction_Vectorcall + 0x7c (0x59ae9ffb759c in /usr/bin/python3)
frame #21: _PyEval_EvalFrameDefault + 0x644a (0x59ae9ffa559a in /usr/bin/python3)
frame #22: _PyFunction_Vectorcall + 0x7c (0x59ae9ffb759c in /usr/bin/python3)
frame #23: _PyEval_EvalFrameDefault + 0x644a (0x59ae9ffa559a in /usr/bin/python3)
frame #24: _PyFunction_Vectorcall + 0x7c (0x59ae9ffb759c in /usr/bin/python3)
frame #25: _PyEval_EvalFrameDefault + 0x6d7 (0x59ae9ff9f827 in /usr/bin/python3)
frame #26: <unknown function> + 0x13ff96 (0x59ae9ff9bf96 in /usr/bin/python3)
frame #27: PyEval_EvalCode + 0x86 (0x59aea0091c66 in /usr/bin/python3)
frame #28: <unknown function> + 0x23b81d (0x59aea009781d in /usr/bin/python3)
frame #29: <unknown function> + 0x15b7f9 (0x59ae9ffb77f9 in /usr/bin/python3)
frame #30: _PyEval_EvalFrameDefault + 0x6d7 (0x59ae9ff9f827 in /usr/bin/python3)
frame #31: <unknown function> + 0x178890 (0x59ae9ffd4890 in /usr/bin/python3)
frame #32: _PyEval_EvalFrameDefault + 0x286f (0x59ae9ffa19bf in /usr/bin/python3)
frame #33: <unknown function> + 0x178890 (0x59ae9ffd4890 in /usr/bin/python3)
frame #34: _PyEval_EvalFrameDefault + 0x286f (0x59ae9ffa19bf in /usr/bin/python3)
frame #35: <unknown function> + 0x178890 (0x59ae9ffd4890 in /usr/bin/python3)
frame #36: <unknown function> + 0x25619f (0x59aea00b219f in /usr/bin/python3)
frame #37: <unknown function> + 0x166eca (0x59ae9ffc2eca in /usr/bin/python3)
frame #38: _PyEval_EvalFrameDefault + 0x81e (0x59ae9ff9f96e in /usr/bin/python3)
frame #39: _PyFunction_Vectorcall + 0x7c (0x59ae9ffb759c in /usr/bin/python3)
frame #40: _PyEval_EvalFrameDefault + 0x6d7 (0x59ae9ff9f827 in /usr/bin/python3)
frame #41: _PyFunction_Vectorcall + 0x7c (0x59ae9ffb759c in /usr/bin/python3)
frame #42: _PyEval_EvalFrameDefault + 0x81e (0x59ae9ff9f96e in /usr/bin/python3)
frame #43: <unknown function> + 0x169111 (0x59ae9ffc5111 in /usr/bin/python3)
frame #44: PyObject_Call + 0x122 (0x59ae9ffc5db2 in /usr/bin/python3)
frame #45: _PyEval_EvalFrameDefault + 0x294d (0x59ae9ffa1a9d in /usr/bin/python3)
frame #46: <unknown function> + 0x169111 (0x59ae9ffc5111 in /usr/bin/python3)
frame #47: _PyEval_EvalFrameDefault + 0x1a27 (0x59ae9ffa0b77 in /usr/bin/python3)
frame #48: <unknown function> + 0x2015e5 (0x59aea005d5e5 in /usr/bin/python3)
frame #49: <unknown function> + 0x15b7f9 (0x59ae9ffb77f9 in /usr/bin/python3)
frame #50: <unknown function> + 0x2375b5 (0x59aea00935b5 in /usr/bin/python3)
frame #51: <unknown function> + 0x2b4142 (0x59aea0110142 in /usr/bin/python3)
frame #52: <unknown function> + 0x14e2eb (0x59ae9ffaa2eb in /usr/bin/python3)
frame #53: _PyEval_EvalFrameDefault + 0x6d7 (0x59ae9ff9f827 in /usr/bin/python3)
frame #54: _PyFunction_Vectorcall + 0x7c (0x59ae9ffb759c in /usr/bin/python3)
frame #55: _PyEval_EvalFrameDefault + 0x81e (0x59ae9ff9f96e in /usr/bin/python3)
frame #56: <unknown function> + 0x2015e5 (0x59aea005d5e5 in /usr/bin/python3)
frame #57: <unknown function> + 0x15b7f9 (0x59ae9ffb77f9 in /usr/bin/python3)
frame #58: <unknown function> + 0x2375b5 (0x59aea00935b5 in /usr/bin/python3)
frame #59: <unknown function> + 0x2b4142 (0x59aea0110142 in /usr/bin/python3)
frame #60: <unknown function> + 0x14e2eb (0x59ae9ffaa2eb in /usr/bin/python3)
frame #61: _PyEval_EvalFrameDefault + 0x6d7 (0x59ae9ff9f827 in /usr/bin/python3)
frame #62: <unknown function> + 0x169111 (0x59ae9ffc5111 in /usr/bin/python3)
frame #63: _PyEval_EvalFrameDefault + 0x6d7 (0x59ae9ff9f827 in /usr/bin/python3)


In [None]:
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
import os
import torch.nn.functional as F

# Load the trained model and feature extractor
model_name = "./trained_audio_classifier"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModelForAudioClassification.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def predict_singer(audio_file_path, confidence_threshold=0.5):
    try:
        # Load the audio file
        waveform, sample_rate = torchaudio.load(audio_file_path)

        # Resample if necessary (assuming 16kHz is required)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)

        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Normalize
        waveform = waveform / torch.max(torch.abs(waveform))

        # Preprocess the audio
        inputs = feature_extractor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)

        # Move inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Make prediction
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits

        # Apply softmax to get probabilities
        probs = F.softmax(logits, dim=1)

        # Get the predicted class and its probability
        predicted_class_id = logits.argmax().item()
        predicted_prob = probs[0][predicted_class_id].item()

        # Map the class ID to singer name
        id_to_label = {0: "SP BALASUBRAMANYUM", 1: "SHANKAR MAHADEVAN", 2: "SHREYA GHOSHAL"}
        predicted_singer = id_to_label[predicted_class_id]

        # Print probabilities for all classes
        print(f"Probabilities for {os.path.basename(audio_file_path)}:")
        for i, prob in enumerate(probs[0]):
            print(f"{id_to_label[i]}: {prob.item():.4f}")

        # Check if the prediction meets the confidence threshold
        if predicted_prob >= confidence_threshold:
            return predicted_singer, predicted_prob
        else:
            return "Uncertain", predicted_prob

    except Exception as e:
        print(f"Error processing {audio_file_path}: {str(e)}")
        return None, None

# Process all audio files in the input directory
input_directory = "/content/drive/MyDrive/INPUT"

# List to store valid audio files
valid_audio_files = []

# Collect valid audio files
for filename in os.listdir(input_directory):
    if filename.endswith((".mp3", ".wav", ".ogg")):  # Add or remove file extensions as needed
        valid_audio_files.append(filename)

# Ensure we process only 9 files
num_files_to_process = min(9, len(valid_audio_files))

print(f"Processing {num_files_to_process} audio files:")

for i in range(num_files_to_process):
    filename = valid_audio_files[i]
    audio_file_path = os.path.join(input_directory, filename)
    predicted_singer, confidence = predict_singer(audio_file_path)
    if predicted_singer:
        print(f"The predicted singer for {filename} is: {predicted_singer} (Confidence: {confidence:.4f})")
    print()  # Add a blank line for readability

print("\nPrediction completed for all audio files.")

Processing 3 audio files:
Probabilities for SG.mp3:
SP BALASUBRAMANYUM: 0.9072
SHANKAR MAHADEVAN: 0.0450
SHREYA GHOSHAL: 0.0478
The predicted singer for SG.mp3 is: SP BALASUBRAMANYUM (Confidence: 0.9072)

Probabilities for SPB.mp3:
SP BALASUBRAMANYUM: 0.8574
SHANKAR MAHADEVAN: 0.0609
SHREYA GHOSHAL: 0.0817
The predicted singer for SPB.mp3 is: SP BALASUBRAMANYUM (Confidence: 0.8574)

Probabilities for SM.mp3:
SP BALASUBRAMANYUM: 0.0533
SHANKAR MAHADEVAN: 0.0528
SHREYA GHOSHAL: 0.8938
The predicted singer for SM.mp3 is: SHREYA GHOSHAL (Confidence: 0.8938)


Prediction completed for all audio files.
