# Swaras Extraction and Evaluation from Audio 🎶🎼
This notebook focuses on extracting swaras from an audio file and evaluating the accuracy of the extraction process.

**Key Components:**

**Dominant Frequency Extraction:**

Splits the audio into segments of a given window size.

Identifies the most prominent frequency in each segment.

**Swaras Conversion:**

Converts dominant frequencies into musical notes.

Maps the notes to Indian classical (Carnatic) swaras using librosa's `note_to_svara_c()`.

**Filtering Conflicting Swaras:**

Resolves cases where multiple variants of a swara (e.g., R1 vs. R2) appear.

Retains only the most frequent variant.

**Evaluation Metrics:**

Compares extracted swaras with actual aarohanam (ascending scale).

Computes Precision, Recall, F1 Score, and Accuracy (defined here as TP / (TP + FP + FN) over the sets of swaras).

This notebook provides a structured approach to analyzing Indian classical music using signal processing and machine learning techniques.

In [None]:
import librosa
import numpy as np
from librosa import hz_to_note,note_to_svara_c
from scipy.io import wavfile

In [None]:
# Mount Google Drive at /content/drive (Colab-only import) so that the audio
# file path used later in this notebook, which lives under that directory,
# can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import librosa
from collections import Counter

# Function to extract dominant swara frequencies from audio for each segment of specified window size
def extract_dominant_swara_frequencies(file_path, window_size):
    """Return the dominant frequency of each consecutive window of an audio file.

    The audio is loaded at its native sampling rate and cut into
    back-to-back windows of ``window_size`` seconds (the last window may be
    shorter). For every window the STFT magnitude is averaged over time and
    the frequency of the strongest bin is taken as the dominant frequency.
    Windows whose strongest bin is the 0 Hz bin are skipped.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        window_size: Window length in seconds; must be a positive, finite
            number that spans at least one sample at the file's sampling rate.

    Returns:
        A list with one dominant frequency (in Hz) per retained window, in
        the order the windows occur in the file.

    Raises:
        ValueError: If ``window_size`` is not positive and finite, or is so
            small that a window would contain no samples.
    """
    # Reject nonsensical window sizes up front, before any file I/O happens.
    if not (np.isfinite(window_size) and window_size > 0):
        raise ValueError(
            f"window_size must be a positive, finite number of seconds, got {window_size!r}"
        )

    # Load the audio file at its native sampling rate
    y, sr = librosa.load(file_path, sr=None)

    # Number of samples per window; a zero step would make range() fail below
    samples_per_window = int(window_size * sr)
    if samples_per_window < 1:
        raise ValueError(
            f"window_size={window_size!r} s is shorter than one sample at {sr} Hz"
        )

    # One FFT size shared by the STFT and the bin-frequency table so that bin
    # indices and frequencies always line up (2048 is librosa's default).
    n_fft = 2048

    # The bin -> frequency mapping depends only on sr and n_fft, so it is
    # computed once instead of once per window.
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    dominant_frequencies_per_window = []

    # Walk through the signal in non-overlapping windows. Every start index is
    # < len(y), so each segment holds at least one sample.
    for start in range(0, len(y), samples_per_window):
        segment = y[start:start + samples_per_window]

        # Magnitude spectrogram of the window, averaged over its STFT frames
        stft = np.abs(librosa.stft(segment, n_fft=n_fft))
        avg_stft = np.mean(stft, axis=1)

        # Neutralise NaN/inf so they can never win the argmax
        avg_stft = np.nan_to_num(avg_stft, nan=0.0, posinf=0.0, neginf=0.0)

        # Frequency of the bin with the highest average magnitude
        dominant_freq = freqs[np.argmax(avg_stft)]

        # Debugging: Print dominant frequency to ensure it's not inf or NaN
        print(f"Calculated dominant frequency: {dominant_freq}")

        # Keep only usable frequencies; 0 Hz (e.g. a silent window) has no note
        if np.isfinite(dominant_freq) and dominant_freq > 0:
            dominant_frequencies_per_window.append(dominant_freq)
        else:
            print(f"Skipping invalid dominant frequency: {dominant_freq}")

    return dominant_frequencies_per_window

# Wrapper around librosa.note_to_svara_c that supplies this notebook's default tonic and melakarta
def note_to_svara_c(notes, Sa='C3', mela=16):
    """Convert Western note names to Carnatic svara names via librosa.

    Args:
        notes: A note name or list of note names (e.g. the output of
            ``librosa.hz_to_note``).
        Sa: Note used as the tonic (Sa). Defaults to ``'C3'``.
        mela: Melakarta raga forwarded to ``librosa.note_to_svara_c``.
            Defaults to ``16``.

    Returns:
        Whatever ``librosa.note_to_svara_c`` returns for the given notes.
    """
    return librosa.note_to_svara_c(notes, Sa=Sa, mela=mela)

# Resolve conflicting swara variants (e.g., R1 vs R2): one winner per base letter
def filter_swaras(swara_list):
    """Keep, for each base swara, only its most frequent variant.

    Swaras are grouped by their first character (so ``R1`` and ``R2`` both
    belong to ``R``). Within a group the variant that occurs most often in
    ``swara_list`` wins; on a tie the variant seen first wins.

    Args:
        swara_list: Sequence of swara names, possibly with repeats.

    Returns:
        A list with one swara per base letter, ordered by the first
        appearance of each base letter in ``swara_list``.
    """
    tally = Counter(swara_list)

    # Maps base letter -> current best variant. Dict insertion order gives the
    # first-appearance ordering of the base letters.
    winners = {}
    for variant, occurrences in tally.items():
        family = variant[0]
        # Strict '>' keeps the earliest variant when counts are equal.
        if family not in winners or occurrences > tally[winners[family]]:
            winners[family] = variant

    return list(winners.values())

# Merge function to extract dominant frequencies and convert to swaras
def extract_swaras_from_audio(file_path, window_size):
    """Extract the set of swaras present in an audio file.

    Pipeline: dominant frequency per window -> Western note name -> svara
    (via ``note_to_svara_c``) -> one variant per base swara (via
    ``filter_swaras``). Progress and per-segment results are printed.

    Args:
        file_path: Path of the audio file to analyse.
        window_size: Analysis window length in seconds.

    Returns:
        The filtered list of swaras, so the result can be passed on (for
        example to an evaluation function). An empty list is returned if
        the note or svara conversion fails; the error is printed.
    """
    # Step 1: Extract dominant swara frequencies for each segment of the specified window size
    dominant_swara_freqs = extract_dominant_swara_frequencies(file_path, window_size)

    # Step 2: Filter out any invalid frequencies (NaN, inf, non-positive)
    valid_frequencies = [freq for freq in dominant_swara_freqs if np.isfinite(freq) and freq > 0]

    # Debugging: Print the valid frequencies
    print(f"Valid dominant frequencies: {valid_frequencies}")

    # Step 3: Convert each frequency to its corresponding musical note.
    # The try body is limited to the conversion itself.
    try:
        notes = [librosa.hz_to_note(freq) for freq in valid_frequencies]
    except Exception as e:
        print(f"Error in hz_to_note conversion: {e}")
        return []
    print(notes)

    # Step 4: Apply note_to_svara_c to the list of notes to get swaras
    try:
        svara_notes = note_to_svara_c(notes)
    except Exception as e:
        print(f"Error in note_to_svara_c: {e}")
        return []

    # Collect all swaras (dominant frequency -> note -> svara)
    all_swaras = []
    for segment, (freq, note, svara) in enumerate(zip(valid_frequencies, notes, svara_notes)):
        print(f"Segment {segment + 1}: Frequency -> {freq:.2f} Hz -> Note: {note} -> Svara: {svara}")
        all_swaras.append(svara)

    # Step 5: Filter out conflicting swaras (e.g., R1 vs R2) and retain only the most frequent variant
    final_swaras = filter_swaras(all_swaras)

    # Print the final list of swaras
    print(f"Final Swaras: {final_swaras}")

    # Hand the result back to the caller instead of only printing it
    return final_swaras

# Example usage: analyse one recording stored under the /content/drive mount point
audio_file_path = '/content/drive/MyDrive/keys_songs/vasantha/vasa 50 C6.mp3'

# The window size (in seconds) is read interactively; float() raises
# ValueError if the entered text is not a number.
window_size = float(input("Enter the window size in seconds (e.g., 0.5, 1, 2): "))
extract_swaras_from_audio(audio_file_path, window_size)


Enter the window size in seconds (e.g., 0.5, 1, 2): 0.1
Calculated dominant frequency: 1055.126953125
Calculated dominant frequency: 1055.126953125
Calculated dominant frequency: 1055.126953125
Calculated dominant frequency: 4242.041015625
Calculated dominant frequency: 1055.126953125
Calculated dominant frequency: 1055.126953125
Calculated dominant frequency: 2110.25390625
Calculated dominant frequency: 1055.126953125
Calculated dominant frequency: 1055.126953125
Calculated dominant frequency: 1055.126953125
Calculated dominant frequency: 1055.126953125
Calculated dominant frequency: 1055.126953125
Calculated dominant frequency: 1313.525390625
Calculated dominant frequency: 1313.525390625
Calculated dominant frequency: 1313.525390625
Calculated dominant frequency: 1313.525390625
Calculated dominant frequency: 1313.525390625
Calculated dominant frequency: 1313.525390625
Calculated dominant frequency: 1313.525390625
Calculated dominant frequency: 1313.525390625
Calculated dominant frequ

In [None]:
def calculate_metrics(actual_swaras, extracted_swaras):
    """Score extracted swaras against the reference swaras, treating both as sets.

    Duplicates and ordering are ignored. With TP = swaras in both sets,
    FP = extracted but not actual, FN = actual but not extracted:

        precision = TP / (TP + FP)
        recall    = TP / (TP + FN)
        f1_score  = 2 * precision * recall / (precision + recall)
        accuracy  = TP / (TP + FP + FN)

    Any metric whose denominator is zero is reported as 0.

    Args:
        actual_swaras: Iterable of the reference swaras.
        extracted_swaras: Iterable of the swaras produced by the extraction.

    Returns:
        A tuple ``(precision, recall, f1_score, accuracy)``.
    """
    expected = set(actual_swaras)
    found = set(extracted_swaras)

    hits = len(expected & found)        # true positives
    spurious = len(found - expected)    # false positives
    missed = len(expected - found)      # false negatives

    def ratio(numerator, denominator):
        # Guarded division: an empty denominator yields 0 rather than an error.
        if denominator > 0:
            return numerator / denominator
        return 0

    precision = ratio(hits, hits + spurious)
    recall = ratio(hits, hits + missed)
    f1_score = ratio(2 * precision * recall, precision + recall)
    accuracy = ratio(hits, hits + spurious + missed)

    return precision, recall, f1_score, accuracy

# Example usage: every actual swara is present in the extracted list
# (TP = 5, FN = 0) and 'P' and 'N2' are extra (FP = 2), so precision and
# accuracy are 5/7 and recall is 1.
actual_aarohanam = ['S','R2', 'G2', 'M1','D2']
extracted_aarohanam = ['S','R2','G2', 'M1', 'P', 'D2', 'N2']

precision, recall, f1_score, accuracy = calculate_metrics(actual_aarohanam, extracted_aarohanam)

# Report each metric rounded to two decimal places
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")
print(f"Accuracy: {accuracy:.2f}")


Precision: 0.71
Recall: 1.00
F1 Score: 0.83
Accuracy: 0.71
