In [None]:
##Import Libraries
#For Dataframe & Excel handling
import openpyxl
import pandas as pd

#For Array manipulations & Audio Processing
import numpy as np

#For Audio Processing
import librosa
import soundfile as sf

#For File Handling
import os

#Custom Helper Functions
from helpers import *

## Sruthi Extraction & Standardization Pipeline

In [None]:
# Run the sruthi extraction/standardization flow through the single helper
# entry point and keep its result for the cells that follow.
# NOTE(review): presumably this wraps the same steps that the
# "Pipeline Individual Executions" cell runs one by one - confirm in helpers.
clipped_df_std_sruthi_merged = run_sruthi_standardization_pipeline()
# clipped_df_std_sruthi_merged.head(50)

## Sruthi Extraction & Standardization - Pipeline Individual Executions

In [None]:
# Step-by-step version of the pipeline; every function below is a star-imported
# helper, so the descriptions are inferred from names and call shapes only.
# NOTE(review): this cell rebinds `audio_path` and `clipped_df_std_sruthi_merged`,
# both of which are also used/assigned by other cells in this notebook.

# 1. Collect the inputs to process (presumably a collection of file paths - TODO confirm).
audio_path = get_audio_file_paths()
# 2. From those inputs obtain the audio arrays, their sampling rates and the
#    detected sruthi expressed as Hz, MIDI number and note name.
audio_arrays, sampling_rates, original_sruthi_frequency_hz, original_sruthi_frequency_midi, original_sruthi_note = get_sruthi_info(audio_path)
# 3. Produce sruthi-standardized audio plus the standard sruthi as Hz, MIDI and note
#    (presumably a pitch shift to a common tonic - confirm in helpers).
audios_standardized_sruthi, std_sruthi_freq_hz, std_sruthi_freq_midi, std_sruthi_note = standardize_sruthi(audio_arrays, sampling_rates, original_sruthi_frequency_hz)
# 4. (Disabled) write the standardized audio to disk; `written_paths` is therefore
#    not defined by this cell.
# written_paths = save_standardized_audios(audio_path, audios_standardized_sruthi, sampling_rates, out_root="sruthi_standardized_audios", in_root="dataset", suffix="_standard_sruthi")
# 5. Build the base table from the original and standardized sruthi details.
base_dataframe_std_sruthi = create_base_dataframe(audio_path, original_sruthi_note, original_sruthi_frequency_hz, original_sruthi_frequency_midi, std_sruthi_note, std_sruthi_freq_hz, std_sruthi_freq_midi, sampling_rates)
# 6. Cut the standardized audio into clips, requesting 30 seconds per clip.
clipped_df = split_audio_into_clips(audio_path, audios_standardized_sruthi, sampling_rates, clip_duration_sec=30)
# 7. Combine the clip table with the base table.
clipped_df_std_sruthi_merged = merge_details_to_clip(clipped_df, base_dataframe_std_sruthi)



In [8]:
# Sanity check: report how many entries each pipeline output holds so that a
# mismatch between the parallel outputs is easy to spot.
print(
    f"Length of audio_paths: {len(audio_path)}",
    f"Length of audio_arrays: {len(audio_arrays)}",
    f"Length of sampling_rates: {len(sampling_rates)}",
    f"Length of original_sruthi_frequency_hz: {len(original_sruthi_frequency_hz)}",
    f"Length of original_sruthi_frequency_midi: {len(original_sruthi_frequency_midi)}",
    f"Length of original_sruthi_note: {len(original_sruthi_note)}",
    f"Length of audios_standardized_sruthi: {len(audios_standardized_sruthi)}",
    # Disabled together with the save step:
    # f"Length of written_paths: {len(written_paths)}",
    f"Length of base_dataframe_std_sruthi: {len(base_dataframe_std_sruthi)}",
    sep="\n",
)

Length of audio_paths: 41
Length of audio_arrays: 41
Length of sampling_rates: 41
Length of original_sruthi_frequency_hz: 41
Length of original_sruthi_frequency_midi: 41
Length of original_sruthi_note: 41
Length of audios_standardized_sruthi: 41
Length of base_dataframe_std_sruthi: 41


## Initial Prototype for Sruthi identification & Standardization

### Sruthi Identification

In [None]:
# Step 1:
# Load one recording at its native sampling rate and inspect the decoded signal.

# `audio_path` is rebound by the pipeline cell above to the result of
# get_audio_file_paths() (the length check there reports 41 entries), whereas
# librosa.load() expects a single path or file object. Use the value as-is when
# it already is a single path; otherwise prototype on the first entry.
# NOTE(review): assumes the collection supports [0] indexing - confirm in helpers.
prototype_audio_path = (
    audio_path if isinstance(audio_path, (str, os.PathLike)) else audio_path[0]
)

# sr=None keeps the file's own sampling rate instead of resampling on load
y, sr = librosa.load(prototype_audio_path, sr=None)

# Summarize what was loaded
print(f"Default Sampling Rate: {sr}")
print(f"Audio Data Type: {type(y)}")
print(f"Audio Data Shape: {y.shape}")
print(f"Audio Duration (seconds): {librosa.get_duration(y=y, sr=sr)}")
print(f"First 10 samples of audio data: {y[:10]}")
print(y)

Default Sampling Rate: 44100
Audio Data Type: <class 'numpy.ndarray'>
Audio Data Shape: (44756352,)
Audio Duration (seconds): 1014.8832653061224
First 10 samples of audio data: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


In [3]:
# Number of samples in the loaded signal (length along the first axis)
y.shape[0]

44756352

In [4]:
# Step 2: Identify the Sruthi (Fundamental Frequency)
# Pitch candidates come from piptrack (Harmonic Product Spectrum would be an
# alternative); it yields two arrays: candidate frequencies and the magnitude
# associated with each of them.
pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)

# Show both arrays followed by their shapes
print(
    f"pitches:{pitches}",
    f"magnitudes:{magnitudes}",
    f"pitches shape: {pitches.shape}",
    f"magnitudes shape: {magnitudes.shape}",
    sep="\n",
)

pitches:[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
magnitudes:[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
pitches shape: (1025, 87415)
magnitudes shape: (1025, 87415)


In [5]:
# Find the strongest entry of the magnitude matrix; argmax reports a position
# in the flattened array ...
index = np.argmax(magnitudes)

# ... so translate that flat position back into (row, col) coordinates
row, col = np.unravel_index(index, magnitudes.shape)

# The pitch stored at the same coordinates is taken as the fundamental frequency (F0)
f0 = pitches[row, col]

print(f"Identified Fundamental Frequency (F0): {f0} Hz")

Identified Fundamental Frequency (F0): 606.0250854492188 Hz


In [6]:
# Optional: express F0 as a (fractional) MIDI note number, which makes pitch
# comparisons easier than working in Hz
tonic_note_midi = librosa.hz_to_midi(frequencies=f0)
print(f"Detected Tonic Note (MIDI): {tonic_note_midi}")

Detected Tonic Note (MIDI): 74.54248760640525


In [7]:
# Name the detected tonic as a musical note
sruthi_note = librosa.midi_to_note(midi=tonic_note_midi)

# Recap the tonic in all three representations
print(
    f"Fundamental Frequency (F0): {f0} Hz",
    f"Corresponding MIDI Note: {tonic_note_midi}",
    f"Musical Pitch Note: {sruthi_note}",
    sep="\n",
)

Fundamental Frequency (F0): 606.0250854492188 Hz
Corresponding MIDI Note: 74.54248760640525
Musical Pitch Note: D♯5


### Sruthi Standardization

In [8]:
# Step 3: Standardizing the Sruthi to C4
desired_sruthi_freq = 261.63  # Frequency of C4 in Hz

# Semitone distance from the detected tonic to the target: both frequencies are
# converted to MIDI note numbers and subtracted
pitch_shift_steps = (
    librosa.hz_to_midi(desired_sruthi_freq) - librosa.hz_to_midi(f0)
)

# Shift the whole recording by that many semitones so F0 lands on the desired tonic
y_shifted = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=pitch_shift_steps)

# Report the applied shift
print(f"Pitch shift steps: {pitch_shift_steps} semitones")



Pitch shift steps: -14.542194155055263 semitones


### Writing Audio to directory

In [6]:
# Write the pitch-shifted signal to a WAV file with soundfile, keeping the
# sampling rate of the loaded audio
sf.write(
    file='ena_gaanu_shifted_audio_to_C4.wav',
    data=y_shifted,
    samplerate=sr,
)

NameError: name 'y_shifted' is not defined

### Splitting audio

In [10]:
# Function to split the audio into 30-second clips
def split_audio(audio, sr, clip_duration=30):
    """Split ``audio`` into consecutive, non-overlapping clips of equal length.

    Parameters
    ----------
    audio : sequence or numpy.ndarray
        Audio samples; sliced along the first axis.
    sr : int or float
        Sampling rate in samples per second.
    clip_duration : int or float, default 30
        Length of each clip in seconds. Fractional durations (and float
        sampling rates) are supported; the clip length is rounded to the
        nearest whole number of samples.

    Returns
    -------
    list
        The full-length clips in order. A trailing remainder shorter than one
        clip is discarded, so the list is empty when ``audio`` is shorter than
        a single clip.

    Raises
    ------
    ValueError
        If ``clip_duration * sr`` resolves to less than one sample.
    """
    # Slice bounds must be integers, so normalize float products up front
    samples_per_clip = int(round(clip_duration * sr))
    if samples_per_clip < 1:
        raise ValueError(
            "clip_duration * sr must be at least one sample, "
            f"got clip_duration={clip_duration!r}, sr={sr!r}"
        )

    # Only whole clips are kept; floor division drops the partial tail
    full_clip_count = len(audio) // samples_per_clip

    return [
        audio[i * samples_per_clip:(i + 1) * samples_per_clip]
        for i in range(full_clip_count)
    ]

In [12]:
# Cut the pitch-shifted recording into clips, relying on split_audio's default
# clip duration of 30 seconds
clips = split_audio(audio=y_shifted, sr=sr)