# Extract pitch information from audio

## Load Audio

In [142]:
import librosa
from IPython.display import Audio

y, sr = librosa.load('/home/willie/Projects/Mark_E_Markov/v2/audio/c_scale.wav')

Audio(data=y, rate=sr)

## Remove quiet frames

In [143]:
import numpy

y_silenced = numpy.where(y < -0.5, -1, y)
Audio(data=y_silenced, rate=sr)

## Low Pass Filter

In [144]:
import scipy

low_pass_filter = scipy.signal.butter(4, librosa.note_to_hz("A4"), btype="low", fs=sr, output="sos")
y_filtered = scipy.signal.sosfiltfilt(low_pass_filter, y)
Audio(data=y_filtered, rate=sr)


## Pitch Shift

In [145]:
y_shifted = librosa.effects.pitch_shift(y_filtered, sr=sr, n_steps=12)
Audio(data=y_shifted, rate=sr)

## f0

In [146]:
f0, voiced_flag, voiced_probs = librosa.pyin(
    y_shifted,
    sr=sr,
    fmin=librosa.note_to_hz("E2"),
    fmax=librosa.note_to_hz("A4"),
    fill_na=None,
)

# Return 0 for frames that are not voiced
f0_voiced = numpy.where(voiced_flag == False, 0, f0)
print(f0_voiced)

[  0.           0.           0.           0.           0.
   0.           0.           0.           0.           0.
   0.           0.           0.           0.           0.
   0.           0.           0.           0.           0.
   0.           0.           0.           0.           0.
   0.           0.         125.62905732 125.62905732 127.08879666
 127.82501457 127.82501457 127.82501457 128.56549734 129.31026969
 129.31026969 129.31026969 129.31026969 129.31026969 129.31026969
 129.31026969 129.31026969 129.31026969 129.31026969 129.31026969
 129.31026969 129.31026969 129.31026969 129.31026969 129.31026969
 129.31026969 129.31026969 129.31026969 129.31026969 129.31026969
 129.31026969 128.56549734 128.56549734 128.56549734 127.82501457
 127.82501457 127.82501457 140.20166901 145.14587019 145.98669166
 146.83238396 146.83238396 147.68297531 147.68297531 146.83238396
 145.98669166 146.83238396 146.83238396 146.83238396 146.83238396
 146.83238396 146.83238396 146.83238396 146.832383

In [147]:
def quarter_step_above(freq):
    return freq * 2 ** (1 / 24)


def quarter_step_below(freq):
    return freq / 2 ** (1 / 24)

In [None]:
import numpy

MIN_NOTE_LENGTH = 1
notes = []
current_note_frames = []

for frame in f0:
    current_note = None
    if len(current_note_frames) > 0 and None not in current_note_frames:
        current_note = sum(current_note_frames) / len(current_note_frames)
    
    # frame is a note
    if frame > 0:
        # lower frame by an octave
        frame = frame / 2
        # current_note is a float
        if isinstance(current_note, numpy.floating):
            upper_limit = quarter_step_above(current_note)
            lower_limit = quarter_step_below(current_note)

            if frame >= lower_limit and frame <= upper_limit:
                current_note_frames.append(frame)
            else:
                if len(current_note_frames) > MIN_NOTE_LENGTH:
                    note = librosa.hz_to_note(current_note)
                    notes.append(note)
                current_note_frames = [frame]
        else:
            if len(current_note_frames) > MIN_NOTE_LENGTH:
                notes.append(None)
            current_note_frames = [frame]
    else:
        if current_note is float:
            if len(current_note_frames) > MIN_NOTE_LENGTH:
                note = librosa.hz_to_note(current_note)
                notes.append(note)
            current_note_frames = [None]
        else:
            current_note_frames.append(None)

# cleanup
last_note = None
if len(current_note_frames) > 0 and None not in current_note_frames:
    last_note = sum(current_note_frames) / len(current_note_frames)

if last_note is not None and len(current_note_frames) > MIN_NOTE_LENGTH:
    note = librosa.hz_to_note(last_note)
    notes.append(note)
elif len(current_note_frames) > MIN_NOTE_LENGTH:
    notes.append(None)

print(notes)
            


['E1', 'C2', 'D2', 'C2', 'E2', 'F♯2', 'F2', 'G2', 'A2', 'B2', 'C3', 'C2']


In [149]:
def frequency_to_note(frequency):
    if frequency == 0:
        return None
    else:
        return librosa.hz_to_note(frequency)
# filtered_voiced_f0 = [f for f in f0_voiced if f > 0]
notes = list(map(frequency_to_note, f0_voiced))
counted_notes = []
current_note_pointer = notes[0]
current_note_count = 1
for note in notes[1:]:
    if note == current_note_pointer:
        current_note_count += 1
    else:
        counted_notes.append((current_note_pointer, current_note_count))
        current_note_pointer = note
        current_note_count = 1
print(counted_notes)
limited_notes = [n[0] for n in counted_notes if n[1] > 20]
print(limited_notes)


[(None, 27), ('B2', 2), ('C3', 33), ('C♯3', 1), ('D3', 24), ('D♯3', 2), ('C3', 2), (None, 1), ('F3', 1), ('E3', 25), (None, 2), ('G3', 1), ('F♯3', 2), ('F3', 27), (None, 3), ('G3', 28), ('G♯3', 1), ('A3', 32), ('A♯3', 1), ('B3', 32), ('C4', 101), (None, 2)]
[None, 'C3', 'D3', 'E3', 'F3', 'G3', 'A3', 'B3', 'C4']
