# Basic Note Detection

Author: Adam Wiraszka  
Date Created: 2024-03-14  

A simple Python notebook for detecting and visualizing the notes in a short audio file (best suited to monophonic audio).  

Includes:
- Converting frequencies to approximate notes.  
- Plotting the frequency spectrum over time at regular time intervals (Hann window).   
- Using ffmpeg to create an MP4 video showing the frequency spectrum over time with labeled notes.  

Code adapted from the following sources:  
https://github.com/jeffheaton/present/blob/master/youtube/video/fft-frequency.ipynb  
https://newt.phys.unsw.edu.au/jw/notes.html  
https://www.youtube.com/watch?v=rj9NOiFLxWA

**Set path and reset image dump folder**

In [1]:
from pathlib import Path
import os, glob

# Working directory of the notebook; other cells resolve files against it.
PATH = Path.cwd()

# Folder that receives the rendered frame images; create it if it is missing.
f = PATH.joinpath("content")
f.mkdir(exist_ok=True)

# Delete image files left over from a previous run. Only PNG files are
# removed, so unrelated files are kept and a sub-folder can no longer make the
# clean-up raise.
for stale_frame in f.glob("*.png"):
    if stale_frame.is_file():
        stale_frame.unlink()

**Set constants**

In [2]:
# Config
# Video frame rate; used to derive the number of frames and passed to ffmpeg.
FPS = 30
FFT_WINDOW_SECONDS = 0.25 # how many seconds of audio make up an FFT window

# Frequency range shown on the x-axis of the spectrum plot (Hz)
FREQ_MIN = 10
FREQ_MAX = 1000

# Number of strongest notes to label on each frame
TOP_NOTES = 3

# Names of the twelve notes in an octave, indexed by note number modulo 12
NOTE_NAMES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

# Output size (width, height). Generally use SCALE for higher res, unless you need a non-standard aspect ratio.
RESOLUTION = (1920, 1080)
SCALE = 1 # 0.5=qHD(960x540), 1=HD(1920x1080), 2=4K(3840x2160)

**Choose Audio File**

In [3]:
import os

# Full path (as a string) of the recording to analyse, located in PATH.
AUDIO_FILE = os.fspath(PATH.joinpath('Hey_You_1.wav'))

**Load audio file and set up x domain (time)** 

In [4]:
import matplotlib.pyplot as plt
from scipy.fftpack import fft
from scipy.io import wavfile


# Load the recording: fs is the sample rate, data the array of samples.
fs, data = wavfile.read(os.path.join(PATH,AUDIO_FILE))

# A single-channel file is read as a 1-D array and is used as-is; for a
# multi-channel file only the first channel is analysed.
audio = data if data.ndim == 1 else data[:, 0]

FRAME_STEP = (fs / FPS) # audio samples per video frame
FFT_WINDOW_SIZE = int(fs * FFT_WINDOW_SECONDS) # audio samples per FFT window
AUDIO_LENGTH = len(audio)/fs # duration of the recording in seconds

print(AUDIO_LENGTH)

16.653854875283447


  fs, data = wavfile.read(os.path.join(PATH,AUDIO_FILE)) # load data into array


**Several Utility Functions**  
- Plot Frequency Spectrum  
- Extract Samples  
- Find Top Notes  

In [5]:
import plotly.graph_objects as go

def plot_fft(p, xf, fs, notes, dimensions=(960,540)):
    """Build the spectrum figure for a single frame.

    p          -- values drawn on the y-axis (one per entry of xf)
    xf         -- values drawn on the x-axis
    fs         -- accepted for call compatibility; not used by this function
    notes      -- entries of the form [x, label, y]; each one is drawn as a
                  text annotation
    dimensions -- (width, height) of the figure

    Returns the plotly figure. The axes are fixed to FREQ_MIN..FREQ_MAX
    horizontally and 0..1 vertically.
    """
    fig_width = dimensions[0]
    fig_height = dimensions[1]

    figure = go.Figure(
        layout=go.Layout(
            title="frequency spectrum",
            autosize=False,
            width=fig_width,
            height=fig_height,
            xaxis_title="Frequency (note)",
            yaxis_title="Magnitude",
            font={'size': 24},
        ),
        layout_xaxis_range=[FREQ_MIN, FREQ_MAX],
        layout_yaxis_range=[0, 1],
    )
    figure.add_trace(go.Scatter(x=xf, y=p))

    # Each label is placed 10 x-units to the right of its entry's x value.
    for entry in notes:
        figure.add_annotation(
            x=entry[0] + 10,
            y=entry[2],
            text=entry[1],
            font={'size': 48},
            showarrow=False,
        )
    return figure

def extract_sample(audio, frame_number):
    """Return the FFT window of audio that ends at the given video frame.

    The window covers the FFT_WINDOW_SIZE samples that end at sample
    ``frame_number * FRAME_OFFSET``. Wherever the window reaches outside the
    recording (before the first sample or after the last one) it is filled
    with zeros, so the result always has exactly FFT_WINDOW_SIZE entries and
    can be multiplied by the window function.

    audio        -- 1-D sequence of samples
    frame_number -- non-negative index of the video frame

    Returns a slice of ``audio`` when the window lies fully inside the
    recording, otherwise a new zero-padded float array.
    """
    end = frame_number * FRAME_OFFSET
    begin = int(end - FFT_WINDOW_SIZE)

    if begin >= 0 and end <= len(audio):
        # Usually this happens: the whole window lies inside the recording.
        return audio[begin:end]

    # Part of the window lies outside the recording: pad with zeros on the
    # side(s) where samples are missing.
    leading = np.zeros(max(-begin, 0), dtype=float)
    available = audio[max(begin, 0):end]
    trailing = np.zeros(FFT_WINDOW_SIZE - len(leading) - len(available), dtype=float)
    return np.concatenate([leading, available, trailing])

def find_top_notes(fft,num):
    """Return up to ``num`` distinct notes with the largest magnitudes.

    fft -- magnitude for every frequency bin of the module-level ``xf``
    num -- maximum number of notes to return

    Bins are visited from the largest magnitude to the smallest (ties keep
    their bin order). Each bin's frequency is mapped to the nearest note name;
    a name that was already found is skipped, as is any bin whose frequency is
    not positive (the 0 Hz bin has no note number and would otherwise raise
    when rounded).

    Returns a list of ``[frequency, note name, magnitude]`` entries, or an
    empty list when the spectrum is empty or its maximum is below 0.001.
    """
    magnitudes = np.asarray(fft).real
    if magnitudes.size == 0 or np.max(magnitudes) < 0.001:
        return []

    # Stable sort on the negated values: descending magnitude, ties in bin order.
    order = np.argsort(-magnitudes, kind="stable")

    found = []
    found_note = set()
    for idx in order:
        if len(found) >= num:
            break

        f = xf[idx]
        if f <= 0:
            # No note corresponds to 0 Hz (log2(0) is -inf).
            continue

        name = note_name(int(round(freq_to_number(f))))
        if name in found_note:
            continue

        found_note.add(name)
        found.append([f, name, magnitudes[idx]])

    return found

**Convert Frequency to Note**

In [6]:
import numpy as np

def freq_to_number(f):
    """Convert a frequency to a (fractional) note number, with 440 -> 69."""
    octaves_from_reference = np.log2(f/440.0)
    return 69 + 12*octaves_from_reference
def number_to_freq(n):
    """Convert a note number to its frequency, with 69 -> 440."""
    semitones_from_reference = n-69
    return 440 * 2.0**(semitones_from_reference/12.0)
def note_name(n):
    """Return the name of integer note number ``n``, e.g. 69 -> 'A4'.

    The letter comes from NOTE_NAMES (``n`` modulo 12) and the octave is
    ``n // 12 - 1``. Floor division is used so that note numbers below 12
    (and negative ones) land in the correct octave; truncating
    ``n/12 - 1`` toward zero labelled notes 1-11 as octave 0 instead of -1.
    """
    octave = n // 12 - 1
    return NOTE_NAMES[n % 12] + str(int(octave))

**Hanning Window Function**

In [7]:
# Hanning window function
window = 0.5 * (1 - np.cos(np.linspace(0, 2*np.pi, FFT_WINDOW_SIZE, False)))

xf = np.fft.rfftfreq(FFT_WINDOW_SIZE, 1/fs)
FRAME_COUNT = int(AUDIO_LENGTH*FPS)
FRAME_OFFSET = int(len(audio)/FRAME_COUNT)

print(f"Frame Count: {FRAME_COUNT}")
print(f"Frame Offset: {FRAME_OFFSET}")

Frame Count: 499
Frame Offset: 1471


**Pass 1 - Find out the maximum amplitude so we can scale**

In [8]:
# Scan every frame once and keep the largest spectral magnitude seen; the
# second pass divides by it so the spectrum fits the fixed 0..1 y-axis.
mx = 0
for frame_number in range(FRAME_COUNT):
    windowed = extract_sample(audio, frame_number) * window
    magnitudes = np.abs(np.fft.rfft(windowed))
    frame_peak = np.max(magnitudes)
    mx = max(frame_peak, mx)

print(f"Max amplitude: {mx}")

Max amplitude: 585466845891.2194


**Pass 2 - Produce the Animation**

In [9]:
import tqdm

# Divisor that scales every spectrum into 0..1. A completely silent recording
# has mx == 0; dividing by it would fill the spectrum with NaNs, so fall back
# to 1 in that case.
norm = mx if mx > 0 else 1

for frame_number in tqdm.tqdm(range(FRAME_COUNT)):
    sample = extract_sample(audio, frame_number)

    # Magnitude spectrum of the windowed sample, normalised across all frames.
    fft = np.fft.rfft(sample * window)
    fft = np.abs(fft) / norm

    s = find_top_notes(fft,TOP_NOTES)

    fig = plot_fft(fft,xf,fs,s,RESOLUTION)

    # One PNG per frame; ffmpeg later reads them as frame%d.png.
    filename = f.joinpath(f"frame{frame_number}.png")

    # SCALE multiplies the pixel size of the exported image (1 = RESOLUTION).
    fig.write_image(filename, format='png', scale=SCALE, engine='kaleido')

100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [01:42<00:00,  4.85it/s]


**Use ffmpeg to combine the input audio WAV and the individual frame images into an MP4 video.**

In [10]:
import ffmpeg

In [11]:
# Read the frames from the same "content" folder the notebook wrote them to
# (variable f) instead of a machine-specific absolute path; both input paths
# are quoted so that folders containing spaces work.
!ffmpeg -y -r {FPS} -f image2 -s 1920x1080 -i "{f}/frame%d.png" -i "{AUDIO_FILE}" -c:v libx264 -pix_fmt yuv420p movie.mp4

ffmpeg version 4.3.1-2020-11-19-essentials_build-www.gyan.dev Copyright (c) 2000-2020 the FFmpeg developers
  built with gcc 10.2.0 (Rev5, Built by MSYS2 project)
  configuration: --enable-gpl --enable-version3 --enable-static --disable-w32threads --disable-autodetect --enable-fontconfig --enable-iconv --enable-gnutls --enable-libxml2 --enable-gmp --enable-lzma --enable-zlib --enable-libsrt --enable-libssh --enable-libzmq --enable-avisynth --enable-sdl2 --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxvid --enable-libaom --enable-libopenjpeg --enable-libvpx --enable-libass --enable-libfreetype --enable-libfribidi --enable-libvidstab --enable-libvmaf --enable-libzimg --enable-amf --enable-cuda-llvm --enable-cuvid --enable-ffnvcodec --enable-nvdec --enable-nvenc --enable-d3d11va --enable-dxva2 --enable-libmfx --enable-libgme --enable-libopenmpt --enable-libopencore-amrwb --enable-libmp3lame --enable-libtheora --enable-libvo-amrwbenc --enable-libgsm --enable-libopencore-amr