In [2]:
import time

import soundcard as sc
import torchaudio as ta
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from note_model import *
from zach_visualization import *

import IPython.display as ipd

In [3]:
# Cache of triangular mel filter banks keyed by (SR, NFFT, N_MELS). A bank
# depends only on those three values, so it is built once and then reused for
# every audio block instead of being rebuilt on each call.
_MEL_FBANK_CACHE = {}


def _mel_filterbank(SR, NFFT, N_MELS):
    """Return the (N_MELS, floor(NFFT / 2 + 1)) triangular mel filter bank, building it on first use."""
    key = (SR, NFFT, N_MELS)
    fbank = _MEL_FBANK_CACHE.get(key)
    if fbank is not None:
        return fbank

    low_freq_mel = 0
    high_freq_mel = 2595 * np.log10(1 + (SR / 2) / 700)                # Convert Hz to Mel
    mel_points = np.linspace(low_freq_mel, high_freq_mel, N_MELS + 2)  # Equally spaced in Mel scale
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)                  # Convert Mel to Hz
    # Spectrum-bin index of every triangle corner (named `edges` so the
    # builtin `bin` is not shadowed).
    edges = np.floor((NFFT + 1) * hz_points / SR)

    fbank = np.zeros((N_MELS, int(np.floor(NFFT / 2 + 1))))
    for m in range(1, N_MELS + 1):
        left, center, right = int(edges[m - 1]), int(edges[m]), int(edges[m + 1])
        # Neighbouring corners can land on the same bin at low frequencies;
        # that side of the triangle is then empty and is skipped.
        if center > left:
            rising = np.arange(left, center)
            fbank[m - 1, rising] = (rising - edges[m - 1]) / (edges[m] - edges[m - 1])
        if right > center:
            falling = np.arange(center, right)
            fbank[m - 1, falling] = (edges[m + 1] - falling) / (edges[m + 1] - edges[m])

    # The cached array is shared between calls, so make it read-only.
    fbank.setflags(write=False)
    _MEL_FBANK_CACHE[key] = fbank
    return fbank


def mel_scale(data, SR=16000, NFFT=4095, N_MELS=256):
    """ converts spectrum to MEL spectrum (in dB) without using librosa/torchaudio because they're broken on Jetson
        Arguments:
            data: spectrum whose last axis holds floor(NFFT / 2 + 1) bins
                  (the callers in this notebook pass one channel at a time)
            SR: sample rate in Hz
            NFFT: FFT length the spectrum corresponds to; it only fixes the
                  expected bin count, e.g. the default 4095 expects 2048 bins
                  and 2048 expects 1025 bins
            N_MELS: size of output vector
        Returns:
            20 * log10 of the mel filter bank energies; energies that are
            exactly zero are replaced by machine epsilon before the log.
        Raises:
            ValueError: if the last axis of data does not match the bin count
                        implied by NFFT.
    """
    fbank = _mel_filterbank(SR, NFFT, N_MELS)

    spectrum = np.asarray(data)
    if spectrum.ndim >= 1 and spectrum.shape[-1] != fbank.shape[1]:
        raise ValueError(
            f"mel_scale expected {fbank.shape[1]} spectrum bins for NFFT={NFFT}, "
            f"got {spectrum.shape[-1]}"
        )

    filter_banks = np.dot(data, fbank.T)
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical Stability
    return 20 * np.log10(filter_banks)  # dB

In [4]:
# Capture device looked up by name. NOTE(review): 'CODEC' is presumably the
# audio interface attached to this machine; confirm the name on other hardware.
mic = sc.get_microphone('CODEC')

In [5]:
# Audio configuration; SR is the sample rate handed to mic.recorder() in the capture cells.
SR = 16000     # highest piano note is ~4k, shouldn't need more than double that range
NMELS = 256    # set by the model, don't change here

In [6]:
# Build the network and restore the trained weights. The checkpoint is
# deserialised onto the CPU first so loading does not depend on the device it
# was saved from; the model is then moved to the GPU, which is where the
# inference cell sends its inputs.
model = Audio2Midi(kernel_size=9)
model.load_state_dict(torch.load("./models/300_2e-05.pth", map_location="cpu"))
# Inference only: eval() switches off the Dropout layers and makes the
# BatchNorm layers use their stored running statistics instead of per-batch
# statistics. Both cuda() and eval() return the module, so the cell still
# displays the model summary.
model.cuda().eval()

Audio2Midi(
  (input): Conv1d(1, 64, kernel_size=(9,), stride=(1,), padding=(4,))
  (encoder): Sequential(
    (0): Sequential(
      (0): Conv1d(64, 128, kernel_size=(9,), stride=(2,), padding=(4,))
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): Tanh()
      (3): Dropout(p=0.1, inplace=False)
    )
    (1): Sequential(
      (0): Conv1d(128, 64, kernel_size=(9,), stride=(2,), padding=(4,))
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): Tanh()
      (3): Dropout(p=0.1, inplace=False)
    )
    (2): Sequential(
      (0): Conv1d(64, 32, kernel_size=(9,), stride=(2,), padding=(4,))
      (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): Tanh()
      (3): Dropout(p=0.1, inplace=False)
    )
    (3): Sequential(
      (0): Conv1d(32, 16, kernel_size=(9,), stride=(2,), padding=(4,))
      (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, af

In [6]:
# Open the LED-cube driver used by every display cell below (the captured output shows it
# connecting to a serial device).
# NOTE(review): 6 is presumably the cube's edge length (cf. c.size) — confirm against Cube's definition.
c = Cube(6)

Connecting to:  /dev/ttyACM1 
...
Connected to:  /dev/ttyACM1  !


In [7]:
def make_frame(l_vec, r_vec, genre, thresh):
    """
    Creates a frame for visualization stack out of left and right machine learning outputs
    Arguments:
        l_vec: velocity vector for left channel (tensor, one entry per pitch)
        r_vec: velocity vector for right channel (same shape as l_vec)
        genre: genre to pass through to frame
        thresh: velocity value under which to ignore note
    Returns:
        Frame holding one Note(velocity, pitch, pan) for every pitch whose
        velocity (the larger of the two channels) lies in [thresh, 127].
    """
    # Pan comes from the right/left ratio. 0/0 (both channels silent) would
    # give NaN and propagate into the Note, so it is treated as a balanced
    # ratio of 1 instead.
    # NOTE(review): for non-negative inputs the ratio is >= 0, so pans only
    # cover 64..126 and left-dominant notes land near the centre — confirm
    # this is the intended mapping.
    ratio = r_vec / l_vec
    ratio = torch.where(torch.isnan(ratio), torch.ones_like(ratio), ratio)
    pans = torch.round(63.5 + torch.clamp(ratio, -63, 63))

    # A note's velocity is the louder of its two channels.
    velocities = torch.max(l_vec, r_vec)

    # Convert each tensor to Python floats once (a single host transfer)
    # rather than calling .item() per pitch. Flattening also accepts inputs
    # that carry leading singleton batch/channel axes.
    velocity_values = velocities.reshape(-1).tolist()
    pan_values = pans.reshape(-1).tolist()

    notes = [
        Note(velocity, pitch, pan)
        for pitch, (velocity, pan) in enumerate(zip(velocity_values, pan_values))
        if thresh <= velocity <= 127
    ]
    return Frame(notes, genre)

## Using ML

In [8]:
%%time
g = Genre('default')
mean_val = 0;
with mic.recorder(samplerate=SR, blocksize=32) as m:
    #for j in range(SR//2048*10):
    while True:
        data = m.record(numframes=2048)
        data /= np.max(np.abs(data))
        fframe = np.abs(np.fft.rfft(data))
        fframe = np.log(np.clip(fframe, 1e-5, None))
        lframe = mel_scale(np.abs(fframe[:,0]))
        rframe = mel_scale(np.abs(fframe[:,1]))
        mean_val = np.mean(lframe)
        lout = model(torch.from_numpy(lframe).float().unsqueeze(0).unsqueeze(0).cuda())
        rout = model(torch.from_numpy(rframe).float().unsqueeze(0).unsqueeze(0).cuda())
        f = make_frame(lout.detach(), rout.detach(), g, thresh=0.1)
        c.process_frame(f)
        #ipd.clear_output()
        c.display()
        #display(c.display_screen(0))

KeyboardInterrupt: 

## Using Raw Mels

In [8]:
%%time
g = Genre('default')
mean_val = 0;
with mic.recorder(samplerate=SR, blocksize=32) as m:
    #for j in range(SR//2048*30):
    while True:
        data = m.record(numframes=2048)
        data /= np.max(np.abs(data))
        fframe = np.abs(np.fft.rfft(data))
        fframe = np.log(np.clip(fframe, 1e-5, None))
        lframe = mel_scale(np.abs(fframe[:,0]))
        rframe = mel_scale(np.abs(fframe[:,1]))
        mean_val = np.mean(lframe)
        offset = 20
        lout = torch.from_numpy(lframe[offset:128+offset])
        rout = torch.from_numpy(rframe[offset:128+offset])
        f = make_frame(lout.detach(), rout.detach(), g, thresh=25)
        c.process_frame(f)
        #ipd.clear_output()
        c.display()
        #display(c.display_screen(0))

KeyboardInterrupt: 

In [10]:
# Mean of the last left-channel mel frame computed by whichever capture loop ran most recently.
mean_val

12.351149030321936

In [8]:
# Reset the cube after a capture loop was interrupted.
# NOTE(review): presumably blanks the cube's state; the sweep cell also calls it before each frame.
c.clear()

In [38]:
# NOTE(review): leftover interactive help lookup from development; safe to delete from the final notebook.
?torch.clamp

## Manual Testing

In [11]:
from random import randint

In [12]:
# Sweep one full-velocity note (velocity 127, pan 64) across pitches 12..126,
# one pitch per step, so each pitch's position on the cube can be checked by eye.
# Needs `time` from the notebook's import cell.
sweep_genre = Genre('default')  # built once; every frame in the sweep shares it
for i in range(12, 127):
    print(f"i: {i}")
    f = Frame([Note(127, i, 64)], sweep_genre)

    c.clear()
    c.process_frame(f)
    c.display()
    time.sleep(0.1)  # hold each pitch long enough to see it

i: 12
i: 13
i: 14
i: 15
i: 16
i: 17
i: 18
i: 19
i: 20
i: 21
i: 22
i: 23
i: 24
i: 25
i: 26
i: 27
i: 28
i: 29
i: 30
i: 31
i: 32
i: 33
i: 34
i: 35
i: 36
i: 37
i: 38
i: 39
i: 40
i: 41
i: 42
i: 43
i: 44
i: 45
i: 46
i: 47
i: 48
i: 49
i: 50
i: 51
i: 52
i: 53
i: 54
i: 55
i: 56
i: 57
i: 58
i: 59
i: 60
i: 61
i: 62
i: 63
i: 64
i: 65
i: 66
i: 67
i: 68
i: 69
i: 70
i: 71
i: 72
i: 73
i: 74
i: 75
i: 76
i: 77
i: 78
i: 79
i: 80
i: 81
i: 82
i: 83
i: 84
i: 85
i: 86
i: 87
i: 88
i: 89
i: 90
i: 91
i: 92
i: 93
i: 94
i: 95
i: 96
i: 97
i: 98
i: 99
i: 100
i: 101
i: 102
i: 103
i: 104
i: 105
i: 106
i: 107
i: 108
i: 109
i: 110
i: 111
i: 112
i: 113
i: 114
i: 115
i: 116
i: 117
i: 118
i: 119
i: 120
i: 121
i: 122
i: 123
i: 124
i: 125
i: 126


In [14]:
# Run the controller's built-in test animation.
# NOTE(review): c.size**3 is presumably the total LED count of the cube — TODO confirm.
c.arduino.testAnimation(c.size**3)