In [13]:
import soundcard as sc
import torchaudio as ta
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from note_model import *

In [68]:
def mel_scale(data, SR=16000, NFFT=4095, N_MELS=256):
    """ converts spectrum to MEL spectrum without using librosa/torchaudio because they're broken on Jetson
        Note: only works with single channel
        Arguments:
            SR: sample rate
            NFFT: length of FFT (except *2-1 because of implementation?)
            N_MELS: size of output vector
    """
    low_freq_mel = 0
    high_freq_mel = (2595 * np.log10(1 + (SR / 2) / 700))  # Convert Hz to Mel
    mel_points = np.linspace(low_freq_mel, high_freq_mel, N_MELS + 2)  # Equally spaced in Mel scale
    hz_points = (700 * (10**(mel_points / 2595) - 1))  # Convert Mel to Hz
    bin = np.floor((NFFT + 1) * hz_points / SR)

    fbank = np.zeros((N_MELS, int(np.floor(NFFT / 2 + 1))))
    for m in range(1, N_MELS + 1):
        f_m_minus = int(bin[m - 1])   # left
        f_m = int(bin[m])             # center
        f_m_plus = int(bin[m + 1])    # right

        for k in range(f_m_minus, f_m):
            fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
        for k in range(f_m, f_m_plus):
            fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
    filter_banks = np.dot(data, fbank.T)
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical Stability
    filter_banks = 20 * np.log10(filter_banks)  # dB
    return filter_banks

In [10]:
mic = sc.get_microphone('CODEC')

In [21]:
sp.play(data, samplerate=44100, channels=2)

In [35]:
SR = 16000     # highest piano note is ~4k, shouldn't need more than double that range
NMELS = 256    # set by the model, don't change here

In [46]:
model = Audio2Midi(kernel_size=9)
model.load_state_dict(torch.load("./models/300_2e-05.pth"))
model.cuda()

<All keys matched successfully>

In [86]:
%%time
with mic.recorder(samplerate=SR, blocksize=32) as m, sp.player(samplerate=SR, channels=2, blocksize=32) as s:
    for j in range(SR//2048*2):
        data = m.record(numframes=2048)
        fframe = np.abs(np.fft.fft(data))
        lframe = mel_scale(np.abs(fframe[:,0]))
        rframe = mel_scale(np.abs(fframe[:,1]))
        lout = model(torch.from_numpy(lframe).float().unsqueeze(0).unsqueeze(0).cuda())
        rout = model(torch.from_numpy(rframe).float().unsqueeze(0).unsqueeze(0).cuda())
        if j==0:
            print(lout)
            print(rout)

tensor([0.4982, 0.0000, 0.2435, 0.3251, 0.0124, 0.1755, 0.0167, 0.0000, 0.0365,
        0.0000, 0.0434, 0.0658, 0.1043, 0.1373, 0.0000, 0.0640, 0.0000, 0.1840,
        0.0977, 0.0535, 0.0000, 0.0404, 0.2631, 0.2547, 0.0686, 0.3259, 0.1633,
        0.0000, 0.0635, 0.1513, 0.0000, 0.3172, 0.0000, 0.1414, 0.0662, 0.2442,
        0.0450, 0.3463, 0.1440, 0.0330, 0.1143, 0.1568, 0.1185, 0.2823, 0.3406,
        0.1819, 0.1308, 0.2384, 0.1711, 0.2428, 0.3526, 0.4191, 0.1851, 0.3796,
        0.4786, 0.3701, 0.1045, 0.0000, 0.1094, 0.1937, 0.0000, 0.3853, 0.3075,
        0.2059, 0.0000, 0.0000, 0.1132, 0.0000, 0.0000, 0.0000, 0.0000, 0.2994,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2259, 0.0000, 0.0000,
        0.2539, 0.1695, 0.0276, 0.1263, 0.0000, 0.0000, 0.2596],
       device='cuda:0', grad_fn=<SqueezeBackward1>)
tensor([0.2191, 0.2942, 0.1446, 0.3489, 0.2328, 0.2045, 0.3935, 0.6198, 0.2738,
        0.0000, 0.0000, 0.0034, 0.0970, 0.0000, 0.4029, 0.0103, 0.0000, 0.0885,
   