In [3]:
import numpy as np
import matplotlib.pyplot as plt
import essentia
import IPython
from pylab import plot, show, figure, imshow
from essentia.standard import *
import essentia.streaming as ess

In [4]:
# Candidate input recordings; the selected one is read from the samples/ directory.
fnames = ['145792817.wav', '155943863.wav','165489427.mp4','badboy.mp3','201804232330421545.wav', 'boomboombang.wav']
fname = fnames[1]
fullpath = 'samples/%s' % (fname)
# 44100 Hz is MonoLoader's default output rate; it is spelled out here because
# the onset cell converts onset times to sample indices using 44100.
sample_audio = MonoLoader(filename=fullpath, sampleRate=44100)()

# Phase 1: compute the onset detection function
# The OnsetDetection algorithm provides various onset detection functions. Let's use two of them.

frameSize = 2**11
hopSize = 2**9

w = Windowing(type='blackmanharris62')
fft = FFT() # this gives us a complex FFT
c2p = CartesianToPolar() # and this turns it into a pair (magnitude, phase)
pool = essentia.Pool()

# The SPR analysis/synthesis algorithms are instantiated in the cell that
# actually uses them, so no unused instances are created here.

od1 = OnsetDetection(method='hfc')
od2 = OnsetDetection(method='complex')

In [5]:
# Start from an empty pool so that re-running this cell does not append a
# second copy of the detection functions to values left from an earlier run
# (or to the pool that the key-estimation cell rebinds under the same name).
pool = essentia.Pool()

for frame in FrameGenerator(sample_audio, frameSize=frameSize, hopSize=hopSize):
    mag, phase = c2p(fft(w(frame)))
    pool.add('features.hfc', od1(mag, phase))
    pool.add('features.complex', od2(mag, phase))

# Phase 2: compute the actual onsets locations
onsets = Onsets(delay=10)

onsets_hfc = onsets(# this algo expects a matrix, not a vector
                    essentia.array([ pool['features.hfc'] ]),

                    # you need to specify weights, but as there is only a single
                    # function, it doesn't actually matter which weight you give it
                    [ 1 ])

# Computed for comparison with the HFC result; not used by the code below.
onsets_complex = onsets(essentia.array([ pool['features.complex'] ]), [ 1 ])

# Convert onset times to sample indices.
# NOTE(review): assumes sample_audio is at 44100 Hz (MonoLoader's default) and
# that Onsets reports positions in seconds -- confirm if the loader settings change.
sample_rate = 44100.0
onsets_f = np.round(onsets_hfc * sample_rate)
num_onsets = len(onsets_f)
# Append the end of the signal so segment i spans onsets_f[i] .. onsets_f[i+1].
onsets_f = np.append(onsets_f, len(sample_audio))

In [6]:
frameSize = 2**11
hopSize = 2**9

# Estimate one key per onset-delimited segment with a streaming network.
# A new set of algorithm instances is built and connected for every segment,
# each fed by its own VectorInput.
for i in range(num_onsets):
    segment = sample_audio[int(onsets_f[i]):int(onsets_f[i+1])]
    input_audio = ess.VectorInput(segment)
    framecutter = ess.FrameCutter(frameSize = frameSize, hopSize = hopSize, silentFrames='noise')
    windowing = ess.Windowing(type='blackmanharris62')

    spectrum = ess.Spectrum()
    spectralpeaks = ess.SpectralPeaks(orderBy='magnitude',
                                      magnitudeThreshold=0.00001,
                                      minFrequency=20,
                                      maxFrequency=3500,
                                      maxPeaks=60)

    hpcp_key = ess.HPCP(size=36, # we will need higher resolution for Key estimation
                        referenceFrequency=440, # tuning reference frequency in Hz (440, not 44100)
                        bandPreset=False,
                        minFrequency=20,
                        maxFrequency=3500,
                        weightType='cosine',
                        nonLinear=False,
                        windowSize=1.)

    keymodel = ess.Key(profileType='edma', # Use profile for electronic music
                       numHarmonics=4,
                       pcpSize=36,
                       slope=0.6,
                       usePolyphony=True,
                       useThreeChords=True)
    # Fresh pool per segment so each iteration prints only its own result.
    pool = essentia.Pool()

    # Connect: audio -> frames -> window -> spectrum -> peaks -> HPCP -> key
    input_audio.data >> framecutter.signal
    framecutter.frame >> windowing.frame >> spectrum.frame
    spectrum.spectrum >> spectralpeaks.spectrum
    spectralpeaks.magnitudes >> hpcp_key.magnitudes
    spectralpeaks.frequencies >> hpcp_key.frequencies
    hpcp_key.hpcp >> keymodel.pcp
    keymodel.key >> (pool, 'tonal.key_key')
    keymodel.scale >> (pool, 'tonal.key_scale')
    keymodel.strength >> (pool, 'tonal.key_strength')

    # Run streaming network
    essentia.run(input_audio)
    print(pool['tonal.key_key'])
    print(pool['tonal.key_scale'])
    print(pool['tonal.key_strength'])

G
minor
0.6454615592956543
D
minor
0.35072171688079834
A#
major
0.6340144276618958
B
minor
0.5540128350257874
A
major
0.4152987003326416
F#
minor
0.6670655012130737
G
major
0.378895103931427
F#
minor
0.352996289730072
F
minor
0.555366575717926
E
major
0.5379486680030823
D
minor
0.44043442606925964
D
major
0.37589573860168457
E
minor
0.6504807472229004
F
minor
0.44216156005859375
B
minor
0.6058656573295593
B
major
0.5220363140106201
D#
minor
0.7641458511352539
E
minor
0.5616748332977295
C#
minor
0.5935976505279541
F
minor
0.6047481894493103
F#
minor
0.27499866485595703
E
minor
0.7176535129547119
B
minor
0.3761984705924988
G#
minor
0.44745978713035583
E
major
0.399375319480896
B
minor
0.5074400901794434
D#
minor
0.49154213070869446
E
major
0.4620727002620697
D
minor
0.5699975490570068
C#
minor
0.6738351583480835
A
major
0.5094552040100098
B
major
0.42019063234329224
D#
minor
0.6974368095397949
D#
minor
0.6100674867630005
F#
major
0.6278017163276672
A#
major
0.47562575340270996
D#
minor
0

In [7]:

# Per-frame results consumed by the resynthesis cell:
#   anal_res[n] -- the tuple returned by spr_anal for frame n
#   key_res[n]  -- (index into key_table, is_major, strength, fourth Key output)
anal_res = []
key_res = []
key_table = ['A','A#','B','C',
             'C#','D','D#','E',
             'F','F#','G','G#']
# Flat spellings mapped onto the sharp names used in key_table.
# NOTE(review): the recorded run prints sharps ('A#', 'D#', 'G#'), but other
# Essentia releases appear to report flats ('Bb', 'Eb', 'Ab') -- confirm against
# the installed version. Normalising first keeps key_table.index() from raising
# ValueError in either case.
flat_to_sharp = {'Bb': 'A#', 'Db': 'C#', 'Eb': 'D#', 'Gb': 'F#', 'Ab': 'G#'}

spr_anal = SprModelAnal(fftSize = frameSize, hopSize = hopSize, maxFrequency = 50000)
spr_synth = SprModelSynth(fftSize = frameSize, hopSize = int(hopSize*1))
windowing = Windowing(type='blackmanharris62')
spectrum = Spectrum()
spectralpeaks = SpectralPeaks(orderBy='magnitude',
                                  magnitudeThreshold=0.00001,
                                  minFrequency=20,
                                  maxFrequency=3500,
                                  maxPeaks=60)

hpcp_key = HPCP(size=36, # we will need higher resolution for Key estimation
                    referenceFrequency=440, # tuning reference frequency in Hz (440, not 44100)
                    bandPreset=False,
                    minFrequency=20,
                    maxFrequency=3500,
                    weightType='cosine',
                    nonLinear=False,
                    windowSize=1.)

keymodel = Key(profileType='edma', # Use profile for electronic music
                   numHarmonics=4,
                   pcpSize=36,
                   slope=0.6,
                   usePolyphony=True,
                   useThreeChords=True)

# Every frame gets its own key estimate and its own SPR analysis.
for frame in FrameGenerator(sample_audio, frameSize=frameSize, hopSize=hopSize):
    freq, mag = spectralpeaks(spectrum(windowing(frame)))
    key, scale, strength, r12 = keymodel(hpcp_key(freq, mag))
    key_index = key_table.index(flat_to_sharp.get(key, key))
    anal_res.append(spr_anal(frame))
    key_res.append((key_index, scale == 'major', strength, r12))


In [8]:

def pitch_factor(n):
    """Return the frequency ratio 2**(n/12) for a shift of ``n`` semitones.

    Positive ``n`` gives a ratio above 1, negative ``n`` a ratio below 1,
    and ``n == 0`` gives exactly 1.0.
    """
    octaves = 1.0 * n / 12.0
    return 2 ** octaves
# Index into key_table that every frame is transposed to (4 -> 'C#').
target_key = 4

output = []
for f, spr_params in enumerate(anal_res):
    freq, mag, phase, resi = spr_params
    key, scale, strength, r12 = key_res[f]

    # Semitone distance to the target; distances of six or more are wrapped
    # by an octave so the shift goes the other way round the circle.
    shift = target_key - key
    if abs(shift) >= 6:
        if shift < 0:
            shift = shift + 12
        else:
            shift = shift - 12
    pf = pitch_factor(shift)

    # Only the sinusoidal frame (second output) is kept for the result.
    shifted_freq = freq * pf
    out_f, out_fs, out_fr = spr_synth(mag, shifted_freq, phase, resi)
    output.append(out_fs)

# Lay the synthesized frames end to end as one 1-D signal.
stacked_frames = np.asarray(output)
result_audio = stacked_frames.flatten()
for count in (len(sample_audio), len(anal_res), result_audio.size):
    print(count)
writer = MonoWriter(filename='outputs/test_%s' % (fname))
writer(result_audio)


2051257
4008
2052096


In [9]:
import IPython
# Embed an audio player for the file written by the resynthesis cell.
rendered_path = 'outputs/test_%s' % (fname)
IPython.display.Audio(rendered_path)