In [1]:
import IPython.display as ipd

import numpy as np
import torch


# import stft_64_pad_0 as stft
import stft_64 as stft
from audio_processing import griffin_lim

from scipy.io.wavfile import read
import time
import newton_method_solver

In [2]:
# Toy problem dimensions; every size below is derived from hop_length.
hop_length = 2
win_length = 4*hop_length
# Equals win_length/2 + 1.
channels = hop_length*2+1
# The test signal spans seven hops.
wav_length = hop_length*7

# Random float64 test signal of shape (1, wav_length).
# No seed is set, so the values differ on every run.
audio_origin = torch.rand((1,wav_length), dtype=torch.float64)
print(audio_origin.shape)
print(audio_origin[0,:10])

torch.Size([1, 14])
tensor([0.6561, 0.9268, 0.0702, 0.1676, 0.3579, 0.0400, 0.5828, 0.8942, 0.9843,
        0.8540], dtype=torch.float64)


In [3]:
# stft_fn = stft.STFT(filter_length=4, hop_length=1, win_length=4,
#                     window='hann')

# STFT operator used by every cell below; filter length and window length are both win_length.
# NOTE(review): window=None is forwarded to stft_64.STFT — presumably "no tapering window"; confirm in stft_64.
stft_fn = stft.STFT(filter_length=win_length, hop_length=hop_length, win_length=win_length,
                    window=None)


def compare(a,b):
    """Return ``(mean absolute difference, mean squared difference)`` of ``a`` and ``b``.

    Both values are 0-dim tensors computed over all elements of ``a - b``.
    """
    diff = a - b
    mean_abs = diff.abs().mean()
    mean_sq = (diff * diff).mean()
    return mean_abs, mean_sq

def compare_L1(ori,gen, eps=0.0):
    """Mean relative L1 error of ``gen`` with respect to ``ori``.

    Computes ``mean(|ori - gen| / |ori|)`` over all elements.

    Args:
        ori: Reference tensor (the denominator of the relative error).
        gen: Tensor compared against ``ori``.
        eps: Optional lower bound applied to ``|ori|`` before dividing. With
            the default ``0.0`` the plain formula is used, so any exact zero in
            ``ori`` yields ``inf`` (or ``nan`` where ``gen`` is also zero).
            Pass a small positive value to keep the result finite.

    Returns:
        0-dim tensor holding the mean relative error.
    """
    denom = torch.abs(ori)
    if eps > 0:
        # Floor the denominator so zero-valued reference entries cannot
        # produce inf/nan and poison the mean.
        denom = denom.clamp_min(eps)
    return torch.mean(torch.abs(ori - gen) / denom)


def compare_L2(a,b):
    """Return ``(sum of absolute differences, sum of squared differences)``.

    Unlike ``compare`` this sums over all elements instead of averaging.
    """
    delta = a - b
    abs_total = torch.sum(delta.abs())
    sq_total = torch.sum(delta * delta)
    return abs_total, sq_total

In [4]:
# Forward STFT of the test signal: per-frame magnitudes and phases.
magnitude, phase_origin = stft_fn.transform(audio_origin)

# Sanity check: rebuild the magnitude of a single frame by hand from the STFT basis.
forward_basis = stft_fn.forward_basis
print(forward_basis.shape)
start_frame = 0
# Project the frame onto the first `channels` basis rows and onto the remaining rows.
# NOTE(review): named as the real (M_Rc) and imaginary (M_Ic) parts — confirm the row layout in stft_64.
M_Rc = torch.sum(forward_basis[:channels,0,:] * audio_origin[:,start_frame: start_frame+win_length], dim =1)
M_Ic = torch.sum(forward_basis[channels:,0,:] * audio_origin[:,start_frame: start_frame+win_length], dim =1)
# NOTE: despite its name, M_c_square holds the magnitude sqrt(M_Rc^2 + M_Ic^2), not its square.
M_c_square = torch.sqrt(M_Rc**2+M_Ic**2)
# Only the shape of the hand-computed value is printed here, not its contents.
print(M_c_square.shape)
# Magnitudes reported by stft_fn at frame index start_frame+2.
# NOTE(review): the +2 offset presumably compensates for padding inside stft_64 — confirm.
print(magnitude[0,:10,start_frame+2])

torch.Size([10, 1, 8])
torch.Size([5])
tensor([3.6956, 1.4934, 0.3733, 1.0496, 0.3617], dtype=torch.float64)


In [5]:
# Random starting point for the solver, one value per sample (7*hop_length == wav_length).
# NOTE(review): created in torch's default dtype while audio_origin is float64 — confirm the solver handles the mismatch.
initial_guess = torch.rand((1, 7*hop_length))

In [10]:


print('hop size', hop_length)
# Build the project solver from the STFT basis and the problem dimensions.
newton_method = newton_method_solver.hop_7_solver(forward_basis, hop_length, channels, win_length)
# Solve from the random initial guess, using magnitude frames 2..5 as the target.
ans = newton_method.solve(magnitude[:,:,2:6], initial_guess, n_iters=20)
print('\n')
# First ten recovered samples next to the true signal.
print('ans   ', ans[0, :10])
print('origin', audio_origin[0, :10])
# newton_method.func(...) is printed as the "error" of each signal.
# NOTE(review): presumably the solver's residual vector — confirm in newton_method_solver.
print('error       ', newton_method.func(ans)[:10])
print('error origin', newton_method.func(audio_origin)[:10])
# Hop-sized segment with index 3 (samples 3*hop_length .. 4*hop_length-1), recovered vs. true.
print('ans part    3', ans[0,3*hop_length:4*hop_length])
print('origin part 3', audio_origin[0, 3*hop_length:4*hop_length])

hop size 2
Iter 19/20: Used times: 0.20

ans    tensor([ 0.7494,  0.8226,  0.2020,  0.0589,  0.4304, -0.0018,  0.5331,  0.8622,
         0.9449,  0.9341], dtype=torch.float64)
origin tensor([0.6561, 0.9268, 0.0702, 0.1676, 0.3579, 0.0400, 0.5828, 0.8942, 0.9843,
        0.8540], dtype=torch.float64)
error        tensor([-0.0388,  0.0129,  0.0473, -0.0214,  0.0102, -0.0135, -0.0006,  0.0040,
         0.0825, -0.0690], dtype=torch.float64)
error origin tensor([ 0.0000e+00,  0.0000e+00,  8.8818e-16,  8.8818e-16,  0.0000e+00,
         0.0000e+00, -2.2204e-16,  2.2204e-16,  0.0000e+00, -1.1102e-16],
       dtype=torch.float64)
ans part    3 tensor([0.5331, 0.8622], dtype=torch.float64)
origin part 3 tensor([0.5828, 0.8942], dtype=torch.float64)


In [11]:

# Same solve, but started close to the truth: ground-truth signal plus uniform noise in [0, 0.1).
# Reuses the `newton_method` instance created in the previous cell.
ans = newton_method.solve(magnitude[:,:,2:6], audio_origin + 0.1*torch.rand((1, 7*hop_length)), n_iters=10)
print('ans   ', ans[0, :10])
print('origin', audio_origin[0, :10])
# "error" is whatever newton_method.func returns for the given signal (project code).
print('error       ', newton_method.func(ans)[:10])
print('error origin', newton_method.func(audio_origin)[:10])

Iter 9/10: Used times: 0.07ans    tensor([1.0189, 0.5585, 0.0404, 0.3709, 0.1276, 0.1135, 0.7939, 0.6844, 1.0652,
        0.6971], dtype=torch.float64)
origin tensor([0.6561, 0.9268, 0.0702, 0.1676, 0.3579, 0.0400, 0.5828, 0.8942, 0.9843,
        0.8540], dtype=torch.float64)
error        tensor([ 0.0126, -0.0578,  0.0024,  0.0428,  0.0797, -0.1043,  0.0879, -0.0630,
         0.1210, -0.2754], dtype=torch.float64)
error origin tensor([ 0.0000e+00,  0.0000e+00,  8.8818e-16,  8.8818e-16,  0.0000e+00,
         0.0000e+00, -2.2204e-16,  2.2204e-16,  0.0000e+00, -1.1102e-16],
       dtype=torch.float64)


#### Test

In [7]:
def load_wav_to_torch(full_path):
    """Read a WAV file and return its samples as a float64 tensor.

    Args:
        full_path: File name (or open file handle) accepted by
            ``scipy.io.wavfile.read``.

    Returns:
        Tuple ``(samples, sampling_rate)``. ``samples`` is a ``torch.float64``
        tensor with the same shape as the array returned by the reader and
        the sample values exactly as stored in the file (no rescaling);
        ``sampling_rate`` is the rate reported by the reader.
    """
    sampling_rate, data = read(full_path)
    # Convert straight to float64. Going through float32 first would round
    # int32 / float64 samples that float32 cannot represent exactly, even
    # though the result is stored in a float64 tensor anyway.
    return torch.from_numpy(data.astype(np.float64)), sampling_rate

# Load the demo recording (path is relative to the notebook's working directory).
# NOTE: this rebinds audio_origin (and magnitude below), replacing the synthetic signal used in earlier cells.
audio_origin, sampling_rate = load_wav_to_torch('demo.wav')

# STFT of the whole file; unsqueeze(0) adds a leading dimension of size 1 before the transform.
magnitude, phase_origin = stft_fn.transform(audio_origin.unsqueeze(0))
print(magnitude.shape)
# print(magnitude[0,:5,:2])
# magnitude, phase_origin = stft_fn.transform(audio_origin.unsqueeze(0)*20)
# print(magnitude.shape)
# print(magnitude[0,:5,:2]/20)

torch.Size([1, 33, 6161])


In [13]:
def griffin_lim(magnitudes, stft_fn, n_iters=30, reference=None):
    """Estimate STFT phases for fixed ``magnitudes`` by Griffin-Lim style iteration.

    Starting from uniformly random phases, each iteration re-analyses the
    current signal with ``stft_fn.transform`` to obtain new phases and then
    re-synthesises a signal from the fixed ``magnitudes`` and those phases.

    NOTE: this definition shadows the ``griffin_lim`` imported from
    ``audio_processing`` at the top of the notebook.

    Args:
        magnitudes: Magnitude tensor; its ``size()`` fixes the shape of the
            phase tensor, and it is passed unchanged to ``stft_fn.inverse``.
        stft_fn: Object providing ``inverse(magnitudes, angles)`` and
            ``transform(signal)`` returning ``(magnitude, angles)``.
        n_iters: Number of refinement iterations.
        reference: Optional signal that the running estimate is compared with
            for the progress line printed on every 5th iteration. When left
            as ``None`` the module-level ``audio_origin`` is used, which is
            what this function always did before the argument existed.

    Returns:
        The phase tensor from the last ``stft_fn.transform`` call, or the
        initial random phases (float64) when ``n_iters`` is 0.
    """
    # Random initial phases; np.random.rand consumes the global NumPy RNG.
    random_phase = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    # A plain tensor is enough here: torch.autograd.Variable is deprecated
    # and only returned a tensor anyway.
    angles = torch.from_numpy(random_phase.astype(np.float64))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    for i in range(n_iters):
        if (i+1)%5==0:
            # Progress report: mean absolute / mean squared difference between
            # the current estimate and the comparison signal.
            target = audio_origin if reference is None else reference
            a1,a2 = compare(signal, target)
            print('%d/%d:%.4f, %.4f'%(i,n_iters,a1,a2))
        # Only the phases of the re-analysed signal are kept; the magnitudes
        # are always the fixed input ones.
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return angles

# Run five iterations of the notebook-local griffin_lim on the demo magnitudes to get a phase estimate.
phase_griffin_lim =  griffin_lim(magnitude, stft_fn, n_iters=5)
print(phase_griffin_lim.shape)
# Resynthesise audio from the true magnitudes and the estimated phases.
audio_griffin_lim = stft_fn.inverse(magnitude, phase_griffin_lim)
print(audio_griffin_lim.shape)

4/5:0.0596, 0.0110
torch.Size([1, 33, 6161])
torch.Size([1, 1, 98560])


In [26]:
# Index (in hops) of the first hop of the 7-hop excerpt handed to the solver.
START = 20

# Target magnitudes: the four frames START+2 .. START+5.
ground_magnitude = magnitude[:,:,START+2:START+6]
# Ground-truth samples of the excerpt: hops START .. START+6.
ground_audio_origin = audio_origin.unsqueeze(0)[:, START * hop_length : (START+7) * hop_length]
# The same sample range taken from the Griffin-Lim reconstruction; used as the solver's initial guess.
initial_guess_audio = audio_griffin_lim.squeeze(0)[:, START * hop_length : (START+7) * hop_length]
print(initial_guess_audio[0,:10])

tensor([ 1.7802e-05, -3.3379e-03, -4.2128e-03,  4.8132e-03,  4.7193e-03,
        -2.9714e-03, -6.9590e-03,  4.2493e-03,  2.2001e-03, -3.5663e-03],
       dtype=torch.float64)


In [27]:
# Mean absolute magnitude of the excerpt.
print(torch.mean(torch.abs(ground_magnitude)))
# Scale factor that brings the mean magnitude up to 1; it is never below 1,
# so excerpts whose mean magnitude already exceeds 1 are left unscaled.
# NOTE(review): an all-zero excerpt would make the reciprocal infinite.
normalize_coefficient = max(1/torch.mean(torch.abs(ground_magnitude)), 1)
print(normalize_coefficient)

tensor(0.0206, dtype=torch.float64)
tensor(48.5575, dtype=torch.float64)


In [19]:

# NOTE(review): the recorded output of this cell prints "hop size 16", so it was executed
# with a different hop_length than the value 2 assigned in the configuration cell above.
print('hop size', hop_length)
newton_method = newton_method_solver.hop_7_solver(forward_basis, hop_length, channels, win_length)
# Solve in the normalised scale: target magnitudes and initial guess are both multiplied by the coefficient.
ans = newton_method.solve(ground_magnitude*normalize_coefficient, initial_guess_audio*normalize_coefficient, n_iters=5)
# Undo the scaling so `ans` is directly comparable with the original audio.
ans = ans/normalize_coefficient
print('\n')
print('ans   ', ans[0, :10])
print('origin', ground_audio_origin[0, :10])
# newton_method.test(signal, magnitudes) is printed as the "error".
# NOTE(review): presumably the magnitude residual of `signal` — confirm in newton_method_solver.
print('error       ', newton_method.test(ans, ground_magnitude)[:10])
print('error origin', newton_method.test(ground_audio_origin, ground_magnitude)[:10])
# Hop-sized segment with index 3 of the excerpt, recovered vs. true.
print('ans    part 3', ans[0,3*hop_length:4*hop_length])
print('origin part 3', ground_audio_origin[0, 3*hop_length:4*hop_length])

hop size 16
Iter 4/5: Used times: 0.74

ans    tensor([-0.0121,  0.0050,  0.0025,  0.0092, -0.0033, -0.0117,  0.0034,  0.0084,
         0.0004, -0.0062], dtype=torch.float64)
origin tensor([-0.0032, -0.0005,  0.0052,  0.0018, -0.0036, -0.0017,  0.0049, -0.0013,
        -0.0011,  0.0027], dtype=torch.float64)
error        tensor([ 5.6638e-04, -1.0644e-05,  8.0277e-04, -1.3585e-03,  2.2861e-03,
        -1.3595e-03,  3.2977e-03,  1.0826e-03,  2.0253e-03,  1.7724e-03],
       dtype=torch.float64)
error origin tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -3.4694e-18,
         3.4694e-18,  2.1684e-18, -3.4694e-18, -3.4694e-18,  2.6021e-18],
       dtype=torch.float64)
ans    part 3 tensor([-0.0050,  0.0094,  0.0011, -0.0132,  0.0011,  0.0079, -0.0031, -0.0063,
        -0.0017,  0.0048,  0.0039, -0.0059, -0.0086,  0.0060,  0.0142, -0.0096],
       dtype=torch.float64)
origin part 3 tensor([ 2.1603e-03,  6.6019e-03, -2.1932e-03, -5.6342e-03,  2.8682e-03,
         4.9766e-03,  5.

In [28]:
# Manual residual check for one frame of the excerpt (frame index hop_iter).
hop_iter =0
start_frame = hop_iter * hop_length
# --- Solver answer: project the frame onto the two halves of the STFT basis.
M_Rc = torch.sum(forward_basis[:channels,0,:] * ans[:,start_frame: start_frame + win_length], dim =1)
M_Ic = torch.sum(forward_basis[channels:,0,:] * ans[:,start_frame: start_frame + win_length], dim =1)
# NOTE: despite its name, M_c_square is the magnitude residual
# sqrt(M_Rc^2 + M_Ic^2) minus the target magnitude of this frame, not a square.
M_c_square = torch.sqrt(M_Rc**2+M_Ic**2) - (ground_magnitude)[0,:,hop_iter]
print(M_Rc)
print(M_Ic)
print(M_c_square)
# --- Ground-truth audio: the same computation, overwriting the three variables above.
M_Rc = torch.sum(forward_basis[:channels,0,:] * ground_audio_origin[:,start_frame: start_frame + win_length], dim =1)
M_Ic = torch.sum(forward_basis[channels:,0,:] * ground_audio_origin[:,start_frame: start_frame + win_length], dim =1)
M_c_square = torch.sqrt(M_Rc**2+M_Ic**2) - (ground_magnitude)[0,:,hop_iter]
print(M_Rc)
print(M_Ic)
print(M_c_square)

tensor([-5.4868e-03, -8.6788e-03,  6.4644e-03,  1.0670e-02, -1.6795e-02,
         4.5687e-03, -1.0942e-03, -1.6698e-03, -6.8187e-03, -5.5373e-03,
        -1.1018e-02, -6.4783e-03, -7.6258e-02, -1.6787e-02, -1.1233e-01,
        -9.6085e-02, -1.2035e-01,  1.0264e-01,  1.3647e-02, -2.8443e-02,
        -1.2698e-02,  2.0808e-02,  9.7335e-03,  2.6076e-02, -1.8474e-02,
        -2.7175e-02, -2.9536e-03,  3.8386e-02, -4.7203e-02,  6.6492e-04,
         3.2361e-05, -1.7807e-03, -1.5663e-03], dtype=torch.float64)
tensor([ 0.0000, -0.0132, -0.0005, -0.0065,  0.0041,  0.0018,  0.0008, -0.0005,
        -0.0121,  0.0057, -0.0001, -0.0266, -0.0168,  0.0419, -0.0183,  0.0122,
        -0.0234, -0.0173,  0.0172, -0.0397, -0.0220, -0.0599, -0.0278, -0.0145,
        -0.0381, -0.0086, -0.0117, -0.0300, -0.0017, -0.0090,  0.0046,  0.0037,
         0.0000], dtype=torch.float64)
tensor([ 0.0006,  0.0023,  0.0020,  0.0058,  0.0141,  0.0006, -0.0003,  0.0009,
         0.0121,  0.0031, -0.0009,  0.0244,  0.0328,  

In [12]:
def load_wav_to_torch(full_path):
    """Read a WAV file and return its samples as a float64 tensor.

    Args:
        full_path: File name (or open file handle) accepted by
            ``scipy.io.wavfile.read``.

    Returns:
        Tuple ``(samples, sampling_rate)``. ``samples`` is a ``torch.float64``
        tensor with the same shape as the array returned by the reader and
        the sample values exactly as stored in the file (no rescaling);
        ``sampling_rate`` is the rate reported by the reader.
    """
    sampling_rate, data = read(full_path)
    # Convert straight to float64. Going through float32 first would round
    # int32 / float64 samples that float32 cannot represent exactly, even
    # though the result is stored in a float64 tensor anyway.
    return torch.from_numpy(data.astype(np.float64)), sampling_rate

# Reload the demo recording and recompute its STFT (rebinds audio_origin / magnitude).
audio_origin, sampling_rate = load_wav_to_torch('demo.wav')

magnitude, phase_origin = stft_fn.transform(audio_origin.unsqueeze(0))
print(magnitude.shape)

# Index (in hops) of the first hop of the 7-hop excerpt.
start_frame = 20

# Target magnitudes (frames start_frame+2 .. start_frame+5) and the matching ground-truth samples.
ground_magnitude = magnitude[:,:,start_frame+2:start_frame+6]
ground_audio_origin = audio_origin.unsqueeze(0)[:, start_frame * hop_length : (start_frame+7) * hop_length]
# Initial guess: every true sample scaled by its own random factor in [1, 2).
initial_guess_audio = ground_audio_origin + ground_audio_origin*torch.rand((1,7*hop_length))

print('hop size', hop_length)
newton_method = newton_method_solver.hop_7_solver(forward_basis, hop_length, channels, win_length)
# Unlike the Griffin-Lim-initialised run, no magnitude normalisation is applied here.
ans = newton_method.solve(ground_magnitude, initial_guess_audio, n_iters=5)
print('\n')
print('ans   ', ans[0, :10])
print('origin', ground_audio_origin[0, :10])
# newton_method.test(signal, magnitudes) is printed as the "error" (project code; semantics not visible here).
print('error       ', newton_method.test(ans, ground_magnitude)[:10])
print('error origin', newton_method.test(ground_audio_origin, ground_magnitude)[:10])
# Hop-sized segment with index 3 of the excerpt, recovered vs. true.
print('ans    part 3', ans[0,3*hop_length:4*hop_length])
print('origin part 3', ground_audio_origin[0, 3*hop_length:4*hop_length])

torch.Size([1, 33, 6161])
hop size 16
Iter 4/5: Used times: 0.89

ans    tensor([-0.0032, -0.0005,  0.0053,  0.0019, -0.0037, -0.0019,  0.0048, -0.0013,
        -0.0010,  0.0029], dtype=torch.float64)
origin tensor([-0.0032, -0.0005,  0.0052,  0.0018, -0.0036, -0.0017,  0.0049, -0.0013,
        -0.0011,  0.0027], dtype=torch.float64)
error        tensor([-2.7767e-05,  5.4979e-05,  2.2266e-05,  4.9457e-06,  1.9039e-05,
        -1.6362e-05,  6.2120e-06, -5.8554e-06, -2.4094e-05,  3.2937e-05],
       dtype=torch.float64)
error origin tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -3.4694e-18,
         3.4694e-18,  2.1684e-18, -3.4694e-18, -3.4694e-18,  2.6021e-18],
       dtype=torch.float64)
ans    part 3 tensor([ 2.1559e-03,  6.6631e-03, -1.9603e-03, -5.4287e-03,  2.9059e-03,
         4.9139e-03, -5.2472e-05, -6.8190e-03,  4.2392e-03,  4.9341e-03,
        -5.8091e-03, -2.7459e-04,  9.5751e-04,  1.0057e-03,  1.7753e-03,
        -1.7715e-03], dtype=torch.float64)
origin part 

In [24]:
# Starting point for the toy Newton demo below; `newton` updates this tensor in place.
guess = torch.tensor([2,1], dtype=torch.float64, requires_grad = True) 

# function to optimize
def my_func(x):
    """Residuals of a toy over-determined system in two unknowns.

    Evaluates, for ``x = (x0, x1)``:
        r0 = x0 - 2*x1
        r1 = x0*x1 - 1
        r2 = x0 + x1 - 1.5*sqrt(2)

    Returns:
        float64 tensor of shape (3,) holding ``(r0, r1, r2)``.
    """
    first, second = x[0], x[1]
    residuals = (
        first - 2*second,
        first*second - 1,
        first + second - np.sqrt(2)*1.5,
    )
    # Write into a preallocated float64 buffer so the output dtype does not
    # depend on the dtype of x.
    out = torch.zeros((3), dtype=torch.float64)
    for slot, value in enumerate(residuals):
        out[slot] = value
    return out

def newton(func, guess, runs=5):
    """Gauss-Newton iteration for a (possibly over-determined) system ``func(x) = 0``.

    Each step linearises ``func`` at the current point with autograd and
    solves the least-squares problem ``J * step = func(x)`` through a reduced
    QR factorisation of the Jacobian, then updates ``x <- x - step``.

    Args:
        func: Callable mapping a tensor shaped like ``guess`` to a tensor of
            residuals. It is differentiated with
            ``torch.autograd.functional.jacobian``.
        guess: Starting point (CPU tensor). It is modified IN PLACE.
        runs: Number of update steps to perform.

    Returns:
        ``guess.data`` after ``runs`` updates (the same storage as ``guess``).

    Raises:
        numpy.linalg.LinAlgError: If the triangular factor of the Jacobian is
            singular, or the system has fewer residuals than unknowns.
    """
    for _ in range(runs):
        # Differentiate the function that was passed in (not a module-level
        # global), and flatten to a 2-D (n_residuals, n_unknowns) matrix so
        # that column-vector residuals and single-unknown problems both work.
        jac = torch.autograd.functional.jacobian(func, guess)
        jac = jac.detach().reshape(-1, guess.numel()).numpy()
        residual = func(guess).detach().reshape(-1).numpy()

        # Least-squares step via reduced QR: R * step = Q^T * residual.
        Q, R = np.linalg.qr(jac, mode='reduced')
        step = np.linalg.solve(R, np.matmul(Q.T, residual))

        # Update through .data so the step itself is not tracked by autograd.
        guess.data -= torch.from_numpy(step).to(guess.dtype).reshape(guess.shape)

    return guess.data

# call starts
# Run the solver on the toy system, starting from (and overwriting) `guess`.
result = newton(my_func, guess)

# output of `result`
print(result)

tensor([1.4142, 0.7071], dtype=torch.float64)
