In [1]:
import IPython.display as ipd

import numpy as np
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F

import stft
from audio_processing import griffin_lim

from scipy.io.wavfile import read

#### Setup hparams

In [2]:
class hparams_class:
    def __init__(self):
        self.temp=False
hparams = hparams_class()
hparams.sampling_rate = 22050

In [3]:
def load_wav_to_torch(full_path):
    sampling_rate, data = read(full_path)
    return torch.FloatTensor(data.astype(np.float32)), sampling_rate

audio_origin, sampling_rate = load_wav_to_torch('demo.wav')
print(sampling_rate)
audio_origin = audio_origin.unsqueeze(0)
ipd.Audio(audio_origin[0].data.cpu().numpy(), rate=hparams.sampling_rate)

22050


In [4]:
stft_fn = stft.STFT(filter_length=1024, hop_length=256, win_length=1024,
                    window='hann')


def compare(a,b):
    return torch.mean(torch.abs(a-b)), torch.mean((a-b)*(a-b))

def compare_L1(ori,gen):
    return torch.mean(torch.abs(ori-gen)/torch.abs(ori))


def compare_L2(a,b):
    return torch.sum(torch.abs(a-b)), torch.sum((a-b)*(a-b))

In [5]:
magnitude, phase_origin = stft_fn.transform(audio_origin)
print(magnitude.shape)

reconstruction = stft_fn.inverse(magnitude, phase_origin)
print(compare(reconstruction, audio_origin))
ipd.Audio(reconstruction[0].data.cpu().numpy(), rate=hparams.sampling_rate)

torch.Size([1, 513, 386])
(tensor(1.8312e-08), tensor(9.1903e-16))


# get x from Spectrogram

In [6]:


input_data = audio_origin.view(audio_origin.size(0), 1, audio_origin.size(1))
input_data = F.pad(
            input_data.unsqueeze(1),
            (int(stft_fn.filter_length / 2), int(stft_fn.filter_length / 2), 0, 0),
            mode='reflect')
input_data = input_data.squeeze(1)
input_data = input_data.numpy()
print(input_data.shape)

(1, 1, 99584)


In [7]:

class get_audio_from_spectrogram:
    
    def __init__(self,stft_fn):
        self.stft_fn = stft_fn
        # c=0,.....,255 in equation (4) could not form a full-rank matrix!
        # we use c=0,2,4,6,...,510 to get the full-rank(256) matirx and inverse
        
        self.cos_matrix = np.zeros((256,256), dtype=np.double)
        for c in range(256):
            for i in range(256):
                self.cos_matrix[i,c] = np.cos(2*np.pi*(2*c)*i/1024, dtype=np.double)
                
        # compute inverse of matrix COS
        self.inv_coefficient_solver = np.linalg.inv(self.cos_matrix)
#         self.inv_coefficient_solver = np.zeros((256,256), dtype=np.double)
#         for i in range(1,256):
#             self.inv_coefficient_solver[i,0]=(i%2)/128
#         for j in range(1,256):
#             self.inv_coefficient_solver[0,j]=(j%2)/128
    
#         for i in range(1,256):
#             for j in range(1,256):
#                 self.inv_coefficient_solver[i,j]=(np.cos(np.pi*i*j/256) + 2*((i+j)%2) -1)/128
        
        # compute how to use inverse matrix to get coefficient a-matrix
        # target_matrix: c=1,3,5,7,9,....,511,512
        self.target_matrix = np.zeros((256,257), dtype=np.double)
        for c in range(256):
            for i in range(256):
                self.target_matrix[i,c] = np.cos(2*np.pi*(2*c+1)*i/1024, dtype=np.double)
        for i in range(256):
            self.target_matrix[i,256] = np.cos(2*np.pi*(512)*i/1024, dtype=np.double)
        self.coefficient_a_matrix = np.matmul(self.inv_coefficient_solver, self.target_matrix)  #shape [256,257]
        
    def calculate_part_audio(self, magnitude, prev768):
        # magnitude shape: [1, 513]
        # prev_768  shape: [1, 768]
        # output: guess next 256. shape [256]
        
        R_Tc = torch.sum(self.stft_fn.forward_basis[:513,0,0:768]  * prev768, dim=1)
        I_Tc = torch.sum(self.stft_fn.forward_basis[513:,0,0:768]  * prev768, dim=1)
        M_Tc = magnitude[0,:]
        # R_Tc, I_Tc, M_Tc shape: [513]
        
        Constant_origin_equation = R_Tc**2 + I_Tc**2 - M_Tc**2
        Constant_origin_equation = Constant_origin_equation.unsqueeze(1)
        # Constant shape: [513, 1]
        
        A_origin_equation = 2 * R_Tc.unsqueeze(1) * self.stft_fn.forward_basis[:513,0,768:] + \
                            2 * I_Tc.unsqueeze(1) * self.stft_fn.forward_basis[513:,0,768:]
        # A shape: [513, 256]
        
        base_constant = Constant_origin_equation[:512:2,:] #shape [256,1]
        base_A        = A_origin_equation[:512:2,:]        #shape [256,256]
        
        constant = np.zeros((257,1), dtype=np.double)
        A        = np.zeros((257,256), dtype=np.double) 
        constant[:256,:] = Constant_origin_equation[1:512:2,:]
        A[:256,:]        = A_origin_equation[1:512:2,:]
        constant[256,:] = Constant_origin_equation[512,:]
        A[256,:]        = A_origin_equation[512,:]
        
        constant = constant - np.matmul(self.coefficient_a_matrix.T, base_constant.numpy())
        A        = A        - np.matmul(self.coefficient_a_matrix.T, base_A.numpy())
#         test_result= constant + np.matmul(A,test.T)
#         print(constant)
#         print(test_result.T)
#         print(test)
        
        # solve constant + Ax=0
        # solve Ax= -constant
        Q, R = np.linalg.qr(A, mode='reduced')
        #print(Q.shape, R.shape)
        Qb = np.matmul(Q.T,-constant)
        result = np.linalg.solve(R,Qb)
        
        # result shape: [256,1]
        return result
    

In [8]:

TEST_T = 5
START = TEST_T*stft_fn.hop_length
solution = get_audio_from_spectrogram(stft_fn)

result = solution.calculate_part_audio(magnitude[:,:,TEST_T], input_data[0,:,START:START+768], input_data[0,:,START+768:START+1024])
print(result.shape)
print(result.squeeze(1)[:20])
print(input_data[0,0,START+768:START+1024][:20])

TypeError: calculate_part_audio() takes 3 positional arguments but 4 were given

In [None]:
generate = np.zeros(input_data.shape)
generate[0,0,:768] = input_data[0,0,:768]

def loss_l1(a,b):
    return np.sum(np.abs(a-b))
    

for i in range(10):
    START = 256*i
    result = solution.calculate_part_audio(magnitude[:,:,i], generate[0,:,START:START+768])
    result = result
    generate[0,0,START+768:START+1024] = result[:,0]
    
    print(loss_l1(result[:,0], input_data[0,0,START+768:START+1024]))


In [None]:
generate = np.zeros(input_data.shape)
generate[0,0,:768] = np.round(input_data[0,0,:768], decimals=3)
print(loss_l1(generate[0,0,:768], input_data[0,0,:768]))

def loss_l1(a,b):
    return np.sum(np.abs(a-b))
    

for i in range(10):
    START = 256*i
    result = solution.calculate_part_audio(magnitude[:,:,i], generate[0,:,START:START+768])
    result = np.round(result, decimals=3)
    generate[0,0,START+768:START+1024] = result[:,0]
    
    print(loss_l1(result[:,0], input_data[0,0,START+768:START+1024]))


In [9]:
def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    """

    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    angles = torch.autograd.Variable(torch.from_numpy(angles))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for i in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal

audio_griffin_lim = griffin_lim(magnitude, stft_fn)
print(audio_griffin_lim.shape)
audio_griffin_lim_numpy = audio_griffin_lim.data.cpu().numpy() 

ipd.Audio(audio_griffin_lim[0].data.cpu().numpy(), rate=hparams.sampling_rate)


torch.Size([1, 98560])


In [10]:


generate = np.zeros(input_data.shape)
generate[0,0,:768] = audio_griffin_lim_numpy[0,:768]

for i in range(300):
    START = 256*i
    result = solution.calculate_part_audio(magnitude[:,:,i], audio_griffin_lim_numpy[:,START:START+768])
    generate[0,0,START+768:START+1024] = result[:,0]
    

ipd.Audio(generate[0,0,:], rate=hparams.sampling_rate)

In [11]:


generate = np.zeros(input_data.shape)
generate[0,0,:768] = audio_griffin_lim_numpy[0,:768]

for i in range(1):
    START = 256*i
    result = solution.calculate_part_audio(magnitude[:,:,i], audio_griffin_lim_numpy[:,START:START+768])
    generate[0,0,START+768:START+1024] = result[:,0]
    
print(audio_griffin_lim_numpy[0,768:768+100])
print(generate[0,0,768:768+100])

[ 1.14124427e-02 -2.91190553e-03  1.56146439e-03 -2.00467021e-03
  1.55488658e-03  1.15136243e-02 -2.26772781e-02  1.53932506e-02
  5.82800433e-03 -5.83794760e-03 -6.88909227e-03 -4.67700447e-04
  2.40725391e-02 -1.68732926e-02 -8.58729240e-03  8.46820977e-03
  7.94036966e-03 -1.44876179e-03 -8.22840817e-03  3.80222383e-03
  1.73907680e-03 -6.88750995e-04 -2.18108227e-03  9.97956842e-03
 -1.20214617e-03 -9.90671199e-03 -4.96619214e-05  1.40863955e-02
  7.07171785e-05 -9.21646412e-03 -1.39537535e-03  7.51362741e-03
  3.63035686e-03 -1.30337263e-02  9.59113520e-03  5.12953475e-03
 -7.52981799e-03  3.38933058e-03 -7.63671612e-03  4.27691685e-03
  1.57672912e-02 -9.69038624e-03 -1.51696019e-02  7.13224383e-03
  1.50438314e-02 -5.31373359e-03 -1.33973509e-02  2.27567437e-03
  8.20069760e-03 -7.07079525e-05 -4.59697004e-03 -3.45796184e-03
  3.91599862e-03 -2.95114750e-03  7.21413456e-03 -1.91109837e-04
 -1.09074330e-02  9.80657805e-03 -2.21806113e-03 -1.01215707e-03
  1.19248340e-02 -2.08413