In [1]:
import numpy as np
import math
from PIL import Image
import time

# Packages we're using
import numpy as np
import matplotlib.pyplot as plt
import copy
from scipy.io import wavfile
from scipy.signal import butter, lfilter
import scipy.ndimage
from tqdm import tqdm

%matplotlib inline
import IPython.display
from ipywidgets import interact, interactive, fixed

# Packages we're using
import numpy as np
import matplotlib.pyplot as plt
import copy
from scipy.io import wavfile
from scipy.signal import butter, lfilter
import scipy.ndimage

import glob
import os
from tqdm import tqdm
# plt, plot, tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
FFT_LENGTH = 512
WINDOW_LENGTH = 512
WINDOW_STEP = int(WINDOW_LENGTH / 2)
phaseMax = 3.141592653589793 
phaseMin = -3.141592653589793
magnitudeMax = 3212111.6378762764 # 5432292.907520762
magnitudeMin = 0.0

In [3]:
def amplifyMagnitudeByLog(d):
    return 188.301 * math.log10(d + 1)

def weakenAmplifiedMagnitude(d):
    return math.pow(10, d/188.301)-1

def generateLinearScale(magnitudePixels, phasePixels, 
                        magnitudeMin, magnitudeMax, phaseMin, phaseMax):
    height = magnitudePixels.shape[0]
    width = magnitudePixels.shape[1]
    magnitudeRange = magnitudeMax - magnitudeMin
    phaseRange = phaseMax - phaseMin
    rgbArray = np.zeros((height, width, 3), 'uint8')
    
    for w in range(width):
        for h in range(height):
            magnitudePixels[h,w] = (magnitudePixels[h,w] - magnitudeMin) / (magnitudeRange) * 255 * 2
            magnitudePixels[h,w] = amplifyMagnitudeByLog(magnitudePixels[h,w])
            phasePixels[h,w] = (phasePixels[h,w] - phaseMin) / (phaseRange) * 255
            red = 255 if magnitudePixels[h,w] > 255 else magnitudePixels[h,w]
            green = (magnitudePixels[h,w] - 255) if magnitudePixels[h,w] > 255 else 0
            blue = phasePixels[h,w]
            rgbArray[h,w,0] = int(red)
            rgbArray[h,w,1] = int(green)
            rgbArray[h,w,2] = int(blue)
    return rgbArray

def recoverLinearScale(rgbArray, magnitudeMin, magnitudeMax, 
                       phaseMin, phaseMax):
    width = rgbArray.shape[1]
    height = rgbArray.shape[0]
    magnitudeVals = rgbArray[:,:,0].astype(float) + rgbArray[:,:,1].astype(float)
    phaseVals = rgbArray[:,:,2].astype(float)
    phaseRange = phaseMax - phaseMin
    magnitudeRange = magnitudeMax - magnitudeMin
    for w in range(width):
        for h in range(height):
            phaseVals[h,w] = (phaseVals[h,w] / 255 * phaseRange) + phaseMin
            magnitudeVals[h,w] = weakenAmplifiedMagnitude(magnitudeVals[h,w])
            magnitudeVals[h,w] = (magnitudeVals[h,w] / (255*2) * magnitudeRange) + magnitudeMin
    return magnitudeVals, phaseVals

In [4]:
def generateSpectrogramForWave(signal):
    start_time = time.time()
    global magnitudeMin
    global magnitudeMax
    global phaseMin
    global phaseMax
    buffer = np.zeros(int(signal.size + WINDOW_STEP - (signal.size % WINDOW_STEP)))
    buffer[0:len(signal)] = signal
    height = int(FFT_LENGTH / 2 + 1)
    width = int(len(buffer) / (WINDOW_STEP) - 1)
    magnitudePixels = np.zeros((height, width))
    phasePixels = np.zeros((height, width))

    for w in range(width):
        buff = np.zeros(FFT_LENGTH)
        stepBuff = buffer[w*WINDOW_STEP:w*WINDOW_STEP + WINDOW_LENGTH]
        # apply hanning window
        stepBuff = stepBuff * np.hanning(WINDOW_LENGTH)
        buff[0:len(stepBuff)] = stepBuff
        #buff now contains windowed signal with step length and padded with zeroes to the end
        fft = np.fft.rfft(buff)
        for h in range(len(fft)):
            #print(buff.shape)
            #return
            magnitude = math.sqrt(fft[h].real**2 + fft[h].imag**2)
            if magnitude > magnitudeMax:
                magnitudeMax = magnitude 
            if magnitude < magnitudeMin:
                magnitudeMin = magnitude 

            phase = math.atan2(fft[h].imag, fft[h].real)
            if phase > phaseMax:
                phaseMax = phase
            if phase < phaseMin:
                phaseMin = phase
            magnitudePixels[height-h-1,w] = magnitude
            phasePixels[height-h-1,w] = phase
    rgbArray = generateLinearScale(magnitudePixels, phasePixels,
                                  magnitudeMin, magnitudeMax, phaseMin, phaseMax)
    
    
    elapsed_time = time.time() - start_time
    print('%.2f' % elapsed_time, 's', sep='')
    img = Image.fromarray(rgbArray, 'RGB')
    return img

In [5]:
def recoverSignalFromSpectrogram(numpyarray):
#     img = Image.open(filePath)
    data = np.array( numpyarray, dtype='uint8' )
    width = data.shape[1]
    height = data.shape[0]

    magnitudeVals, phaseVals \
    = recoverLinearScale(data, magnitudeMin, magnitudeMax, phaseMin, phaseMax)
    
    recovered = np.zeros(WINDOW_LENGTH * width // 2 + WINDOW_STEP, dtype=np.int16)
    recovered = np.array(recovered,dtype=np.int16)
    
    for w in range(width):
        toInverse = np.zeros(height, dtype=np.complex_)
        for h in range(height):
            magnitude = magnitudeVals[height-h-1,w]
            phase = phaseVals[height-h-1,w]
            toInverse[h] = magnitude * math.cos(phase) + (1j * magnitude * math.sin(phase))
        signal = np.fft.irfft(toInverse)
        recovered[w*WINDOW_STEP:w*WINDOW_STEP + WINDOW_LENGTH] += signal[:WINDOW_LENGTH].astype(np.int16)
    return recovered

In [6]:
rate = 16000

In [7]:
print("phaseMax {} phaseMin {} magnitudeMax {} magnitudeMin {}".format(phaseMax,phaseMin,magnitudeMax,magnitudeMin))

phaseMax 3.141592653589793 phaseMin -3.141592653589793 magnitudeMax 3212111.6378762764 magnitudeMin 0.0


In [8]:
path_Voice = './Voice'

In [9]:
 # Imports
from scipy.io import wavfile
import scipy.signal as sps

# Your new sampling rate
new_rate = 16000

for filename in tqdm(glob.glob(os.path.join(path_Voice, '*.wav'))):
    sampling_rate, data = wavfile.read(filename)
    number_of_samples = round(len(data) * float(new_rate) / sampling_rate)
    data = sps.resample(data, number_of_samples)
    data = np.asarray(data, dtype=np.int16)
    wavfile.write(filename,new_rate,data)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.28it/s]


In [10]:
# Read All the lenght of wav file - Process it for 1 sec and queue up for prediction

In [11]:
# Implement

In [12]:
from pydub import AudioSegment
from pydub.utils import make_chunks

In [13]:
filename = glob.glob(os.path.join(path_Voice, '*.wav'))
print(filename)

['./Voice\\Voice 002.wav']


In [14]:
for filename in tqdm(glob.glob(os.path.join(path_Voice, '*.wav'))):
    myaudio = AudioSegment.from_file(filename) 
    chunk_length_ms = 1000 # pydub calculates in millisec
    chunks = make_chunks(myaudio, chunk_length_ms) #Make chunks of one sec
    
    for i, chunk in enumerate(chunks):
        
        chunk_name = "chunk{0}.wav".format(i)
        name = path_Voice+'/'+chunk_name
        
        chunk.export(name, format="wav")
        
        rate, data = wavfile.read(path_Voice+'/'+chunk_name)
        
        
        if len(data.shape) >= 2 and data.size > 0:
            if data.shape[-1] > 1:
                data = data.mean(axis=-1)
            else:
                data = np.reshape(data, data.shape[:-1])
        img = generateSpectrogramForWave(data)
        np.save(path_Voice+'/'+chunk_name[:-4]+'.npy', img) # save

  0%|                                                                                                                                                                                     | 0/1 [00:00<?, ?it/s]

0.19s


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.90it/s]

0.19s
0.13s





In [15]:
import tensorflow as tf
from tensorflow import keras

In [16]:
new_model = tf.keras.models.load_model('model.h5')

# Check its architecture
new_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 257, 62, 3)] 0                                            
__________________________________________________________________________________________________
block_1_conv (Conv2D)           (None, 257, 62, 64)  1792        input_2[0][0]                    
__________________________________________________________________________________________________
block_1_lrelu (LeakyReLU)       (None, 257, 62, 64)  0           block_1_conv[0][0]               
__________________________________________________________________________________________________
block_1_drop (Dropout)          (None, 257, 62, 64)  0           block_1_lrelu[0][0]              
____________________________________________________________________________________________

In [17]:
ROW = 257
COL = 62

test_pred = []

In [19]:
count = 0

for filename in tqdm(glob.glob(os.path.join(path_Voice, '*.npy'))):
    img_test = np.load(filename)
    row_,col_,_ = img_test.shape
    
    if col_ < COL:
        break
    
    print(img_test.shape)
    
    img_test = img_test/255
    img_test = img_test.reshape(-1, ROW,COL,3)
    decoded_imgs = new_model.predict(img_test) #predict
    decoded_imgs = decoded_imgs.reshape(ROW,COL,3)
    decoded_imgs = decoded_imgs*255
    decoded_imgs = decoded_imgs.astype(np.int16)
    data = recoverSignalFromSpectrogram(decoded_imgs)
    file = './'+"testpred_{}".format(count)+'.wav'
    scipy.io.wavfile.write(file, rate, data)
    test_pred.append(file)
    count = count+1

In [None]:
test_pred

In [None]:
from pydub import AudioSegment

In [None]:
sound = 0

In [None]:
for i in range(len(test_pred)):
    print(test_pred[i])
    sound += AudioSegment.from_wav(test_pred[i])
sound.export("./soundJoined.wav", format="wav")