In [12]:
import librosa
import soundfile
import numpy as np
import scipy
import matplotlib.pyplot as plt
from librosa import feature
from librosa import display
from torchaudio.transforms import Resample
import torch
import torchaudio
import soundfile
from scipy.io import wavfile

basta = 'audio_16k/Basta_16k.wav'
################### Part A1 ###################

# Decode the clip with soundfile. `sr` is kept at module level because
# interpolate_audio() and the later write calls read it from there.
basta_song, sr = soundfile.read(basta)

# View the samples as a (1, 1, n_frames, 2) array and wrap it as a tensor.
# NOTE(review): the trailing 2 presumably means a stereo file -- confirm.
audio = torch.from_numpy(basta_song.reshape(1, 1, basta_song.shape[0], 2))

import torch
import librosa


######################## Part A: Interpolating over time ########################
def read_audio():
    """Read the Basta clip and return it as a 4-D tensor.

    Returns:
        torch.Tensor: the samples reshaped to ``(1, 1, n_frames, 2)``.
        The sample rate reported by ``soundfile.read`` is discarded.
    """
    samples, _ = soundfile.read("audio_16k/Basta_16k.wav")
    # NOTE(review): the fixed trailing 2 presumably assumes stereo input -- confirm.
    n_frames = len(samples)
    return torch.from_numpy(samples.reshape(1, 1, n_frames, 2))


def interpolate_audio(audio, scale_factor, path, sample_rate=None):
    """Resample ``audio`` along time by bilinear interpolation and save it.

    Args:
        audio: tensor shaped ``(1, 1, n_frames, n_channels)``.
        scale_factor: multiplier applied to the number of frames.
        path: destination file handed to ``soundfile.write``.
        sample_rate: rate written to the file. When omitted, the
            module-level ``sr`` is used, as before.
    """
    if sample_rate is None:
        # Backward-compatible fallback to the notebook-level sample rate.
        sample_rate = sr

    # A scalar scale_factor would be applied to BOTH trailing axes, so the
    # channel axis would be resampled too (2 -> 1 channel at 0.8, crosstalk
    # at 1.2). Scale only the time axis and leave channels untouched.
    stretched = torch.nn.functional.interpolate(
        audio, scale_factor=(scale_factor, 1.0), mode='bilinear')

    # Drop only the leading batch / dummy-channel axes; a blanket squeeze()
    # would also remove a size-1 channel or time axis.
    stretched = stretched.squeeze(0).squeeze(0)
    soundfile.write(path, stretched.detach().cpu().numpy(), sample_rate)


# Load the clip once, then write one interpolated copy per scale factor.
audio = read_audio()
for _scale, _out_path in (
    (1.2, "outputs/interpolation_1_2.wav"),
    (0.8, "outputs/interpolation_0_8.wav"),
):
    interpolate_audio(audio=audio, scale_factor=_scale, path=_out_path)

In [13]:
######################## Part B: Naive time stretch ########################


def naive_tempo_shift(wav, factor):
    """Change the duration of ``wav`` by re-synthesising its STFT at a new hop.

    The STFT is taken with ``n_fft=512`` and torch's default hop and window
    arguments. The inverse STFT is then run with a hop of
    ``int((512 / 4) / factor)``.

    Args:
        wav: waveform tensor passed to ``torch.stft``.
        factor: divisor applied to the synthesis hop length.

    Returns:
        The tensor produced by ``torch.istft``.
    """
    fft_size = 512
    base_hop = fft_size / 4
    spectrogram = torch.stft(wav, n_fft=fft_size, return_complex=True)

    # Only the synthesis hop changes; the spectrogram itself is untouched.
    synthesis_hop = int(base_hop / factor)
    return torch.istft(
        spectrogram,
        n_fft=fft_size,
        hop_length=synthesis_hop,
        return_complex=False,
    )

def read_audio2():
    """Load the Basta clip with librosa at its native sample rate.

    Returns:
        torch.Tensor: the waveform returned by ``librosa.load`` wrapped as a
        tensor. The sample rate is not returned.
    """
    file_path = "audio_16k/Basta_16k.wav"
    # sr=None keeps the file's own sample rate. librosa's default (22050 Hz)
    # would silently resample the clip, and because the rate is discarded
    # here the caller would then save the result at a mismatched rate.
    samples, _native_sr = librosa.load(file_path, sr=None)
    return torch.from_numpy(samples)


# Load the clip, run the naive tempo shift at both factors, then save each result.
audio = read_audio2()
compress, stretched = (naive_tempo_shift(audio, _factor) for _factor in (0.8, 1.2))

for _out_path, _shifted in (
    ("outputs/naive_pitch_shift_0_8.wav", compress),
    ("outputs/naive_pitch_shift_1_2.wav", stretched),
):
    soundfile.write(_out_path, _shifted, sr)

In [14]:
def construct_hann_window(win_size):
    """Build a Hann window of ``win_size`` samples via ``torch.hann_window``.

    Args:
        win_size: number of samples in the window.

    Returns:
        torch.Tensor: 1-D tensor of length ``win_size``.
    """
    window = torch.hann_window(window_length=win_size)
    return window

def get_complex_stft(signal, win_size, hop, window):
    """Compute the complex-valued STFT of ``signal``.

    Args:
        signal: waveform tensor passed to ``torch.stft``.
        win_size: FFT size (``n_fft``).
        hop: hop length between frames, in samples.
        window: analysis window tensor.

    Returns:
        torch.Tensor: complex STFT (``return_complex=True``).
    """
    spectrum = torch.stft(
        signal,
        n_fft=win_size,
        hop_length=hop,
        window=window,
        return_complex=True,
    )
    return spectrum

def get_acc_phase_delta(stft_left, stft_right):
    """Accumulate the phase advance between two STFTs along the frame axis.

    Args:
        stft_left: complex STFT with frames on the last axis
            (``torch.stft`` layout: ``(..., n_freq, n_frames)``).
        stft_right: complex STFT of the same shape.

    Returns:
        torch.Tensor: complex64 tensor with the same shape as the inputs,
        holding the unit-magnitude phasor ``exp(1j * phase)`` of the
        accumulated phase, so its real and imaginary parts are
        ``cos(phase)`` and ``sin(phase)``.
    """
    # Per-bin, per-frame angular distance: angle(stft_right) - angle(stft_left).
    phase_delta = torch.angle(stft_right) - torch.angle(stft_left)

    # phase[..., t] = phase_delta[..., t] + phase[..., t - 1], with
    # phase[..., 0] = phase_delta[..., 0]. The recursion must run over frames
    # (last axis); indexing dim 0 would accumulate across frequency bins.
    phase = torch.cumsum(phase_delta, dim=-1)

    # Wrap the ACCUMULATED phase (not the per-frame delta) into [-pi, pi].
    two_pi = 2 * torch.pi
    phase = phase - two_pi * torch.round(phase / two_pi)

    # Return a complex phasor rather than raw radians stored in a complex
    # tensor, so that taking real/imag downstream gives cos/sin of the phase.
    return torch.polar(torch.ones_like(phase), phase).to(torch.complex64)

def get_re_im_from_phase(phase):
    """Split a complex tensor into its real and imaginary parts.

    Args:
        phase: complex tensor.

    Returns:
        tuple: ``(real, imag)`` tensors with the same shape as ``phase``.
    """
    real_part = phase.real
    imag_part = phase.imag
    return real_part, imag_part

def time_stretch(signal, factor, win_size=1024, hop=1024//4):
    """Phase-vocoder style time stretch of ``signal``.

    Two STFTs are taken with a hop of ``int(hop * factor)``: one of the
    signal without its last ``hop`` samples, one of the signal without its
    first ``hop`` samples. The accumulated phase difference between them is
    combined with the magnitude of the second STFT and inverted with a hop
    of ``hop``.

    Args:
        signal: waveform tensor.
        factor: multiplier applied to ``hop`` for the analysis STFTs.
        win_size: FFT / window size.
        hop: hop length used for the inverse STFT.

    Returns:
        The tensor produced by ``torch.istft``.
    """
    window = construct_hann_window(win_size)

    # Both analysis STFTs share the scaled hop; the inputs are offset from
    # each other by `hop` samples.
    analysis_hop = int(hop * factor)
    earlier = get_complex_stft(signal[:-hop], win_size, analysis_hop, window)
    later = get_complex_stft(signal[hop:], win_size, analysis_hop, window)

    # Accumulated phase difference, split into real / imaginary components.
    acc_phase = get_acc_phase_delta(earlier, later)
    real_part, imag_part = get_re_im_from_phase(acc_phase)

    # Rebuild a complex spectrogram from those components, scaled by the
    # magnitude of the later STFT.
    phase_term = torch.view_as_complex(torch.stack((real_part, imag_part), dim=-1))
    magnitude = torch.abs(later)
    rebuilt = phase_term * magnitude

    return torch.istft(rebuilt, n_fft=win_size, hop_length=hop, window=window)

file_path = "audio_16k/Basta_16k.wav"
# sr=None loads the clip at its native sample rate. librosa's default would
# resample it to 22050 Hz and rebind the notebook-level `sr`, so the rate
# used by the other cells' writes would depend on cell execution order.
audio, sr = librosa.load(file_path, sr=None)
audio = torch.from_numpy(audio)

# Run the phase vocoder at both factors and save each result at the loaded rate.
time_stretched_0_8 = time_stretch(audio, 0.8)
soundfile.write("outputs/phase_vocoder_0_8.wav", time_stretched_0_8, sr)
time_stretched_1_2 = time_stretch(audio, 1.2)
soundfile.write("outputs/phase_vocoder_1_2.wav", time_stretched_1_2, sr)