# STFT based Wiener filter with forgetting factor

In [None]:
import helper
import paths
import processing
import algorithms

import librosa
import librosa.core as lc
import matplotlib.pyplot as plt
import numpy as np
import scipy.signal as sc
from scipy.io.wavfile import write
import IPython

%reload_ext autoreload
%autoreload 2

# Case 1: Delayed inputs, speech on top

In [None]:
# Base names of the recordings used by the cells below; each is passed to
# helper.load_separate_channels to load the audio.
filename = 'ladida'  # noise recording
speech_filename = 'alexa'  # speech recording

# Noise signal

In [None]:
# Load the noise recording as two separate channels, plus the rate later
# used for playback.
signal_0, signal_1, fs = helper.load_separate_channels(filename)
# Last expression of the cell: builds the audio player for channel 0.
IPython.display.Audio(data=signal_0, rate=fs)

# Speech signal

In [None]:
# Load the speech recording; only the first returned channel is kept.
# NOTE(review): this rebinds `fs` to the speech file's rate; later cells reuse
# `fs` for the noise signals, which presumably assumes both files share the
# same sampling rate — confirm.
speech_signal, _, fs = helper.load_separate_channels(speech_filename)
# Last expression of the cell: builds the audio player for the speech.
IPython.display.Audio(data=speech_signal, rate=fs)

First, we load the recording and split it into two signals, one per channel.

Then we apply a **delay** similar to the one we would roughly have if the two microphones were separated by a certain distance (`6cm` here) by taking into account the sampling frequency (`32000 Hz` here).

On top of that, we add a **speech signal** at the same position for both inputs.

The current scenario represents a simple case in which the two inputs are only delayed relative to each other, the speech signal has no delay (we suppose that the speaker is in front of both microphones), and the same channel is applied to both (which is obviously not a realistic case).

In [None]:
# Build the simulated two-microphone setup from channel 0 of the noise.
# 0.06 is presumably the microphone spacing in metres (the text above says
# 6 cm) — confirm against processing.add_delay. Both signals are rebound, so
# the recorded signal_1 is no longer used from here on.
signal_0, signal_1 = processing.add_delay(0.06, fs, signal_0)
# Mix the speech into both signals. Use speech_filename (set in the
# configuration cell) rather than repeating the literal name, so the speech
# mixed here always matches the one loaded and played above.
signal_0, signal_1 = processing.add_speech(
    speech_filename, signal_0, signal_1, amp=1, position='mid'
)
helper.plot_signals(
    signal_0,
    signal_1,
    'The two input signals after preparing the setup',
    'signal_0',
    'signal_1',
)

In [None]:
# Audio player for signal_0 after the setup cell (noise with speech added).
IPython.display.Audio(data=signal_0, rate=fs)

Next we compute the **STFT** for the two signals, with a pre-chosen `window_size` and `hop_percentage` which is the percentage of `window_size` we want to jump over for each consecutive frame.

## Short Time Fourier Transform

In [None]:
# STFT of both signals with the same framing parameters.
# hop_percentage is the share of window_size skipped between consecutive
# frames (50 % of 4096 samples here). Of the returned values, the cells
# visible in this notebook only use Zxx_0, Zxx_1 and hop_length.
Zxx_0, Zxx_1, n_freqs, n_frames, hop_length = helper.dual_stft(
    signal_0,
    signal_1,
    window_size=4096,
    hop_percentage=50,
)

## Run the algorithm

Now we can run the **STFT based Wiener filter with forgetting factor** algorithm with the parameters we choose.

In [None]:
# Run the STFT-domain Wiener filter on the two spectra; the result is used
# below as an estimate of Zxx_1.
# alpha is presumably the forgetting factor named in the notebook title —
# confirm in algorithms.stft_wiener_filter.
Zxx_1_estimate = algorithms.stft_wiener_filter(Zxx_0, Zxx_1, alpha=0.008)

In [None]:
# Residual spectrum: what is left of channel 1 once the filter's estimate is
# subtracted. This residual is treated as the output signal below.
epsilon = Zxx_1 - Zxx_1_estimate
# Back to the time domain with the hop length reported by helper.dual_stft.
# NOTE(review): assumes helper.dual_stft produces librosa-compatible STFTs
# (same window/scaling conventions as librosa's istft) — confirm.
# NOTE(review): no `length=` is passed, so epsilon_time may not have exactly
# the same number of samples as the input signals.
epsilon_time = lc.istft(epsilon, hop_length=hop_length)

In [None]:
# Compare signal_0 (labelled 'input') with the time-domain residual
# (labelled 'output').
helper.plot_signals(
    signal_0,
    epsilon_time,
    'The input and output signals',
    'input',
    'output',
)

Below are the spectrograms of the input signal and the output signal. In the second plot, you should see the magnitude drop after `d` frames, where `d` is the number of frames required to analyse the noise before the signal is filtered with the delayed (deferred) coefficients.

In [None]:
# Spectrograms of the channel-0 STFT and of the residual STFT.
# The third argument (1/8) is forwarded as-is; its meaning is defined by
# helper.spectrogram.
helper.spectrogram(Zxx_0, epsilon, 1 / 8)

In [None]:
# Audio player for the filtered output (time-domain residual).
IPython.display.Audio(data=epsilon_time, rate=fs)

# Case 2: Delayed inputs, speech on top, filtering with 2 Room Impulse Responses

# Noise signal

In [None]:
# Reload the untouched noise recording so Case 2 starts from the raw channels.
signal_0, signal_1, fs = helper.load_separate_channels(filename)
# Last expression of the cell: builds the audio player for channel 0.
IPython.display.Audio(data=signal_0, rate=fs)

# Speech signal

In [None]:
# Load the speech recording; only the first returned channel is kept.
# NOTE(review): this rebinds `fs` to the speech file's rate; later cells reuse
# `fs` for the noise signals, which presumably assumes both files share the
# same sampling rate — confirm.
speech_signal, _, fs = helper.load_separate_channels(speech_filename)
# Last expression of the cell: builds the audio player for the speech.
IPython.display.Audio(data=speech_signal, rate=fs)

We apply a **delay** similar to the one we would roughly have if the two microphones were separated by a certain distance (`6cm` here) by taking into account the sampling frequency (`32000 Hz` here).

On top of that, we add a **speech signal** at the same position for both inputs.

Finally, we filter the first signal (the one which has "no delay" introduced) with a `close_mic_rir` **Room Impulse Response** (RIR), simulating a close microphone, and the second signal with a `far_mic_rir` **Room Impulse Response** (RIR), simulating a far microphone.

In [None]:
# Build the simulated setup: delayed noise, speech on top, then one room
# impulse response per microphone.
# 0.06 is presumably the microphone spacing in metres (the text above says
# 6 cm) — confirm against processing.add_delay.
signal_0, signal_1 = processing.add_delay(0.06, fs, signal_0)
# Use speech_filename (set in the configuration cell) rather than repeating
# the literal name, so the speech mixed here always matches the one loaded
# and played above.
signal_0, signal_1 = processing.add_speech(
    speech_filename, signal_0, signal_1, amp=1, position='mid'
)
# Apply a different RIR to each signal: the close-microphone response to
# signal_0 and the far-microphone response to signal_1.
# 44100 is presumably the sampling rate of the stored RIRs, resampled to fs —
# confirm against processing.rir_filter.
signal_0 = processing.rir_filter(
    'close_mic_rir', 44100, signal_0, resampling_f=fs
)
signal_1 = processing.rir_filter(
    'far_mic_rir', 44100, signal_1, resampling_f=fs
)
helper.plot_signals(
    signal_0,
    signal_1,
    'The two input signals after preparing the setup',
    'signal_0',
    'signal_1',
)

In [None]:
# Audio player for signal_1 after the setup cell.
IPython.display.Audio(data=signal_1, rate=fs)

## Short Time Fourier Transform

In [None]:
# STFT of both signals with the same framing parameters: 8192-sample windows,
# hopping by 25 % of the window between consecutive frames.
Zxx_0, Zxx_1, n_freqs, n_frames, hop_length = helper.dual_stft(
    signal_0,
    signal_1,
    window_size=8192,
    hop_percentage=25,
)

## Run the algorithm

In [None]:
# Run the STFT-domain Wiener filter on the two spectra; the result is used
# below as an estimate of Zxx_1.
# alpha is presumably the forgetting factor named in the notebook title —
# confirm in algorithms.stft_wiener_filter. 8e-5 is the same value as 0.00008.
Zxx_1_estimate = algorithms.stft_wiener_filter(Zxx_0, Zxx_1, alpha=8e-5)

In [None]:
# Residual spectrum: what is left of channel 1 once the filter's estimate is
# subtracted. This residual is treated as the output signal below.
epsilon = Zxx_1 - Zxx_1_estimate
# Back to the time domain with the hop length reported by helper.dual_stft.
# NOTE(review): assumes helper.dual_stft produces librosa-compatible STFTs
# (same window/scaling conventions as librosa's istft) — confirm.
# NOTE(review): no `length=` is passed, so epsilon_time may not have exactly
# the same number of samples as the input signals.
epsilon_time = lc.istft(epsilon, hop_length=hop_length)

In [None]:
# Compare signal_0 (labelled 'input') with the time-domain residual
# (labelled 'output').
helper.plot_signals(
    signal_0,
    epsilon_time,
    'The input and output signals',
    'input',
    'output',
)

In [None]:
# Spectrograms of the channel-0 STFT and of the residual STFT.
# The third argument (1/8) is forwarded as-is; its meaning is defined by
# helper.spectrogram.
helper.spectrogram(Zxx_0, epsilon, 1 / 8)

In [None]:
# Audio player for the filtered output (time-domain residual).
IPython.display.Audio(data=epsilon_time, rate=fs)

# Case 3: Original signals from the recording, and speech on top

# Noise signal

In [None]:
# Reload the untouched noise recording; Case 3 keeps both recorded channels.
signal_0, signal_1, fs = helper.load_separate_channels(filename)
# Last expression of the cell: builds the audio player for channel 0.
IPython.display.Audio(data=signal_0, rate=fs)

# Speech signal

In [None]:
# Load the speech recording; only the first returned channel is kept.
# NOTE(review): this rebinds `fs` to the speech file's rate; later cells reuse
# `fs` for the noise signals, which presumably assumes both files share the
# same sampling rate — confirm.
speech_signal, _, fs = helper.load_separate_channels(speech_filename)
# Last expression of the cell: builds the audio player for the speech.
IPython.display.Audio(data=speech_signal, rate=fs)

This time, we do not touch the original noise, and add the speech on top.

In [None]:
# Mix the speech into both recorded channels, leaving the noise untouched.
# Use speech_filename (set in the configuration cell) rather than repeating
# the literal name, so the speech mixed here always matches the one loaded
# and played above.
signal_0, signal_1 = processing.add_speech(
    speech_filename, signal_0, signal_1, amp=1, position='mid'
)

In [None]:
# Plot both channels of the current setup.
helper.plot_signals(
    signal_0,
    signal_1,
    'The two input signals',
    'signal_0',
    'signal_1',
)

In [None]:
# Audio player for signal_1.
IPython.display.Audio(data=signal_1, rate=fs)

## Short Time Fourier Transform

In [None]:
# STFT of both signals with the same framing parameters: 8192-sample windows,
# hopping by 25 % of the window between consecutive frames.
Zxx_0, Zxx_1, n_freqs, n_frames, hop_length = helper.dual_stft(
    signal_0,
    signal_1,
    window_size=8192,
    hop_percentage=25,
)

## Run the algorithm

In [None]:
# Run the STFT-domain Wiener filter on the two spectra; the result is used
# below as an estimate of Zxx_1.
# alpha is presumably the forgetting factor named in the notebook title —
# confirm in algorithms.stft_wiener_filter.
Zxx_1_estimate = algorithms.stft_wiener_filter(Zxx_0, Zxx_1, alpha=0.008)

In [None]:
# Residual spectrum: what is left of channel 1 once the filter's estimate is
# subtracted. This residual is treated as the output signal below.
epsilon = Zxx_1 - Zxx_1_estimate
# Back to the time domain with the hop length reported by helper.dual_stft.
# NOTE(review): assumes helper.dual_stft produces librosa-compatible STFTs
# (same window/scaling conventions as librosa's istft) — confirm.
# NOTE(review): no `length=` is passed, so epsilon_time may not have exactly
# the same number of samples as the input signals.
epsilon_time = lc.istft(epsilon, hop_length=hop_length)

In [None]:
# Compare signal_0 (labelled 'input') with the time-domain residual
# (labelled 'output').
helper.plot_signals(
    signal_0,
    epsilon_time,
    'The input and output signals',
    'input',
    'output',
)

In [None]:
# Spectrograms of the channel-0 STFT and of the residual STFT.
# The third argument (1/8) is forwarded as-is; its meaning is defined by
# helper.spectrogram.
helper.spectrogram(Zxx_0, epsilon, 1 / 8)

In [None]:
# Audio player for the filtered output (time-domain residual).
IPython.display.Audio(data=epsilon_time, rate=fs)

# Case 4: Original signals from the recording containing the speech signal

# Noise signal

In [None]:
# Switch to the recording that already contains the speech.
# This rebinds the notebook-wide `filename`, so re-running earlier cases
# afterwards requires re-running the configuration cell first.
filename = 'ladida_real'
signal_0, signal_1, fs = helper.load_separate_channels(filename)
# Last expression of the cell: builds the audio player for channel 0.
IPython.display.Audio(data=signal_0, rate=fs)

This time, the signals already contain the speech signal. Everything was recorded together.

In [None]:
# Plot both recorded channels.
helper.plot_signals(
    signal_0,
    signal_1,
    'The two input signals',
    'signal_0',
    'signal_1',
)

In [None]:
# Audio player for signal_1.
IPython.display.Audio(data=signal_1, rate=fs)

## Short Time Fourier Transform

In [None]:
# STFT of both signals with the same framing parameters: 32000-sample
# windows, hopping by 10 % of the window between consecutive frames.
Zxx_0, Zxx_1, n_freqs, n_frames, hop_length = helper.dual_stft(
    signal_0,
    signal_1,
    window_size=32000,
    hop_percentage=10,
)

## Run the algorithm

In [None]:
# Run the STFT-domain Wiener filter on the two spectra; the result is used
# below as an estimate of Zxx_1.
# alpha is presumably the forgetting factor named in the notebook title —
# confirm in algorithms.stft_wiener_filter. 1e-7 is the same value as
# 0.0000001.
Zxx_1_estimate = algorithms.stft_wiener_filter(Zxx_0, Zxx_1, alpha=1e-7)

In [None]:
# Residual spectrum: what is left of channel 1 once the filter's estimate is
# subtracted. This residual is treated as the output signal below.
epsilon = Zxx_1 - Zxx_1_estimate
# Back to the time domain with the hop length reported by helper.dual_stft.
# NOTE(review): assumes helper.dual_stft produces librosa-compatible STFTs
# (same window/scaling conventions as librosa's istft) — confirm.
# NOTE(review): no `length=` is passed, so epsilon_time may not have exactly
# the same number of samples as the input signals.
epsilon_time = lc.istft(epsilon, hop_length=hop_length)

In [None]:
# Compare signal_0 (labelled 'input') with the time-domain residual
# (labelled 'output').
helper.plot_signals(
    signal_0,
    epsilon_time,
    'The input and output signals',
    'input',
    'output',
)

In [None]:
# Spectrograms of the channel-0 STFT and of the residual STFT.
# The third argument (1/8) is forwarded as-is; its meaning is defined by
# helper.spectrogram.
helper.spectrogram(Zxx_0, epsilon, 1 / 8)

In [None]:
# Audio player for the filtered output (time-domain residual).
IPython.display.Audio(data=epsilon_time, rate=fs)