# Speech Recognition I

## Setup

In [None]:
%%bash
# Download the fixtures used by later cells into week_04_asr_files/.
target_dir=week_04_asr_files
base_url=https://raw.githubusercontent.com/yandexdataschool/speech_course/main/week_04_asr
mkdir -p "$target_dir"
for name in test_matrix.txt alphas.txt betas.txt soft_alignment.txt h035_Bar_LargeSportsBar_5.wav; do
    wget -O "$target_dir/$name" -q "$base_url/$name"
done

In [None]:
import math
import os
import string

from typing import List

import editdistance
import librosa
import numpy as np
import torch
import torch.nn.functional as F
import torchaudio

from IPython import display
from matplotlib import pyplot as plt
from torch import distributions
from torch import nn

week_04_asr_files = 'week_04_asr_files'

## Metrics
### WER (Word Error Rate)
WER measures the difference between the reference transcript and the output transcript in terms of words. It is calculated as the minimum number of insertions, deletions, and substitutions required to transform the output into the reference, divided by the total number of words in the reference.

### CER (Character Error Rate)
CER, similar to WER, measures the error rate but at the character level. It calculates the minimum number of insertions, deletions, and substitutions required to transform the output into the reference, divided by the total number of characters in the reference.

### Implement WER and CER Calculation
Your task is to implement functions to calculate WER and CER using the editdistance library. Assume that both the reference and output transcripts are strings.

In [None]:
def calculate_wer(reference: str, hypothesis: str) -> float:
    """Calculates the Word Error Rate (WER).

    WER is the minimum number of word-level insertions, deletions, and
    substitutions required to transform the hypothesis into the reference,
    divided by the total number of words in the reference.

    NOTE: this is an exercise stub. Until the placeholder below is replaced
    with code that assigns ``wer``, calling the function raises ``NameError``.

    Args:
        reference: The reference transcript.
        hypothesis: The output transcript.

    Returns:
        The WER as a float. The checks in the next cell expect values above
        1.0 to be possible (3 / 2 for a hypothesis with many extra words),
        so the result must not be clipped to [0, 1].
    """
    # YOUR CODE

    return wer


def calculate_cer(reference: str, hypothesis: str) -> float:
    """Calculates the Character Error Rate (CER).

    CER is the character-level counterpart of WER: the minimum number of
    character insertions, deletions, and substitutions required to transform
    the hypothesis into the reference, divided by the total number of
    characters in the reference.

    NOTE: this is an exercise stub. Until the placeholder below is replaced
    with code that assigns ``cer``, calling the function raises ``NameError``.

    Args:
        reference: The reference transcript.
        hypothesis: The output transcript.

    Returns:
        The CER as a float. The checks in the next cell expect values above
        1.0 to be possible (6 / 5 for a hypothesis with many extra
        characters), so the result must not be clipped to [0, 1].
    """
    # YOUR CODE

    return cer


In [None]:
assert calculate_wer('hello world', 'hello world') == 0.0
assert calculate_wer('hello world', 'hello') == 1 / 2
assert calculate_wer('hello world', '') == 1.0
assert calculate_wer('hello world', 'hello hello hello hello') == 3 / 2

assert calculate_cer('hello', 'hello') == 0.0
assert calculate_cer('hello', 'hell') == 1 / 5
assert calculate_cer('hello', 'he') == 3 / 5
assert calculate_cer('hello', 'h') == 4 / 5
assert calculate_cer('hello', '') == 1.0
assert calculate_cer('hello', 'hhhhelloooo') == 6 / 5

## Sound Augmentations

In [None]:
def visualize_audio(wav: torch.Tensor, sr: int = 16000):
    """Plots a waveform against time in seconds and shows an audio player for it.

    Args:
        wav: Audio samples; the first dimension is treated as the time axis.
        sr: Sampling rate in Hz, used for both the time axis and the playback.
    """
    # Convert sample indices to seconds so that the x-axis really shows time
    # and plots of signals with different sampling rates are comparable.
    seconds = (torch.arange(wav.shape[0]) / sr).numpy()

    plt.figure(figsize=(12, 4))
    plt.plot(seconds, wav, alpha=.7, c='blue')
    plt.grid()
    plt.xlabel('Time, s', size=10)
    plt.ylabel('Amplitude', size=10)
    plt.show()
    # normalize=False asks the player not to rescale the samples, so loudness
    # differences between augmentations stay audible.
    display.display(display.Audio(wav, rate=sr, normalize=False))

# Load 10 seconds of librosa's bundled 'brahms' example; `sr` is the rate librosa returns.
wav, sr = librosa.load(librosa.ex('brahms'), duration=10)
wav = torch.from_numpy(wav)
# Guard for 2-D input (presumably channels x samples): average over the first dimension.
if wav.dim() == 2:
    wav = wav.mean(dim=0)
visualize_audio(wav, sr)

### Gaussian Noise

In [None]:
# Additive Gaussian noise: one independent draw from Normal(0, 0.05) per sample of `wav`.
# NOTE(review): no random seed is set, so the result differs between runs.
noiser = distributions.Normal(0, 0.05)
augumented_wav = wav + noiser.sample(wav.size())
visualize_audio(augumented_wav, sr)

### Time Stretching


In [None]:
# librosa operates on NumPy arrays: convert the tensor, stretch, then convert back.
# rate=2.0 presumably doubles the playback speed (halves the duration) - confirm in the librosa docs.
augumented_wav = librosa.effects.time_stretch(wav.numpy().squeeze(), rate=2.0)
augumented_wav = torch.from_numpy(augumented_wav)
visualize_audio(augumented_wav, sr)

### Pitch Shifting

In [None]:
# librosa operates on NumPy arrays: convert the tensor, shift, then convert back.
# n_steps=-5 presumably lowers the pitch by five semitones - confirm in the librosa docs.
augumented_wav = librosa.effects.pitch_shift(wav.numpy().squeeze(), sr=sr, n_steps=-5)
augumented_wav = torch.from_numpy(augumented_wav)
visualize_audio(augumented_wav, sr)

### Volume

In [None]:
# Volume transform configured with an amplitude gain of 2.0.
# NOTE(review): presumably the output is kept within [-1, 1] by the transform - confirm in the torchaudio docs.
voler = torchaudio.transforms.Vol(gain=2.0, gain_type='amplitude')
augumented_wav = voler(wav)
visualize_audio(augumented_wav, sr)

### Impulse Response

RIR datasets - https://github.com/RoyJames/room-impulse-responses  
The example in the cell below is taken from the "MIT IR Survey".

In [None]:
# Load the room impulse response (RIR) and bring it to the sampling rate of `wav`.
# Convolving two signals sampled at different rates would stretch or compress the
# reverberation in time, so the RIR is resampled whenever the rates differ.
rir, rir_sr = torchaudio.load(os.path.join(week_04_asr_files, 'h035_Bar_LargeSportsBar_5.wav'))
if rir_sr != sr:
    rir = torchaudio.functional.resample(rir, orig_freq=rir_sr, new_freq=int(sr))
    rir_sr = int(sr)

In [None]:
def simulate(audio: torch.Tensor, rir: torch.Tensor):
    """Applies a room impulse response to `audio` by full linear convolution.

    Args:
        audio: 1-D waveform.
        rir: Impulse response; singleton dimensions are squeezed away.

    Returns:
        The reverberated waveform of length len(audio) + len(rir) - 1,
        rescaled to a peak of 1 only if its peak exceeds 1.
    """
    pad = rir.shape[-1] - 1

    # torch.conv1d computes cross-correlation, so the kernel is reversed
    # to obtain a true convolution.
    kernel = rir.squeeze().flip(0).view(1, 1, -1)

    # Zero-pad both sides so that every partial overlap is kept.
    padded = F.pad(audio, [pad, pad]).view(1, 1, -1)
    reverberated = torch.conv1d(padded, kernel).squeeze()

    # Peak normalization, applied only when the signal would otherwise clip.
    peak = reverberated.abs().max()
    if peak > 1:
        reverberated /= peak

    return reverberated

# Apply the loaded room impulse response to the example recording and inspect the result.
augumented_wav = simulate(wav, rir)
visualize_audio(augumented_wav, sr)


### Add Noise on Background

In [None]:
# Use librosa's bundled 'trumpet' example as background noise, loaded with the same `sr` as `wav`.
filename = librosa.ex('trumpet')
noise, _ = librosa.load(filename, sr=sr)
visualize_audio(torch.from_numpy(noise), sr)

In [None]:
# Target signal-to-noise ratio in dB; sensible values lie in [0, 40].
noise_level = torch.Tensor([5])

# Crop both signals to their common length first: the mix can only cover the
# overlap, and the norms below must be measured on exactly the samples that
# are mixed, otherwise the resulting SNR differs from `noise_level`.
noise_tensor = torch.from_numpy(noise)
mix_length = min(wav.shape[-1], noise_tensor.shape[-1])
clipped_wav = wav[..., :mix_length]
clipped_noise = noise_tensor[..., :mix_length]

# L2 norms of the two segments (their ratio is an amplitude ratio).
noise_energy = torch.norm(clipped_noise)
audio_energy = torch.norm(clipped_wav)

# Scale the noise so that 20 * log10(||audio|| / ||alpha * noise||) == noise_level.
alpha = (audio_energy / noise_energy) * torch.pow(10, -noise_level / 20)

augumented_wav = clipped_wav + alpha * clipped_noise
# Keep the samples in [-1, 1], as required for playback with normalize=False.
augumented_wav = torch.clamp(augumented_wav, -1, 1)

visualize_audio(augumented_wav, sr)

### SpecAugment
SpecAugment involves three types of augmentations: time warping, frequency masking, and time masking. Time warping warps the spectrogram along the time axis, while frequency masking and time masking involve masking random sections of the spectrogram in the frequency and time domains, respectively.

https://arxiv.org/abs/1904.08779

In [None]:
def spec_augment_masking(spectrogram: torch.Tensor, frequency_masking_factor: float, time_masking_factor: float) -> torch.Tensor:
    """Masking part of SpecAugment (exercise stub).

    SpecAugment involves three types of augmentations: time warping, frequency
    masking, and time masking. Frequency masking and time masking involve masking random sections
    of the spectrogram in the frequency and time domains, respectively.
    Time warping is not implemented.

    NOTE: as written, both placeholders below are empty, so the input is
    returned unchanged.

    Args:
        spectrogram: The spectrogram to augment.
        frequency_masking_factor: The factor by which to mask the spectrogram in the
            frequency domain (presumably the largest fraction of frequency bins
            a mask may cover - to be fixed by the implementation).
        time_masking_factor: The factor by which to mask the spectrogram in the
            time domain (presumably the largest fraction of time frames a mask
            may cover - to be fixed by the implementation).

    Returns:
        The augmented spectrogram.
    """
    # Frequency masking
    # YOUR CODE

    # Time masking
    # YOUR CODE

    return spectrogram

In [None]:
# 80-band mel spectrogram with 400-sample windows and a 200-sample hop (50% overlap).
mel_spectrogramer = torchaudio.transforms.MelSpectrogram(
    sample_rate=sr,
    n_fft=400,
    win_length=400,
    hop_length=200,
    n_mels=80,
)

mel_spectrogram = mel_spectrogramer(wav)
# NOTE(review): torch.log yields -inf for mel bins that are exactly zero (e.g. digital silence).
log_mel_spectrogram = torch.log(mel_spectrogram).squeeze()
plt.figure(figsize=(20, 8))
plt.imshow(log_mel_spectrogram)
plt.show()

In [None]:
# Apply the masking with frequency factor 0.3 and time factor 0.2, then show the result.
augumented_log_mel_spectrogram = spec_augment_masking(log_mel_spectrogram, 0.3, 0.2)
plt.figure(figsize=(20, 8))
plt.imshow(augumented_log_mel_spectrogram)
plt.show()

## Lecture recap

### Problem statement

<p style="text-align:center;"><img src="http://drive.google.com/uc?export=view&id=1FFGXZsCgy-uQfCBp7F4w1gaJbIGv6CV2" height="200px" width="700px">  


Define a modified label sequence $\omega'_{1:2L + 1}$:
- add blanks to the beginning and the end of the original label sequence $\omega_{1:L}$
- insert blanks between every pair of labels

<p style="text-align:center;"><img src="http://drive.google.com/uc?export=view&id=1CEhWtVYrSSkaRtEsJr5QwiH8lMaSQ_uN" height="150px" width="400px">


Define $\alpha_t(s)$ as the total probability of all path prefixes of length $t$ that end in state $\omega_s'$ (the forward variable; it is computed in the Forward Algorithm section below).

Denote a sequence of **acoustic features** or **observations** as

$$
    \mathbf{X}_{1:T} = \{x_1, \ldots, x_T\}
$$

Define a mapping $\mathcal{M}$ between words $\mathbf{w}$ and speech units $\omega_{1:L}$:

$$
    \{\omega^{(q)}_{1:L_q}\}^Q_{q = 1} = \mathcal{M}(\mathbf{w})
$$

$$
    \{\mathbf{w}^{(p)}\}^P_{p = 1} = \mathcal{M}^{-1}(\omega_{1:L})
$$

For some choices of speech units this mapping is not 1-to-1 ($Q > 1$, $P > 1$). A possible pair of text (green) and speech units (yellow):

<p style="text-align:center;"><img src="http://drive.google.com/uc?export=view&id=1HWD_SFZzids3Nz67BK_NQ5awkw6yUvLo" height="200px" width="600px">

Automatic speech recognition (ASR) is a **discriminative** task $\rightarrow$ "Which sequence $\mathbf{\hat w}$ is the most likely given the audio?":

$$
    \mathbf{\hat w} = \mathcal{M}^{-1}(\hat \omega_{1:L}), \quad \hat \omega_{1:L} = \arg \max_{\omega_{1: L}} P(\omega_{1:L} | \mathbf{X}_{1: T}; \theta),
$$

where $\theta$ denotes the parameters of the model we are building to solve the problem.

### Discriminative state-space models

How do feature vectors $\mathbf{X}_{1: T}$ and speech units $\omega_{1:L}$ relate or **align** to each other? There are two common approaches to constructing models which can align:
- state-space models
- neural attention mechanisms

State-space models represent the space of various alignments in the form of a table (called **trellis**), the rows of which correspond to phonemes, and the columns are observed variables. One alignment is the path in this table from the upper left corner to the lower right.

<p style="text-align:center;"><img src="http://drive.google.com/uc?export=view&id=1npycuLvYq_-3p_xd6bouR21tfVeOvMUd" height="300px" width="600px">

Denote a set of all paths in trellis that map onto the phoneme sequence $\omega_{1:L}$ as ${A}(\omega_{1:L})$, and let $\pi_{1:T} \in {A}(\omega_{1:L})$ be an element of this set. Then a discriminative state-space system models $P(\omega_{1:L} | \mathbf{X}_{1: T}; \theta)$ as

$$
    P(\omega_{1:L} | \mathbf{X}_{1: T}; \theta) = \sum_{\pi_{1:T} \in {A}(\omega_{1:L})} P(\pi_{1:T} | \mathbf{X}_{1:T}; \theta)
$$

Imagine that we have a recurrent neural network parametrized with $\theta$. The network outputs a distribution $P(z_t|x_t; \theta)$ over possible speech units $\omega$ for each frame $x_t$:

<p style="text-align:center;"><img src="http://drive.google.com/uc?export=view&id=153E-ailMiLPg3joPSx016lGv6S4vXVD2" height="300px" width="550px">

CTC is a discriminative state-space model defined as:
    
$$
    P(\omega_{1:L} | \mathbf{X}_{1: T}; \theta) = \sum_{\pi_{1:T} \in {A}(\omega_{1:L})} \prod_{t = 1}^T P(z_t = \pi_t| x_t; \theta)
$$
    
- CTC assumes that the states $\pi_t$ at different time steps are conditionally independent given the input $\mathbf{X}_{1:T}$
- Alignment free -- does not need prior alignment for training

## Homework: CTC Forward-Backward Algorithm (10 points)

- (4 points) Implement a Forward Algorithm
- (4 points) Implement a Backward Algorithm  
- (2 points) Implement soft alignment

In [None]:
# Helper functions
BLANK_SYMBOL = "_"


class Tokenizer:
    """
    Character-level vocabulary that converts text to integer ids and back.

    Ids follow the order: apostrophe, space, 'a'..'z', then the blank symbol.
    """
    def __init__(self):
        vocabulary = ["'", " ", *string.ascii_lowercase, BLANK_SYMBOL]
        # Forward (symbol -> id) and reverse (id -> symbol) lookup tables.
        self.char_map = {symbol: position for position, symbol in enumerate(vocabulary)}
        self.index_map = dict(enumerate(vocabulary))

    def text_to_indices(self, text: str) -> List[int]:
        """Encodes every character of `text`; an unknown character raises KeyError."""
        return list(map(self.char_map.__getitem__, text))

    def indices_to_text(self, labels: List[int]) -> str:
        """Decodes a sequence of ids back into a string."""
        return "".join(self.index_map[label] for label in labels)

    def get_symbol_index(self, sym: str) -> int:
        """Returns the id of a single symbol."""
        return self.char_map[sym]


tokenizer = Tokenizer()

NEG_INF = -float("inf")


def logsumexp(*args) -> float:
    """
    Computes log(sum(exp(a) for a in args)) without overflow.
    See for details: https://en.wikipedia.org/wiki/LogSumExp

    Returns NEG_INF when there are no arguments or every argument is NEG_INF.
    """
    # log(0 + ... + 0) = log(0); also avoids computing (-inf) - (-inf) below.
    if not any(value != NEG_INF for value in args):
        return NEG_INF
    # Shift by the largest argument so that the largest exponent is exp(0) = 1.
    pivot = max(args)
    shifted = [math.exp(value - pivot) for value in args]
    return pivot + math.log(sum(shifted))


def modify_sequence(sequence: List[int], blank_idx: int) -> List[int]:
    """
    Builds the CTC-extended sequence: a blank at the start, at the end,
    and between every pair of neighbouring labels.
    """
    extended = [blank_idx]
    for label in sequence:
        extended.extend((label, blank_idx))
    return extended

# Load test input and output data
# `matrix` is the input passed to forward_algorithm / backward_algorithm below.
matrix = np.loadtxt(os.path.join(week_04_asr_files, 'test_matrix.txt'))
# NOTE(review): the odd spelling of the transcript presumably matches the fixture files - do not "correct" it.
labels_indices = tokenizer.text_to_indices('there se ms no good reason for believing that twillc ange')
# Reference results that the forward and backward passes are compared against.
ref_alphas = np.loadtxt(os.path.join(week_04_asr_files, 'alphas.txt'))
ref_betas = np.loadtxt(os.path.join(week_04_asr_files, 'betas.txt'))

### Definitions reminder
$X_{1:T}$ - input sequence (acoustic frames)  
$\omega_{1:L}$ - target sequence (text tokens)  
$\omega'_{1:2L + 1}$ - target sequence augmented with blank ($\epsilon$) tokens  
$\pi_{1:T}$ - one of the alignment sequences for $\omega_{1:L}$

### Forward Algorithm (4 points)

$$
    \alpha_t(s) = P(\omega_{1:s/2}, \pi_t = \omega_s' | \mathbf{X}_{1:T}, \theta) = \sum_{\pi_{1:t - 1} \in {A}(\omega_{1:s/2}), \, \pi_t = \omega_s'}  P(\pi_{1:t} | \mathbf{X}_{1:T}, \theta)
$$

Note that although we have moved to the extended sequence $\omega'$, we are still interested in maximizing the probability of alignments to the original sequence. Step $s$ in the extended sequence corresponds to step $s/2$ (rounded down) in the original sequence.

The CTC forward algorithm recursively computes the forward variable $\alpha_t(s)$.

<p style="text-align:center;"><img src="http://drive.google.com/uc?export=view&id=1QaW0mJ9c3Z0KJVk3pUSyC_kS_pFC_QxS" height="400px" width="600px">  


**Initialization.** We allow all prefixes to start with either a blank ($\epsilon$) or the first symbol in $\omega_{1:L}$. Also note that $\alpha_t(s) = 0,\ \forall s < (2L + 1) - 2(T - t) - 1$, because these variables correspond to states for which there are not enough time-steps left to complete the sequence.

This gives us the following rules for initialization:

$$
  \begin{aligned}
    &\alpha_t(0) = 0, \forall t & \\
    &\alpha_1(1) = P(z_1 = \epsilon | \mathbf{X}_{1:T}), &\\
    &\alpha_1(2) = P(z_1 = \omega^{'}_2 | \mathbf{X}_{1:T}), &\\
    &\alpha_1(s) = 0,\ \forall s > 2 &\\
    &\alpha_t(s) = 0,\ \forall s < (2L + 1) - 2(T - t) - 1 &  \text{top right zeros}\\
  \end{aligned}
$$

**Recursion.**

$$
  \begin{aligned}
    &\alpha_t(s) = \left \{
  \begin{aligned}
    &\big(\alpha_{t-1}(s) + \alpha_{t-1}(s-1) \big) P(z_t = \omega^{'}_s | \mathbf{X}_{1:T}) & \text{if}\ \omega_s^{'} = \epsilon\ \text{or}\
    \omega_s^{'} = \omega_{s-2}^{'} \\
    &\big(\alpha_{t-1}(s) + \alpha_{t-1}(s-1) + \alpha_{t-1}(s-2)\big) P(z_t = \omega^{'}_s | \mathbf{X}_{1:T}) & \text{otherwise}\\
  \end{aligned} \right.
  \end{aligned}
$$


<p style="text-align:center;"><img src="http://drive.google.com/uc?export=view&id=1Tre3oFHyjigpqG-GI1xVrOchZAMnRYBK" height="250px" width="650px">

Doing the computation in probability space can be numerically unstable, so you should do it in Log-Space using the
provided logsumexp operation.

In [None]:
def forward_algorithm(sequence: List[int], matrix: np.ndarray) -> np.ndarray:
    """
    CTC forward pass in log space (exercise stub: the placeholders must be filled in before it can run).

    :param sequence: a string converted to an index array by Tokenizer
    :param matrix: A matrix of shape (K, T) with a probability distribution over the K output symbols
        at each of the T time steps (presumably rows are indexed as in Tokenizer, blank included).
    :return: log-space forward variables of shape (2 * len(sequence) + 1, T); entries that are never
        assigned keep the initial value NEG_INF, i.e. log(0)
    """
    # Turn probs into log-probs
    matrix = np.log(matrix)

    blank = tokenizer.get_symbol_index(BLANK_SYMBOL)
    mod_sequence = modify_sequence(sequence, blank)

    # Initialize every entry to log(0)
    alphas = np.full([len(mod_sequence), matrix.shape[1]], NEG_INF)

    for t in range(matrix.shape[1]):
        for s in range(len(mod_sequence)):
            # First Step
            if t == 0:
                ########################
                # YOUR CODE HERE
                ########################

            # Upper diagonal zeros
            elif # CONDITION
                ########################
                # YOUR CODE HERE
                ########################
            else:
                # Need to do this stably (sums of probabilities go through logsumexp)
                if s == 0:
                    ########################
                    # YOUR CODE HERE
                    ########################
                elif s == 1:
                    ########################
                    # YOUR CODE HERE
                    ########################
                else:
                    ########################
                    # YOUR CODE HERE HINT - THERE IS ANOTHER IFELSE
                    ########################
    return alphas

Let's test the forward algorithm.  
**These asserts are just helpers to discover a bug, and not real blockers. So if your code successfully runs the last assert from the soft alignment task you can ignore this batch of tests.**

In [None]:
alphas = forward_algorithm(labels_indices, matrix)

# Locate mismatches against the reference: np.nonzero returns (state indices, time indices).
incorrect_elements = np.nonzero(~np.isclose(ref_alphas, alphas))
if incorrect_elements[1].shape[0]:
    # The forward pass runs left to right, so the mismatch with the smallest t is reported.
    index = np.argmin(incorrect_elements[1])
    incorrect_indices = (incorrect_elements[0][index], incorrect_elements[1][index])
    print((
        f'Leftmost incorrect time layer is t={incorrect_elements[1][index]}\n'
        f'Your alphas[{incorrect_indices[0]}, {incorrect_indices[1]}] = {alphas[incorrect_indices]:.5f}\t'
        f'Reference alphas[{incorrect_indices[0]}, {incorrect_indices[1]}] = {ref_alphas[incorrect_indices]:.5f}'
    ))

# Checks go from the most specific part of the table to the whole table.
assert np.allclose(ref_alphas[:, 0], alphas[:, 0]), "Bad initialization of the first layer of alphas (t = 0)"
assert np.allclose(ref_alphas[0, :], alphas[0, :]), "Bad calculation of the first elemenent (probability of all-blank sequence '<blank>...<blank>') of the each layer"
assert np.allclose(ref_alphas[1, :], alphas[1, :]), "Bad calculation of the second elemenent (probability of sequence '<blank>...<blank><first_token>') of the each layer"
assert np.allclose(ref_alphas[::2, :], alphas[::2, :]), "Bad calculation of alphas for the blank tokens"
assert np.allclose(ref_alphas, alphas), "Your alphas matrix is not close enough to the reference one"

### Backward algorithm (4 points)

Define $\beta_t(s)$ as the total probability of all valid alignments of the remaining states $\omega'_{s:2L + 1}$ that start in state $\omega_s'$ at time $t$:

$$
    \beta_t(s) = P(\omega_{s/2:L}, \pi_t = \omega'_s | \mathbf{X}_{1:T}, \theta) = \sum_{\pi_{t + 1:T} \in \mathcal{A}(\omega_{s/2:L}), \, \pi_t = \omega_s'} P(\pi_{t + 1:T} | \mathbf{X}_{1:T}, \theta)
$$

The CTC backward algorithm recursively computes the backward variable $\beta_t(s)$:

<p style="text-align:center;"><img src="http://drive.google.com/uc?export=view&id=11x3TGAzL2LWfO0ZKpPHegOvv8Iw6ZC0X" height="400px" width="600px">



The formulas for backward algorithm are as follows:

$$
  \begin{aligned}
    &\beta_T(2L+1) = 1 &\\
    &\beta_T(2L) = 1 & \\
    &\beta_T(s) = 0, \forall s < 2L &\\
    &\beta_t(s) = 0,\ \forall s > 2t & \text{bottom left zeros} \\
    &\beta_t(2L+2) = 0,\ \forall t  & \\
    &\beta_t(s) = \left \{
      \begin{aligned}
        &\beta_{t+1}(s) P(z_{t + 1} = \omega^{'}_s | \mathbf{X}_{1:T}) + \beta_{t+1}(s+1) P(z_{t + 1} = \omega^{'}_{s + 1} | \mathbf{X}_{1:T})  & \text{if}\ \omega_s^{'} = \epsilon\ \text{or}\
        \omega_s^{'} = \omega_{s+2}^{'} \\
        &\beta_{t+1}(s) P(z_{t + 1} = \omega^{'}_s | \mathbf{X}_{1:T}) + \beta_{t+1}(s+1) P(z_{t + 1} = \omega^{'}_{s + 1} | \mathbf{X}_{1:T}) + \beta_{t+1}(s+2) P(z_{t + 1} = \omega^{'}_{s + 2} | \mathbf{X}_{1:T}) & \text{otherwise}\\
      \end{aligned}\right.
  \end{aligned}
$$

<p style="text-align:center;"><img src="http://drive.google.com/uc?export=view&id=1h7OBZZ02dwZ1mDhRYh7yTy7-UW4NmbXm" height="250px" width="650px">

Doing the computation in probability space can be numerically unstable, so you should do it in Log-Space using the
provided logsumexp operation.

In [None]:
def backward_algorithm(sequence: List[int], matrix: np.ndarray) -> np.ndarray:
    """
    CTC backward pass in log space (exercise stub: the placeholders must be filled in before it can run).

    :param sequence: a string converted to an index array by Tokenizer
    :param matrix: A matrix of shape (K, T) with a probability distribution over the K output symbols
        at each of the T time steps (presumably rows are indexed as in Tokenizer, blank included).
    :return: log-space backward variables of shape (2 * len(sequence) + 1, T); entries that are never
        assigned keep the initial value NEG_INF, i.e. log(0)
    """
    # Turn probs into log-probs
    matrix = np.log(matrix)
    blank = tokenizer.get_symbol_index(BLANK_SYMBOL)
    mod_sequence = modify_sequence(sequence, blank)
    # Initialize every entry to log(0)
    betas = np.full([len(mod_sequence), matrix.shape[1]], NEG_INF)

    # Both time steps and states are visited from last to first.
    for t in reversed(range(matrix.shape[1])):
        for s in reversed(range(len(mod_sequence))):
            # First Step
            if t == matrix.shape[1] - 1:
                ########################
                # YOUR CODE HERE
                ########################

            # Lower Diagonal Zeros
            elif # CONDITION
                ########################
                # YOUR CODE HERE
                ########################
            else:
                if s == len(mod_sequence) - 1:
                    ########################
                    # YOUR CODE HERE
                    ########################
                elif s == len(mod_sequence) - 2:
                    ########################
                    # YOUR CODE HERE
                    ########################
                else:
                    if mod_sequence[s] == blank or mod_sequence[s] == mod_sequence[s + 2]:
                        ########################
                        # YOUR CODE HERE
                        ########################
                    else:
                        ########################
                        # YOUR CODE HERE
                        ########################
    return betas

Let's test the backward algorithm.  
**These asserts are just helpers to discover a bug, and not real blockers. So if your code successfully runs the last assert from the soft alignment task you can ignore this batch of tests.**

In [None]:
betas = backward_algorithm(labels_indices, matrix)

# Locate mismatches against the reference: np.nonzero returns (state indices, time indices).
incorrect_elements = np.nonzero(~np.isclose(ref_betas, betas))
if incorrect_elements[1].shape[0]:
    # The backward pass runs right to left, so the mismatch with the largest t is reported.
    index = np.argmax(incorrect_elements[1])
    incorrect_indices = (incorrect_elements[0][index], incorrect_elements[1][index])
    print((
        f'Rightmost incorrect time layer is t={incorrect_elements[1][index]}\n'
        f'Your betas[{incorrect_indices[0]}, {incorrect_indices[1]}] = {betas[incorrect_indices]:.5f}\t'
        f'Reference betas[{incorrect_indices[0]}, {incorrect_indices[1]}] = {ref_betas[incorrect_indices]:.5f}'
    ))

# Checks go from the most specific part of the table to the whole table.
assert np.allclose(ref_betas[:, -1], betas[:, -1]), "Bad initialization of the last layer of betas (t = matrix.shape[1] - 1)"
assert np.allclose(ref_betas[-1, :], betas[-1, :]), "Bad calculation of the last elemenent (probability of all-blank sequence '<blank>...<blank>') of the each layer"
assert np.allclose(ref_betas[-2, :], betas[-2, :]), "Bad calculation of the second-to-last elemenent (probability for the sequence '<last_token><blank>...<blank>') of the each layer"
assert np.allclose(ref_betas[::2, :], betas[::2, :]), "Bad calculation of betas for the blank tokens"
assert np.allclose(ref_betas, betas), "Your betas matrix is not close enough to the reference one"

### Alignment and loss computation (2 points)

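Use your newfound knowledge of the CTC forward-backward algorithm to obtain a soft alignment.

Remember that the forward variable $\alpha_t(s)$ sums the probabilities of all path prefixes $\pi_{1:t}$ that end in state $\omega_s'$, while the backward variable $\beta_t(s)$ sums the probabilities of all path suffixes $\pi_{t + 1:T}$ that continue from that state.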

The probability of all paths passing through a state $\pi_t = \omega_s'$ is the product of forward and backward variables:

$$
    \alpha_t(s) \beta_t(s) = \sum_{\pi_{1:T} \in \mathcal{A}(\omega_{1:L}), \,\pi_t=\omega_s'} P(\pi_{1:T} | \mathbf{X}_{1:T}, \theta)
$$

Then, for any $t$, sum of all such products yields total probability:

$$
     \sum_{s = 1}^{2 L + 1} \alpha_t(s) \beta_t(s) = P(\omega_{1:L} | \mathbf{X}_{1:T}, \theta)
$$

We can also use normalized $\alpha_t(s) \beta_t(s)$ as a measure of **soft-alignment**:

$$
    \text{align}_t(s) = \frac{\alpha_t(s) \beta_t(s)}{\sum_{s = 1}^{2 L + 1} \alpha_t(s) \beta_t(s)}
$$

You should get something like

<p style="text-align:center;"><img src="http://drive.google.com/uc?export=view&id=1HAIl9UPReiFQ7dNOZFGfvUWDurFDBZYM" height="300px" width="800px">

$$
  \text{align}_t(s) = \frac{\alpha_t(s)\beta_t(s)}{\sum_{s}\alpha_t(s)\beta_t(s)}
$$

In [None]:
def soft_alignment(labels_indices: List[int], matrix: np.ndarray) -> np.ndarray:
    """
    Returns the alignment coefficients for the input sequence
    (exercise stub: the two `align` assignments must be filled in before it can run).

    :param labels_indices: a string converted to an index array by Tokenizer
    :param matrix: passed unchanged to forward_algorithm and backward_algorithm
    :return: the normalized alignment; the check in the next cell compares it with the
        reference loaded from soft_alignment.txt
    """
    # Both passes return log-space values.
    alphas = forward_algorithm(labels_indices, matrix)
    betas = backward_algorithm(labels_indices, matrix)

    # Move from log space back to prob space
    align = # YOUR CODE

    # Normalize Alignment
    align = # YOUR CODE

    return align

In [None]:
# Re-load the inputs so that this cell does not rely on earlier cells having left them untouched.
matrix = np.loadtxt(os.path.join(week_04_asr_files, 'test_matrix.txt'))

labels_indices = tokenizer.text_to_indices('there se ms no good reason for believing that twillc ange')

align = soft_alignment(labels_indices, matrix)
f, ax = plt.subplots(1, 2, dpi=75, figsize=(15, 5))

im = ax[0].imshow(align, aspect='auto', interpolation='nearest')
ax[0].set_title("Alignment")
ax[0].set_ylabel("Phonemes")
ax[0].set_xlabel("Time")
f.colorbar(im, ax=ax[0])

# States that no valid path can visit have align == 0, so log(align) is -inf
# there. That is expected, so NumPy's divide-by-zero warning is silenced.
with np.errstate(divide='ignore'):
    log_align = np.log(align)

im = ax[1].imshow(log_align, aspect='auto', interpolation='nearest')
ax[1].set_title("Alignment in log scale")
ax[1].set_ylabel("Phonemes")
ax[1].set_xlabel("Time")
f.colorbar(im, ax=ax[1])

plt.tight_layout()

ref_align = np.loadtxt(os.path.join(week_04_asr_files, 'soft_alignment.txt'))
assert np.allclose(ref_align, align), "Your soft alignment is not close enough to the reference one"