In [155]:
import os
# Move the kernel's working directory one level up and report the change.
# NOTE: this cell is not idempotent -- every re-run climbs one more level.
cwd_old = os.getcwd()
os.chdir(os.pardir)
cwd_new = os.getcwd()
print(
    "The working directory is moved from {} to {}.".format(cwd_old, cwd_new)
)

The working directory is moved from /data2/sungjaecho/Projects/tacotron2/dev_ipynb to /data2/sungjaecho/Projects/tacotron2.


# 1. Preliminary

All measures for monotonicity of attention take `alignments`.

`alignments`
- type: `torch.Tensor`
- size: `[batch_size, mel_steps, txt_steps]`

I have come up with 4 measures.
- `forward_attention_ratio`: Measures monotonic increments over all pairs of contiguous decoding steps.
- `attention_ratio`: Measures how many encoding steps are attended, out of all encoding steps.
- `multiple_attention_ratio`: Measures how many encoding steps are attended multiple times, out of all encoding steps.
- `attention_range_ratio`: Measures how wide the span between the lowest and highest attended encoding steps is, relative to all encoding steps.

In [156]:
import torch

def get_mel_length(gate_output):
    '''
    Find the first decoding step whose gate output is positive.

    Params
    -----
    gate_output: torch.Tensor.
    - size: [max_mel_len].

    Return
    -----
    mel_length: int.
    - Index of the first element of `gate_output` that is > 0.
    - len(gate_output) when no element is > 0.
    '''
    # NOTE(review): the returned index excludes the step that raised the stop
    # signal itself -- confirm this is the intended length convention.
    stop_flags = (gate_output > 0).tolist()
    try:
        # list.index raises ValueError when no flag is set.
        return stop_flags.index(True)
    except ValueError:
        return len(stop_flags)

# 2. Measures

## 2.1. `forward_attention_ratio`

The function for this measure has already been developed.

## 2.2. `attention_ratio`

### 2.2.1. Development Step

In [74]:
import torch

In [142]:
# Random stand-in batch used only to develop the measure.
# No seed is set, so the values differ on every run.
batch_size, mel_steps, txt_steps = 32, 80, 60
alignments = torch.rand(batch_size, mel_steps, txt_steps)
text_lengths = torch.randint(low=10, high=60, size=(batch_size,))
mel_lengths = torch.randint(low=10, high=80, size=(batch_size,))
print("alignments.size():", alignments.shape)
print("text_lengths.size():", text_lengths.shape)

alignments.size(): torch.Size([32, 80, 60])
text_lengths.size(): torch.Size([32])


In [9]:
# Development version of the attention ratio: for every sample, the number of
# distinct most-attended text steps divided by the sample's text length.
# Reads `alignments`, `text_lengths` and `mel_lengths` from the notebook state.
batch_size = alignments.size(0)
batch_attention_ratio = torch.empty((batch_size), dtype=torch.float)
for i in range(batch_size):
    text_length = text_lengths[i].item()
    mel_length = mel_lengths[i]
    # Crop the attention map to this sample's mel and text lengths.
    alignment = alignments[i,:mel_length,:text_length]
    # Index of the highest-weighted text step for every mel step.
    argmax_alignment = torch.argmax(alignment, dim=1)
    # Number of distinct text steps that were the argmax at least once.
    n_unique_argmax = torch.unique(argmax_alignment).size(0)
    sample_attention_ratio = n_unique_argmax / text_length
    batch_attention_ratio[i] = sample_attention_ratio
# Average of the per-sample ratios over the batch, as a Python float.
mean_attention_ratio = batch_attention_ratio.mean().item()
print(mean_attention_ratio)
print(batch_attention_ratio)

0.6319586038589478
tensor([0.6744, 0.9697, 0.2549, 0.7692, 0.6250, 0.5800, 0.3913, 0.6957, 0.8947,
        0.8333, 0.2909, 0.8684, 0.5000, 0.8222, 0.3137, 0.9091, 0.6429, 0.8500,
        0.5536, 0.6364, 0.3111, 0.9545, 0.3333, 0.6000, 0.2414, 0.7736, 0.6857,
        0.3889, 0.9259, 0.9118, 0.5862, 0.4348])


### 2.2.2. Functionalization Step

In [163]:
def attention_ratio(alignments, text_lengths, gate_outputs):
    '''
    Attention ratio is a measure for
    "how many encoding steps are attended over all encoding steps".

    For each sample the attention map is cropped to its decoding length and
    text length, the most-attended encoding step is taken for every decoding
    step, and the number of distinct encoding steps found this way is divided
    by the text length.

    Params
    -----
    alignments: Attention map. torch.Tensor. Shape: [batch_size, mel_steps, txt_steps].
    text_lengths: torch.Tensor. A 1-D tensor that keeps input text lengths.
    gate_outputs: torch.Tensor. Shape: [batch_size, stop_token_seq].
    - A 2-D tensor that is a predicted sequence of the stopping decoding step
    - 0 indicates a signal to generate the next decoding step.
    - 1 indicates a signal to generate this decoding step and stop generating the next step.
    - The decoding length of each sample is obtained with get_mel_length().

    Returns
    -----
    mean_attention_ratio
    - float. Mean of batch_attention_ratio.
    batch_attention_ratio
    - torch.Tensor of shape [batch_size], dtype=torch.float.
    '''
    batch_size = alignments.size(0)
    batch_attention_ratio = torch.empty((batch_size), dtype=torch.float)
    for i in range(batch_size):
        text_length = text_lengths[i].item()
        mel_length = get_mel_length(gate_outputs[i])
        alignment = alignments[i, :mel_length, :text_length]
        if alignment.numel() == 0:
            # Nothing was decoded (or the text is empty): no encoding step is
            # attended. Also avoids argmax over / division by a zero length.
            batch_attention_ratio[i] = 0.0
            continue
        # Most-attended encoding step for every decoding step.
        argmax_alignment = torch.argmax(alignment, dim=1)
        n_unique_argmax = torch.unique(argmax_alignment).size(0)
        batch_attention_ratio[i] = n_unique_argmax / text_length
    mean_attention_ratio = batch_attention_ratio.mean().item()

    return mean_attention_ratio, batch_attention_ratio

## 2.3. `multiple_attention_ratio`

### 2.3.1. Development Step

In [74]:
import torch

In [201]:
# Random stand-in batch used only to develop the measure.
# Here the attention map holds integers drawn from [0, 10).
# No seed is set, so the values differ on every run.
batch_size, mel_steps, txt_steps = 32, 80, 60
alignments = torch.randint(low=0, high=10, size=(batch_size, mel_steps, txt_steps))
text_lengths = torch.randint(low=10, high=60, size=(batch_size,))
mel_lengths = torch.randint(low=10, high=80, size=(batch_size,))
print("alignments.size():", alignments.shape)
print("text_lengths.size():", text_lengths.shape)

alignments.size(): torch.Size([32, 80, 60])
text_lengths.size(): torch.Size([32])


In [202]:
# Development version of the multiple attention ratio: for every sample, the
# number of text steps that become the argmax in more than one separate run of
# mel steps, divided by the sample's text length.
# Reads `alignments`, `text_lengths` and `mel_lengths` from the notebook state.
batch_size = alignments.size(0)
batch_multiple_attention_ratio = torch.empty((batch_size), dtype=torch.float)

for i in range(batch_size):
    text_length = text_lengths[i].item()
    mel_length = mel_lengths[i].item()
    # Crop the attention map to this sample's mel and text lengths.
    alignment = alignments[i,:mel_length,:text_length]
    # Index of the highest-weighted text step for every mel step.
    argmax_alignment = torch.argmax(alignment, dim=1)
    argmax_alignment = argmax_alignment.tolist()
    
    # Collapse runs of consecutive identical argmax values into one entry.
    # Iterating from the end keeps the indices still to be visited valid
    # while elements are deleted. Despite its name, `j_prev` is index j + 1.
    for j in range((mel_length-2), -1, -1):
        j_prev = j + 1
        if argmax_alignment[j] == argmax_alignment[j_prev]:
            del argmax_alignment[j_prev]
        
    # After collapsing, a value occurring more than once was attended in
    # several separate runs.
    n_multiple_attention = 0
    for argmax in set(argmax_alignment):
        if argmax_alignment.count(argmax) > 1:
            n_multiple_attention += 1
    sample_multiple_attention_ratio = n_multiple_attention / text_length
    batch_multiple_attention_ratio[i] = sample_multiple_attention_ratio

# Average of the per-sample ratios over the batch, as a Python float.
mean_multiple_attention_ratio = batch_multiple_attention_ratio.mean().item()
print(mean_multiple_attention_ratio)
print(batch_multiple_attention_ratio)

0.3225729167461395
tensor([0.2727, 0.0339, 0.3571, 0.2105, 0.1020, 0.1667, 0.1944, 0.3846, 0.7692,
        0.2586, 0.2037, 0.3438, 0.7778, 0.2889, 0.0513, 0.1087, 0.6522, 0.5000,
        0.2083, 0.2917, 0.3000, 0.3243, 0.3611, 0.2449, 0.3077, 0.1034, 0.2745,
        0.5000, 0.8824, 0.1750, 0.2857, 0.3871])


### 2.3.2. Functionalization Step

In [204]:
def multiple_attention_ratio(alignments, text_lengths, gate_outputs):
    '''
    Multiple attention ratio is a measure for
    "how many encoding steps are attended multiple times over all encoding steps".

    For each sample the attention map is cropped to its decoding length and
    text length and the most-attended encoding step is taken for every
    decoding step. Consecutive decoding steps that attend the same encoding
    step count as a single visit. An encoding step is "attended multiple
    times" when it is visited more than once. The number of such encoding
    steps is divided by the text length.

    Params
    -----
    alignments: Attention map. torch.Tensor. Shape: [batch_size, mel_steps, txt_steps].
    text_lengths: torch.Tensor. A 1-D tensor that keeps input text lengths.
    gate_outputs: torch.Tensor. Shape: [batch_size, stop_token_seq].
    - A 2-D tensor that is a predicted sequence of the stopping decoding step
    - 0 indicates a signal to generate the next decoding step.
    - 1 indicates a signal to generate this decoding step and stop generating the next step.
    - The decoding length of each sample is obtained with get_mel_length().

    Returns
    -----
    mean_multiple_attention_ratio
    - float. Mean of batch_multiple_attention_ratio.
    batch_multiple_attention_ratio
    - torch.Tensor of shape [batch_size], dtype=torch.float.
    '''
    batch_size = alignments.size(0)
    batch_multiple_attention_ratio = torch.empty((batch_size), dtype=torch.float)

    for i in range(batch_size):
        text_length = text_lengths[i].item()
        mel_length = get_mel_length(gate_outputs[i])
        alignment = alignments[i, :mel_length, :text_length]
        if alignment.numel() == 0:
            # Nothing was decoded (or the text is empty): no encoding step is
            # attended. Also avoids argmax over / division by a zero length.
            batch_multiple_attention_ratio[i] = 0.0
            continue
        # Most-attended encoding step for every decoding step.
        argmax_alignment = torch.argmax(alignment, dim=1).tolist()

        # Count visits per encoding step in one pass. A visit starts whenever
        # the attended step differs from the previous decoding step's. The
        # loop runs over the cropped sequence itself, so it stays in bounds
        # even if mel_length exceeds the number of rows in `alignments`.
        visit_counts = {}
        previous = None
        for attended in argmax_alignment:
            if attended != previous:
                visit_counts[attended] = visit_counts.get(attended, 0) + 1
            previous = attended

        n_multiple_attention = sum(1 for count in visit_counts.values() if count > 1)
        batch_multiple_attention_ratio[i] = n_multiple_attention / text_length

    mean_multiple_attention_ratio = batch_multiple_attention_ratio.mean().item()

    return mean_multiple_attention_ratio, batch_multiple_attention_ratio

## 2.4. `attention_range_ratio`

### 2.4.1. Development Step

In [1]:
import torch

In [2]:
# Random stand-in batch used only to develop the measure.
# No seed is set, so the values differ on every run.
batch_size, mel_steps, txt_steps = 32, 80, 60
alignments = torch.rand(batch_size, mel_steps, txt_steps)
text_lengths = torch.randint(low=10, high=60, size=(batch_size,))
mel_lengths = torch.randint(low=10, high=80, size=(batch_size,))
print("alignments.size():", alignments.shape)
print("text_lengths.size():", text_lengths.shape)

alignments.size(): torch.Size([32, 80, 60])
text_lengths.size(): torch.Size([32])


In [8]:
# Development version of the attention range ratio: for every sample, the span
# between the lowest and highest most-attended text steps (inclusive) divided
# by the sample's text length.
# Reads `alignments`, `text_lengths` and `mel_lengths` from the notebook state.
batch_size = alignments.size(0)
batch_attention_range_ratio = torch.empty((batch_size), dtype=torch.float)
for i in range(batch_size):
    text_length = text_lengths[i].item()
    mel_length = mel_lengths[i]
    # Crop the attention map to this sample's mel and text lengths.
    alignment = alignments[i,:mel_length,:text_length]
    # Index of the highest-weighted text step for every mel step.
    argmax_alignment = torch.argmax(alignment, dim=1)
    unique_argmax_set = torch.unique(argmax_alignment)
    # Inclusive width of the interval covered by the argmax indices.
    range_length = torch.max(unique_argmax_set) - torch.min(unique_argmax_set) + 1
    range_length = range_length.item()
    range_ratio = range_length / text_length
    batch_attention_range_ratio[i] = range_ratio
# Average of the per-sample ratios over the batch, as a Python float.
mean_attention_range_ratio = batch_attention_range_ratio.mean().item()
print(mean_attention_range_ratio)
print(batch_attention_range_ratio)

0.9682873487472534
tensor([1.0000, 1.0000, 0.7647, 1.0000, 0.9750, 1.0000, 1.0000, 0.9565, 1.0000,
        1.0000, 0.9818, 1.0000, 0.9800, 0.9778, 0.9412, 1.0000, 0.9286, 0.9500,
        1.0000, 0.9545, 1.0000, 1.0000, 0.9048, 0.8667, 0.8966, 1.0000, 0.9429,
        0.9815, 1.0000, 1.0000, 0.9828, 1.0000])


### 2.4.2. Functionalization Step

In [10]:
def attention_range_ratio(alignments, text_lengths, gate_outputs):
    '''
    Attention range ratio is a measure for
    "how wide the range of attended encoding steps is over all encoding steps".

    For each sample the attention map is cropped to its decoding length and
    text length and the most-attended encoding step is taken for every
    decoding step. The inclusive span between the lowest and the highest of
    these encoding steps is divided by the text length.

    Params
    -----
    alignments: Attention map. torch.Tensor. Shape: [batch_size, mel_steps, txt_steps].
    text_lengths: torch.Tensor. A 1-D tensor that keeps input text lengths.
    gate_outputs: torch.Tensor. Shape: [batch_size, stop_token_seq].
    - A 2-D tensor that is a predicted sequence of the stopping decoding step
    - 0 indicates a signal to generate the next decoding step.
    - 1 indicates a signal to generate this decoding step and stop generating the next step.
    - The decoding length of each sample is obtained with get_mel_length().

    Returns
    -----
    mean_attention_range_ratio
    - float. Mean of batch_attention_range_ratio.
    batch_attention_range_ratio
    - torch.Tensor of shape [batch_size], dtype=torch.float.
    '''
    batch_size = alignments.size(0)
    batch_attention_range_ratio = torch.empty((batch_size), dtype=torch.float)
    for i in range(batch_size):
        text_length = text_lengths[i].item()
        # Derive the decoding length from this sample's gate outputs rather
        # than from a notebook-global `mel_lengths` variable.
        mel_length = get_mel_length(gate_outputs[i])
        alignment = alignments[i, :mel_length, :text_length]
        if alignment.numel() == 0:
            # Nothing was decoded (or the text is empty): the attended range
            # is empty. max()/min() of an empty tensor would raise.
            batch_attention_range_ratio[i] = 0.0
            continue
        # Most-attended encoding step for every decoding step.
        argmax_alignment = torch.argmax(alignment, dim=1)
        # Inclusive width of the interval covered by the argmax indices.
        range_length = (argmax_alignment.max() - argmax_alignment.min() + 1).item()
        batch_attention_range_ratio[i] = range_length / text_length
    mean_attention_range_ratio = batch_attention_range_ratio.mean().item()

    return mean_attention_range_ratio, batch_attention_range_ratio