# Evaluation

## Preliminaries

### Imports

In [1]:
import sys
import os
sys.path.append('./src')

In [2]:
import pickle
import bz2

In [3]:
from joblib import Parallel
from joblib import delayed
from joblib import parallel_backend

In [4]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [6]:
import pandas as pd

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import (
    homogeneity_score,
    completeness_score,
    v_measure_score,
    silhouette_score,
    calinski_harabasz_score,
)

In [8]:
import numpy as np

In [9]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import random

### Constants and global variables

In [11]:
# Names of the corpus splits; they are interpolated in the names of the cached files to load
SPLITS = ('train', 'validation', 'test')
# Labels of the evaluated models, paired by position with the entries of PROCESSED_DATA_PATHS
MODELS = ('Pretrained', 'Fine-Tuned')
# Names of the corpora (only 'HOPE' is referenced explicitly in the visible cells)
DATA_SETS = ('DailyDialog', 'EmpatheticDialogues', 'Persona-Chat', 'Wizard of Wikipedia', 'HOPE')
# Keys under which each processed sample stores the distributions over the latent codes
DISTRIBUTIONS = ('latent_prior_dist', 'latent_posterior_dist')
# Keys of the hidden vectors; only referenced by commented-out code in the visible cells
HIDDEN_VECTORS = ('prior_hidden_state', 'posterior_hidden_state')

In [12]:
# Directory holding the cached evaluation corpus (one archive per split)
DATA_PATH = '../resources/data/cache/'
# Directories holding the outputs of the evaluated models, in the same order as MODELS
# NOTE(review): the pairing with ('Pretrained', 'Fine-Tuned') is purely positional -- confirm that
# the first experiment is the pretrained model and the second one the fine-tuned model
PROCESSED_DATA_PATHS = (
    '../experiments/DLDLM_evaluation/dldlm_large_nllt_lda_lr_2022_11_10_22_41_08/data', 
    '../experiments/DLDLM_evaluation/dldlm_large_nllt_gibbs_2022_11_11_09_54_42/data'
)

In [13]:
# Number of latent codes: used as the number of LDA components and as the range of latent IDs in the plots
N_LATENTS = 16

In [14]:
# Number of times the latent sampling is repeated for each sampling temperature
N_REPETITIONS = 10

In [15]:
# Mapping from the distribution keys of the processed samples to the names shown in tables and plots
DISTRIBUTION_MAP = {'latent_prior_dist': 'Prior', 'latent_posterior_dist': 'Posterior'}
# Mapping from the raw dialogue act labels to their display names
DIALOGUE_ACT_MAP = {
    'Clarification Request': 'Clarification Request',
    'Opinion Request': 'Opinion Request',
    'Information Request': 'Information Request',
    'Yes/No question': 'Yes/No Question',
    'Greeting': 'Greeting',
    'Acknowledgement': 'Acknowledgement',
    'General Chat': 'General Chat',
    'Information Delivery': 'Information Delivery',
    'Positive Answer': 'Positive Answer',
    'Clarification Delivery': 'Clarification Delivery',
    'Negative Answer': 'Negative Answer',
    'Opinion Delivery': 'Opinion Delivery'
}
# Mapping from the raw dialogue act labels to their coarse-grained type (many-to-one)
DIALOGUE_ACT_TYPE_MAP = {
    'Clarification Request': 'Speaker Initiative',
    'Opinion Request': 'Speaker Initiative',
    'Information Request': 'Speaker Initiative',
    'Yes/No question': 'Speaker Initiative',
    'Greeting': 'General',
    'Acknowledgement': 'General',
    'General Chat': 'General',
    'Information Delivery': 'Speaker Responsive',
    'Positive Answer': 'Speaker Responsive',
    'Clarification Delivery': 'Speaker Responsive',
    'Negative Answer': 'Speaker Responsive',
    'Opinion Delivery': 'Speaker Responsive'
}
# Mapping from the raw sentiment labels to their display names
# NOTE(review): the key 'Very negative' is lower-cased while 'Very Positive' is not -- confirm that
# both spellings match the labels stored in the corpus, otherwise the look-up raises a KeyError
SENTIMENT_MAP = {'Very negative': 'Very Negative', 'Negative': 'Negative', 'Neutral': 'Neutral', 'Positive': 'Positive', 'Very Positive': 'Very Positive'}
# Mapping from the raw speaker labels to their display names
SPEAKER_MAP = {'Therapist': 'Therapist', 'Patient': 'Patient'}

In [16]:
# Number of highest-scoring terms shown in each bar plot of the terms distribution
TOP_WORDS = 20

In [17]:
# Integer encoders for the display names of each labelling.
# The display names are de-duplicated (keeping the order of first appearance) before being enumerated:
# some maps are many-to-one (e.g. the dialogue act types), and enumerating the raw values would
# assign non-contiguous identifiers (the index of the last occurrence of each name).
DIALOGUE_ACT_ENCODER = {value: i for i, value in enumerate(dict.fromkeys(DIALOGUE_ACT_MAP.values()))}
DIALOGUE_ACT_TYPE_ENCODER = {value: i for i, value in enumerate(dict.fromkeys(DIALOGUE_ACT_TYPE_MAP.values()))}
SENTIMENT_ENCODER = {value: i for i, value in enumerate(dict.fromkeys(SENTIMENT_MAP.values()))}
SPEAKER_ENCODER = {value: i for i, value in enumerate(dict.fromkeys(SPEAKER_MAP.values()))}

In [18]:
# Seed shared by all the random number generators and by the LDA model
RANDOM_SEED = 2307

### Random seed

In [19]:
# Seed the random number generators of Python, NumPy and PyTorch so that sampling is reproducible
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fda726666f0>

### Helper functions

In [20]:
def load_data(path):
    """Load a Python object from a BZIP2-compressed Pickle archive.

    NOTE: unpickling can execute arbitrary code, only load archives from trusted sources.

    :param path: path to the compressed archive
    :return: the object stored in the archive
    """
    # Decompress on the fly and let Pickle rebuild the stored object
    with bz2.open(path, 'rb') as archive:
        return pickle.load(archive)

In [21]:
def group_by(data, group):
    """Partition the samples according to the value stored under a given key.

    The samples are visited only once, hence one-shot iterables (e.g. generators) are supported;
    groups appear in the order in which their key is first encountered and the samples of each
    group keep their original relative order.

    :param data: iterable of samples (mappings)
    :param group: key whose value identifies the group of a sample
    :return: dictionary mapping each distinct value of the key to the list of samples having it
    :raises KeyError: if a sample does not contain the requested key
    """
    # Initialise a dictionary to store the groups
    grouped_data = {}
    # Go through the data set and add each sample to the correct group (created on first use)
    for sample in data:
        grouped_data.setdefault(sample[group], []).append(sample)
    # Return grouped data
    return grouped_data

In [22]:
def compute_metadata(data):
    """Compute, for each sample, the corpus, the dialogue identifier and the number of response tokens.

    :param data: iterable of samples with the 'corpus', 'conversation_idx' and 'response' keys
    :return: list with one metadata dictionary per sample
    """
    # Helper function extracting the desired info from a single sample
    def describe(sample):
        return {
            'corpus': sample['corpus'],
            'conversation_idx': sample['conversation_idx'],
            'n_tokens': len(word_tokenize(sample['response']))
        }
    # Lazily prepare one task per sample (keeping the splitting info of each turn)
    tasks = (delayed(describe)(sample) for sample in data)
    # Run the tasks on a thread pool using all the available workers
    with parallel_backend('threading', n_jobs=-1):
        return Parallel(verbose=0)(tasks)

In [23]:
def compute_stats(data):
    """Compute summary statistics on the metadata of a collection of dialogue turns.

    :param data: iterable of samples with the 'conversation_idx' and 'n_tokens' keys
        (e.g. the output of `compute_metadata`)
    :return: a tuple with the number of dialogues, the (mean, standard deviation) pair of the
        number of turns per dialogue and the (mean, standard deviation) pair of the number of
        tokens per turn
    :raises statistics.StatisticsError: if there are fewer than two dialogues or turns
        (the sample standard deviation is undefined in that case)
    """
    # Imported locally since the notebook does not import these names at the top
    from collections import Counter
    from statistics import mean, stdev

    # Materialise the input since it is visited more than once
    data = list(data)
    # Count the number of turns of each dialogue (grouping by dialogue identifier)
    n_turns = list(Counter(sample['conversation_idx'] for sample in data).values())
    n_tokens = [sample['n_tokens'] for sample in data]
    # Compute number of different dialogues
    n_dialogues = len(n_turns)
    # Compute mean and standard deviation of dialogues length
    mean_n_turns = mean(n_turns)
    std_n_turns = stdev(n_turns)
    # Compute mean and standard deviation of tokens per turn length
    mean_n_tokens = mean(n_tokens)
    std_n_tokens = stdev(n_tokens)
    # Return computed statistics
    return n_dialogues, (mean_n_turns, std_n_turns), (mean_n_tokens, std_n_tokens)

In [24]:
def is_same_sample(sample_a, sample_b):
    """Tell whether two samples refer to the same turn of the same dialogue of the same corpus.

    :param sample_a: first sample (mapping)
    :param sample_b: second sample (mapping)
    :return: whether the corpus, the dialogue identifier and the turn index of the samples coincide
    """
    # Fields that, together, identify a sample; they are compared in order, stopping at the first mismatch
    identity_keys = ('corpus', 'conversation_idx', 'turn_idx')
    return all(sample_a[key] == sample_b[key] for key in identity_keys)

## Data

### Load data

In [25]:
# Raw evaluation corpus: the samples of each split, read from the cached archives in DATA_PATH
# NOTE(review): the archives are unpickled by load_data, make sure they come from a trusted source
data = {
    split: load_data(os.path.join(DATA_PATH, f'evaluation_corpus_{split}.pbz2')) for split in SPLITS
}

In [27]:
# Outputs of each evaluated model on every split; models and output directories are paired by position
processed_data = {
    model: {split: load_data(os.path.join(data_path, f'evaluation_output_{split}.pbz2')) for split in SPLITS} 
    for model, data_path in zip(MODELS, PROCESSED_DATA_PATHS)
}

In [28]:
# Sanity check: the outputs of every model must be aligned, sample by sample, with the raw corpus,
# since the following cells pair raw samples and model outputs by position
for split in ('validation', 'test'):
    for model in MODELS:
        outputs = processed_data[model][split]
        # zip() silently stops at the shortest input, so the lengths must be compared explicitly
        assert len(outputs) == len(data[split]), (
            f'{model} outputs on the {split} split have {len(outputs)} samples, expected {len(data[split])}'
        )
        assert all(is_same_sample(raw, out) for raw, out in zip(data[split], outputs)), (
            f'{model} outputs on the {split} split are not aligned with the raw samples'
        )

### Preprocessing

In [29]:
# Bag-of-words (term counts) vectoriser, used to prepare the input of the LDA model;
# it is fitted on the training responses of all the corpora except HOPE
tf = CountVectorizer(
    max_df=0.95, 
    min_df=2, 
    stop_words='english'
).fit([sample['response'] for sample in data['train'] if sample['corpus'] != 'HOPE'])

In [30]:
# TF-IDF vectoriser, used to represent the responses when computing the intrinsic clustering metrics
# NOTE(review): unlike the count vectoriser `tf`, this one is fitted on all the training responses,
# HOPE included -- confirm that the difference is intended
tfidf = TfidfVectorizer(
    # tokenizer=word_tokenize, 
    # stop_words=set(stopwords.words('english')) | set(string.punctuation), 
    max_df=0.9, 
    min_df=2,
    stop_words='english'
).fit([sample['response'] for sample in data['train']])

In [33]:
# LDA topic model with as many topics as latent codes, used as a term of comparison for the latent model;
# it is fitted on the term counts of the training responses of all the corpora except HOPE
lda = LatentDirichletAllocation(
    n_components=N_LATENTS,
    learning_method='online',
    learning_offset=50.0,
    random_state=RANDOM_SEED,
).fit(tf.transform([sample['response'] for sample in data['train'] if sample['corpus'] != 'HOPE']))

## Latent model analysis

### Entire data set

In [34]:
# Keep only the held-out splits (everything but the training one) for the evaluation
eval_data = {split: samples for split, samples in data.items() if split != 'train'}

#### Clusters evaluation

We sample the latent ID $n$ from the prior and posterior distributions at several sampling temperatures, compute the usual clustering metrics on each sample, and then average the results over the repetitions.

In [35]:
# TF-IDF representation (sparse matrix) of the responses of each evaluation split
X = {split: tfidf.transform([sample['response'] for sample in eval_data[split]]) for split in eval_data}

In [36]:
# H = {
#     split: {
#             model: {
#             DISTRIBUTION_MAP[distribution]: np.array([processed_data[model][split][hidden_state]])
#             for distribution, hidden_state in zip(DISTRIBUTIONS, HIDDEN_VECTORS)
#         } 
#         for model in MODELS
#     }
#     for split in eval_data
# }

In [37]:
# Log-probabilities of the latent codes predicted by each model, for every evaluation split and for
# both the prior and the posterior distribution (one row per sample)
log_p = {
    split: {
        model: {
            DISTRIBUTION_MAP[dist_key]: torch.log(
                torch.tensor([output[dist_key] for output in processed_data[model][split]])
            )
            for dist_key in DISTRIBUTIONS
        }
        for model in MODELS
    }
    for split in eval_data
}

In [38]:
# Log-probabilities of the LDA baseline, laid out like those of the latent model:
# the prior is uniform over the topics, the posterior is the topic distribution inferred by LDA
log_p_lda = {
    split: {
        'LDA': {
            'Prior': torch.log(
                torch.tensor([[1 / N_LATENTS] * N_LATENTS] * len(eval_data[split]))
            ),
            'Posterior': torch.log(
                torch.tensor(lda.transform(tf.transform([turn['response'] for turn in eval_data[split]])))
            )
        }
    }
    for split in eval_data
}

In [39]:
# Sampling temperatures to evaluate; a temperature of 0 selects the most likely latent (no sampling)
temperatures = [0.0, 0.333, 0.667, 0.9, 1.0, 1.5, 2.0]

In [40]:
# Accumulator of the computed scores (one tuple per score), later converted into a data frame
values = []

In [41]:
# Clustering metrics that do not need reference labels, computed on the vector representation of the samples
intrinsic_metrics = {'Silhouette': silhouette_score, 'Calinski-Harabasz index': calinski_harabasz_score}

In [42]:
# Accumulator of the counts of the sampled latents (one tuple per latent), later converted into a data frame
sampled_latents = []

In [None]:
def _sample_latents(log_probs, temperature):
    """Select one latent ID per row of a tensor of log-probabilities.

    With a positive temperature the IDs are drawn from the temperature-scaled distribution,
    otherwise the most likely latent of each row is selected.
    """
    if temperature > 0:
        probs = torch.softmax(log_probs / temperature, dim=-1)
        # Drop only the sampling dimension, so that a single-row input still yields a 1-D array
        return torch.multinomial(probs, 1).squeeze(-1).numpy()
    return torch.argmax(log_probs, dim=-1).numpy()


def _sample_all(log_probs_by_split, temperature):
    """Apply `_sample_latents` to every entry of a split -> model -> distribution dictionary."""
    return {
        split: {
            model: {
                distribution: _sample_latents(log_probs, temperature)
                for distribution, log_probs in distribution_dict.items()
            }
            for model, distribution_dict in model_dict.items()
        }
        for split, model_dict in log_probs_by_split.items()
    }


def _intrinsic_score(metric_fn, features, labels):
    """Compute an intrinsic clustering metric, returning NaN when it is undefined.

    The metrics need at least two clusters and fewer clusters than samples; this is not
    guaranteed here (e.g. the uniform LDA prior always selects the same latent at temperature 0).
    """
    n_clusters = len(np.unique(labels))
    if not 1 < n_clusters < len(labels):
        return float('nan')
    return metric_fn(features, labels)


# Start from empty accumulators so that re-running the cell does not duplicate the results
values = []
sampled_latents = []
for t in temperatures:
    for k in range(N_REPETITIONS):
        # Sample latent
        c = _sample_all(log_p, t)
        # Add sampling with LDA to have a term of comparison
        c_lda = _sample_all(log_p_lda, t)
        # Store sampled latents
        for split in c:
            for model in c[split]:
                for distribution in c[split][model]:
                    for latent, count in zip(*np.unique(c[split][model][distribution], return_counts=True)):
                        sampled_latents.append((t, k, model, distribution, latent, count, split.capitalize()))
        # Intrinsic metrics
        for metric, f in intrinsic_metrics.items():
            for split in ['validation', 'test']:
                # Convert the sparse TF-IDF matrix once per split instead of once per score
                features = X[split].toarray()
                # Latent model first, then the LDA baseline
                for assignments in (c[split], c_lda[split]):
                    for model, model_dict in assignments.items():
                        for distribution, latent in model_dict.items():
                            values.append((t, k, metric, _intrinsic_score(f, features, latent), model, distribution, None, 'TF-IDF', split))

In [None]:
# Long-format table of the scores computed on the entire evaluation set (one row per score)
df_eval = pd.DataFrame(values, columns=['Sampling temperature', 'Fold', 'Metric', 'Score', 'Model', 'Distribution', 'Label', 'Vector','Split'])

In [None]:
# Long-format table of the counts of the sampled latents (one row per latent observed in a sampling round)
sample_df_eval = pd.DataFrame(sampled_latents, columns=['Sampling temperature', 'Fold', 'Model', 'Distribution', 'Latent', 'Counts', 'Split'])

Plot the averaged distributions

In [None]:
# One figure per split: a grid of bar plots with the counts of the sampled latents,
# one row per distribution and one column per sampling temperature
for split in sample_df_eval['Split'].unique():
    tmp_df = sample_df_eval[sample_df_eval['Split'] == split]

    # squeeze=False keeps `axes` two-dimensional even with a single distribution or temperature
    fig, axes = plt.subplots(
        nrows=len(tmp_df['Distribution'].unique()),
        ncols=len(tmp_df['Sampling temperature'].unique()),
        figsize=(18, 6),
        sharex=True,
        sharey=True,
        squeeze=False
    )

    for i, (distribution, distribution_df) in enumerate(tmp_df.groupby('Distribution', sort=False)):
        for j, (t, temperature_df) in enumerate(distribution_df.groupby('Sampling temperature', sort=False)):
            # The bars aggregate the counts over the repetitions (folds) of the sampling
            sns.barplot(data=temperature_df, hue='Model', x='Latent', y='Counts', ax=axes[i][j], linewidth=1., edgecolor='0', order=[*range(N_LATENTS)])
            axes[i][j].set_title(f'Split: {split},\nDistribution: {distribution},\nSampling temperature: {t:.3f}')
    plt.tight_layout()
    plt.show()

#### Intrinsic evaluation

In [None]:
# Keep only the rows of the intrinsic metrics (membership is tested against the dictionary keys)
tmp_df = df_eval[df_eval['Metric'].isin(intrinsic_metrics)]

In [None]:
# Intrinsic scores on the validation split as a function of the sampling temperature:
# one panel per (vector representation, metric) pair, one line per (model, distribution) pair
sns.relplot(
    data=tmp_df[tmp_df['Split'] == 'validation'], 
    x='Sampling temperature', 
    y='Score', 
    style='Distribution', 
    row='Vector',
    col='Metric', 
    hue='Model', 
    kind='line'
)

In [None]:
# Intrinsic scores on the test split as a function of the sampling temperature:
# one panel per (vector representation, metric) pair, one line per (model, distribution) pair
sns.relplot(
    data=tmp_df[tmp_df['Split'] == 'test'], 
    x='Sampling temperature', 
    y='Score', 
    style='Distribution', 
    row='Vector',
    col='Metric',  
    hue='Model', 
    kind='line'
)

### Psychotherapy

We focus the analysis on the labels of the HOPE corpus, for multiple reasons:
- It is the target data set
- We have labels, so we can apply both an extrinsic and an intrinsic evaluation
- Its size allows for repeated sampling validation

In [None]:
# Labelled HOPE samples (those with both a speaker and a dialogue act) of the held-out splits
hope = {
    split: [
        turn for turn in samples
        if turn['corpus'] == 'HOPE' and turn['speaker'] is not None and turn['dialogue_act'] is not None
    ]
    for split, samples in data.items()
    if split != 'train'
}

#### Clusters evaluation

We sample the latent ID $n$ from the prior and posterior distributions at several sampling temperatures, compute the usual clustering metrics on each sample, and then average the results over the repetitions.

In [None]:
# TF-IDF representation (sparse matrix) of the labelled HOPE responses of each split
# (this replaces the representation of the entire evaluation set stored under the same name)
X = {split: tfidf.transform([sample['response'] for sample in hope[split]]) for split in hope}

In [None]:
# Integer-encoded reference labels of the HOPE samples, for each split and for each labelling
# NOTE(review): the HOPE samples are filtered on speaker and dialogue act only; a sample without
# a sentiment label would raise a KeyError here -- confirm that sentiment is always available
y = {
    split: {
        'Dialogue act': np.array([DIALOGUE_ACT_ENCODER[DIALOGUE_ACT_MAP[sample['dialogue_act']]] for sample in hope[split]]), 
        'Dialogue act type': np.array([DIALOGUE_ACT_TYPE_ENCODER[DIALOGUE_ACT_TYPE_MAP[sample['dialogue_act']]] for sample in hope[split]]), 
        'Speaker': np.array([SPEAKER_ENCODER[SPEAKER_MAP[sample['speaker']]] for sample in hope[split]]), 
        'Sentiment': np.array([SENTIMENT_ENCODER[SENTIMENT_MAP[sample['sentiment']]] for sample in hope[split]])
    } for split in hope
}

In [None]:
# H = {
#     split: {
#             model: {
#             DISTRIBUTION_MAP[distribution]: np.array([processed_data[model][split][hidden_state]])
#             for distribution, hidden_state in zip(DISTRIBUTIONS, HIDDEN_VECTORS)
#         } 
#         for model in MODELS
#     }
#     for split in y
# }

In [None]:
# Log-probabilities of the latent codes predicted by each model on the labelled HOPE samples,
# for every split and for both the prior and the posterior distribution (one row per sample)
log_p = {
    split: {
        model: {
            DISTRIBUTION_MAP[dist_key]: torch.log(
                torch.tensor([
                    output[dist_key]
                    for output in processed_data[model][split]
                    if output['corpus'] == 'HOPE' and output['speaker'] is not None and output['dialogue_act'] is not None
                ])
            )
            for dist_key in DISTRIBUTIONS
        }
        for model in MODELS
    }
    for split in y
}

In [None]:
# Log-probabilities of the LDA baseline on the HOPE samples, laid out like those of the latent model:
# the prior is uniform over the topics, the posterior is the topic distribution inferred by LDA
log_p_lda = {
    split: {
        'LDA': {
            'Prior': torch.log(
                torch.tensor([[1 / N_LATENTS] * N_LATENTS] * len(hope[split]))
            ),
            'Posterior': torch.log(
                torch.tensor(lda.transform(tf.transform([turn['response'] for turn in hope[split]])))
            )
        }
    }
    for split in hope
}

In [None]:
# Sampling temperatures to evaluate; a temperature of 0 selects the most likely latent (no sampling)
temperatures = [0.0, 0.333, 0.667, 0.9, 1.0, 1.5, 2.0]

In [None]:
# Accumulator of the scores computed on HOPE (one tuple per score), later converted into a data frame
values = []

In [None]:
# Clustering metrics comparing the sampled latents against the reference labels
extrinsic_metrics = {'Homogeneity': homogeneity_score, 'Completeness': completeness_score, 'V-Measure': v_measure_score}
# Clustering metrics that do not need reference labels, computed on the vector representation of the samples
intrinsic_metrics = {'Silhouette': silhouette_score, 'Calinski-Harabasz index': calinski_harabasz_score}

In [None]:
# Accumulator of the counts of the latents sampled on HOPE (one tuple per latent), later converted into a data frame
sampled_latents = []

In [None]:
# The helpers are defined in this cell so that it can be executed on its own

def _sample_latents(log_probs, temperature):
    """Select one latent ID per row of a tensor of log-probabilities.

    With a positive temperature the IDs are drawn from the temperature-scaled distribution,
    otherwise the most likely latent of each row is selected.
    """
    if temperature > 0:
        probs = torch.softmax(log_probs / temperature, dim=-1)
        # Drop only the sampling dimension, so that a single-row input still yields a 1-D array
        return torch.multinomial(probs, 1).squeeze(-1).numpy()
    return torch.argmax(log_probs, dim=-1).numpy()


def _sample_all(log_probs_by_split, temperature):
    """Apply `_sample_latents` to every entry of a split -> model -> distribution dictionary."""
    return {
        split: {
            model: {
                distribution: _sample_latents(log_probs, temperature)
                for distribution, log_probs in distribution_dict.items()
            }
            for model, distribution_dict in model_dict.items()
        }
        for split, model_dict in log_probs_by_split.items()
    }


def _intrinsic_score(metric_fn, features, labels):
    """Compute an intrinsic clustering metric, returning NaN when it is undefined.

    The metrics need at least two clusters and fewer clusters than samples; this is not
    guaranteed here (e.g. the uniform LDA prior always selects the same latent at temperature 0).
    """
    n_clusters = len(np.unique(labels))
    if not 1 < n_clusters < len(labels):
        return float('nan')
    return metric_fn(features, labels)


# Start from empty accumulators so that re-running the cell does not duplicate the results
values = []
sampled_latents = []
for t in temperatures:
    for k in range(N_REPETITIONS):
        # Sample latent
        c = _sample_all(log_p, t)
        # Add sampling with LDA to have a term of comparison
        c_lda = _sample_all(log_p_lda, t)
        # Store sampled latents
        for split in c:
            for model in c[split]:
                for distribution in c[split][model]:
                    for latent, count in zip(*np.unique(c[split][model][distribution], return_counts=True)):
                        sampled_latents.append((t, k, model, distribution, latent, count, split.capitalize()))
        # Extrinsic metrics
        for metric, f in extrinsic_metrics.items():
            for split in ['validation', 'test']:
                # Latent model first, then the LDA baseline
                for assignments in (c[split], c_lda[split]):
                    for model, model_dict in assignments.items():
                        for distribution, latent in model_dict.items():
                            for labelling, label in y[split].items():
                                values.append((t, k, metric, f(label, latent), model, distribution, labelling, None, split))
        # Intrinsic metrics
        for metric, f in intrinsic_metrics.items():
            for split in ['validation', 'test']:
                # Convert the sparse TF-IDF matrix once per split instead of once per score
                features = X[split].toarray()
                # Latent model first, then the LDA baseline
                for assignments in (c[split], c_lda[split]):
                    for model, model_dict in assignments.items():
                        for distribution, latent in model_dict.items():
                            values.append((t, k, metric, _intrinsic_score(f, features, latent), model, distribution, None, 'TF-IDF', split))
In [None]:
# Long-format table of the scores computed on the HOPE samples (one row per score)
df = pd.DataFrame(values, columns=['Sampling temperature', 'Fold', 'Metric', 'Score', 'Model', 'Distribution', 'Label', 'Vector','Split'])

In [None]:
# Long-format table of the counts of the latents sampled on HOPE (one row per latent observed in a sampling round)
sample_df = pd.DataFrame(sampled_latents, columns=['Sampling temperature', 'Fold', 'Model', 'Distribution', 'Latent', 'Counts', 'Split'])

Plot the averaged distributions

In [None]:
# One figure per split: a grid of bar plots with the counts of the sampled latents,
# one row per distribution and one column per sampling temperature
for split in sample_df['Split'].unique():
    tmp_df = sample_df[sample_df['Split'] == split]

    # squeeze=False keeps `axes` two-dimensional even with a single distribution or temperature
    fig, axes = plt.subplots(
        nrows=len(tmp_df['Distribution'].unique()),
        ncols=len(tmp_df['Sampling temperature'].unique()),
        figsize=(18, 6),
        sharex=True,
        sharey=True,
        squeeze=False
    )

    for i, (distribution, distribution_df) in enumerate(tmp_df.groupby('Distribution', sort=False)):
        for j, (t, temperature_df) in enumerate(distribution_df.groupby('Sampling temperature', sort=False)):
            # The bars aggregate the counts over the repetitions (folds) of the sampling
            sns.barplot(data=temperature_df, hue='Model', x='Latent', y='Counts', ax=axes[i][j], linewidth=1., edgecolor='0', order=[*range(N_LATENTS)])
            axes[i][j].set_title(f'Split: {split},\nDistribution: {distribution},\nSampling temperature: {t:.3f}')
    plt.tight_layout()
    plt.show()

#### Extrinsic evaluation

In [None]:
# Keep only the rows of the extrinsic metrics (membership is tested against the dictionary keys)
tmp_df = df[df['Metric'].isin(extrinsic_metrics)]

In [None]:
# Extrinsic scores on the validation split as a function of the sampling temperature:
# one panel per (labelling, metric) pair, one line per (model, distribution) pair
sns.relplot(
    data=tmp_df[tmp_df['Split'] == 'validation'], 
    x='Sampling temperature', 
    y='Score', 
    style='Distribution', 
    row='Label', 
    col='Metric', 
    hue='Model', 
    kind='line'
)

In [None]:
# Extrinsic scores on the test split as a function of the sampling temperature:
# one panel per (labelling, metric) pair, one line per (model, distribution) pair
sns.relplot(
    data=tmp_df[tmp_df['Split'] == 'test'], 
    x='Sampling temperature', 
    y='Score', 
    style='Distribution', 
    row='Label', 
    col='Metric', 
    hue='Model', 
    kind='line'
)

#### Intrinsic evaluation

In [None]:
# Keep only the rows of the intrinsic metrics (membership is tested against the dictionary keys)
tmp_df = df[df['Metric'].isin(intrinsic_metrics)]

In [None]:
# Intrinsic scores on the validation split as a function of the sampling temperature:
# one panel per (vector representation, metric) pair, one line per (model, distribution) pair
sns.relplot(
    data=tmp_df[tmp_df['Split'] == 'validation'], 
    x='Sampling temperature', 
    y='Score', 
    style='Distribution', 
    row='Vector',
    col='Metric', 
    hue='Model', 
    kind='line'
)

In [None]:
# Intrinsic scores on the test split as a function of the sampling temperature:
# one panel per (vector representation, metric) pair, one line per (model, distribution) pair
sns.relplot(
    data=tmp_df[tmp_df['Split'] == 'test'], 
    x='Sampling temperature', 
    y='Score', 
    style='Distribution', 
    row='Vector',
    col='Metric',  
    hue='Model', 
    kind='line'
)

### Terms distribution

In [None]:
# TF-IDF representation of the labelled HOPE responses, computed separately for each speaker of each split
# (this replaces the per-split representation stored under the same name)
X = {
    split: {
        speaker: tfidf.transform([turn['response'] for turn in speaker_turns])
        for speaker, speaker_turns in group_by(
            [
                turn for turn in hope[split]
                if turn['corpus'] == 'HOPE' and turn['speaker'] is not None and turn['dialogue_act'] is not None
            ],
            'speaker'
        ).items()
    }
    for split in hope
}

In [None]:
# TF-IDF vectors of the HOPE responses weighted by the probability that the fine-tuned model assigns
# to each latent code; layout: split -> latent -> speaker -> distribution -> (n_samples, n_terms) array
hope_data_vectors = {}
for split, split_outputs in processed_data['Fine-Tuned'].items():
    if split == 'train':
        continue
    # Group the labelled HOPE outputs by speaker once per split (the result does not depend on the latent)
    outputs_by_speaker = group_by(
        [
            output for output in split_outputs
            if output['corpus'] == 'HOPE' and output['speaker'] is not None and output['dialogue_act'] is not None
        ],
        'speaker'
    )
    # Convert the sparse TF-IDF matrices once per speaker instead of once per latent and distribution
    dense_tfidf = {speaker: X[split][speaker].toarray() for speaker in outputs_by_speaker}
    # The weights are paired with the TF-IDF rows by position, so the two must have the same length
    assert all(
        dense_tfidf[speaker].shape[0] == len(speaker_outputs)
        for speaker, speaker_outputs in outputs_by_speaker.items()
    ), f'Model outputs and TF-IDF vectors of the {split} split are not aligned'
    hope_data_vectors[split] = {
        latent: {
            speaker: {
                DISTRIBUTION_MAP[distribution]: (
                    dense_tfidf[speaker]
                    * np.array([output[distribution][latent] for output in speaker_outputs]).reshape(-1, 1)
                )
                for distribution in DISTRIBUTIONS
            }
            for speaker, speaker_outputs in outputs_by_speaker.items()
        }
        for latent in range(N_LATENTS)
    }

In [None]:
# Terms sorted by feature index, so that vocabulary[i] is the word behind the i-th TF-IDF feature
vocabulary = np.array(sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get))
# For each distribution there is one column per speaker plus one column for the overall total
n_speakers = len(SPEAKER_MAP)
cols_per_distribution = n_speakers + 1

# One figure per split: one row per latent code, showing the terms with the highest cumulative
# (probability-weighted) TF-IDF for each speaker and overall, under the prior and the posterior
for split, split_data in hope_data_vectors.items():
    fig, axes = plt.subplots(
        nrows=N_LATENTS,
        ncols=cols_per_distribution * len(DISTRIBUTION_MAP),
        figsize=(24, 40),
        squeeze=False
    )
    for j, (latent, latent_data) in enumerate(split_data.items()):
        # Cumulative vectors of each speaker, grouped by distribution, to be summed for the overall plot
        Z = {}
        for k, (speaker, speaker_data) in enumerate(latent_data.items()):
            for d, (distribution, distribution_data) in enumerate(speaker_data.items()):
                # Cumulative weighted TF-IDF of each term over the samples of the speaker
                x = np.asarray(distribution_data.sum(axis=0)).ravel()
                Z.setdefault(distribution, []).append(x.copy())
                # Indices of the top terms, in ascending order so that the best one is drawn on top
                idxs = x.argsort()[: -TOP_WORDS - 1 : -1][::-1]
                ax = axes[j][(cols_per_distribution * d) + k]
                ax.barh(vocabulary[idxs], x[idxs], linewidth=1., edgecolor='0')
                ax.set_title(f'Split: {split.capitalize()},\nLatent: {latent},\nDistribution: {distribution},\nSpeaker: {speaker}')
                ax.set_xlabel('Cumulative TF-IDF')
                ax.set_ylabel('Word')
        for d, (distribution, distribution_data) in enumerate(Z.items()):
            # Overall cumulative weighted TF-IDF, regardless of the speaker
            x = sum(distribution_data)
            idxs = x.argsort()[: -TOP_WORDS - 1 : -1][::-1]
            # The overall plot goes in the last column of the group of its distribution
            ax = axes[j][(cols_per_distribution * d) + n_speakers]
            ax.barh(vocabulary[idxs], x[idxs], linewidth=1., edgecolor='0', color='tab:red')
            ax.set_title(f'Split: {split.capitalize()},\nLatent: {latent},\nDistribution: {distribution}\n')
            ax.set_xlabel('Cumulative TF-IDF')
            ax.set_ylabel('Word')

    plt.tight_layout()
    plt.show()