# Evaluation

## Preliminaries

### Imports

In [2]:
import sys
import os
sys.path.append('./src')

In [3]:
import pickle
import bz2
from collections import Counter

In [4]:
from joblib import Parallel
from joblib import delayed
from joblib import parallel_backend

In [5]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [7]:
import pandas as pd

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.metrics import (
    homogeneity_score,
    completeness_score,
    v_measure_score,
    silhouette_score,
    calinski_harabasz_score,
)

In [9]:
import numpy as np

In [10]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import random

### Constants and global variables

In [30]:
# Data split names; SPLITS[1:] ('validation', 'test') are the splits for which model outputs are loaded.
SPLITS = ('train', 'validation', 'test')
# Labels of the evaluated models, paired positionally with PROCESSED_DATA_PATHS.
MODELS = ('Pretrained', 'Fine-Tuned')
# Corpus names, compared against the 'corpus' field of each sample.
DATA_SETS = ('DailyDialog', 'EmpatheticDialogues', 'Persona-Chat', 'Wizard of Wikipedia', 'HOPE', 'Counselling and Psychotherapy Transcripts Volume II')
# Sample keys under which the per-sample latent distributions are read.
DISTRIBUTIONS = ('latent_prior_dist', 'latent_posterior_dist')

In [28]:
# Directory containing the cached corpus files (evaluation_corpus_<split>.pbz2).
DATA_PATH = '../resources/data/cache/'
# One directory of model outputs (evaluation_output_<split>.pbz2) per entry of MODELS, in the same order.
PROCESSED_DATA_PATHS = (
    '../experiments/DLDLM_evaluation/dldlm_large_nllt_lda_lr_2022_11_19_16_46_13/data', 
    '../experiments/DLDLM_evaluation/dldlm_large_nllt_gibbs_2022_11_20_09_34_54/data'
)

In [14]:
# Number of discrete latent codes; code indices run over range(N_LATENTS).
N_LATENTS = 16

In [15]:
# NOTE(review): not referenced by any visible cell; presumably a projection dimensionality — confirm or remove.
N_COMPONENTS = 1280

In [16]:
# Number of times latent codes are re-sampled when estimating the pairwise similarity between codes.
N_REPETITIONS = 100

In [17]:
# Display names for the sample keys listed in DISTRIBUTIONS.
DISTRIBUTION_MAP = {'latent_prior_dist': 'Prior', 'latent_posterior_dist': 'Posterior'}

In [18]:
# Number of highest-weighted words plotted for each latent code.
TOP_WORDS = 20

In [19]:
# Seed shared by the Python, NumPy and PyTorch random number generators.
RANDOM_SEED = 2307

### Random seed

In [20]:
# Seed the Python, NumPy and PyTorch global generators with the same value.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# Last expression of the cell: the Generator returned by torch is what the cell displays.
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f7fc2721f70>

### Helper functions

In [21]:
def load_data(path):
    """Read a bz2-compressed pickle file and return the object stored in it.

    Unpickling can execute arbitrary code, so this must only be pointed at
    files produced by a trusted source.

    :param path: location of the compressed pickle file.
    :return: the unpickled Python object.
    """
    with bz2.BZ2File(path, 'r') as archive:
        return pickle.load(archive)

In [22]:
def is_same_sample(sample_a, sample_b):
    """Tell whether two sample dictionaries refer to the same dialogue turn.

    Two samples match when their 'corpus', 'conversation_idx' and 'turn_idx'
    entries are all equal; every other entry is ignored.

    :param sample_a: first sample dictionary.
    :param sample_b: second sample dictionary.
    :return: True if the three identifying entries coincide, False otherwise.
    """
    identifying_keys = ('corpus', 'conversation_idx', 'turn_idx')
    return all(sample_a[key] == sample_b[key] for key in identifying_keys)

In [41]:
def fuzzy_jaccard_similarity(matrix_a, matrix_b):
    """Weighted (fuzzy) Jaccard similarity between the column sums of two matrices.

    Each matrix is first collapsed along axis 0; the score is the sum of the
    element-wise minima of the two resulting vectors divided by the sum of
    their element-wise maxima.

    :param matrix_a: first matrix (anything exposing ``sum(axis=0)``).
    :param matrix_b: second matrix, with the same number of columns.
    :return: the ratio described above; with NumPy scalars the result is
        ``nan`` when both column-sum vectors are entirely zero (0 / 0).
    """
    column_totals_a = matrix_a.sum(axis=0)
    column_totals_b = matrix_b.sum(axis=0)
    shared_weight = np.minimum(column_totals_a, column_totals_b).sum()
    combined_weight = np.maximum(column_totals_a, column_totals_b).sum()
    return shared_weight / combined_weight

## Data

### Load data

In [31]:
# Raw evaluation corpus: maps every split name to the samples loaded from its cached file.
data = dict(
    (split, load_data(os.path.join(DATA_PATH, f'evaluation_corpus_{split}.pbz2')))
    for split in SPLITS
)

In [32]:
# Model outputs, indexed as processed_data[model][split].
# Only the splits after the first entry of SPLITS are loaded.
processed_data = dict(
    (
        model,
        dict(
            (split, load_data(os.path.join(data_path, f'evaluation_output_{split}.pbz2')))
            for split in SPLITS[1:]
        ),
    )
    for model, data_path in zip(MODELS, PROCESSED_DATA_PATHS)
)

In [33]:
# Sanity check: for every split with model outputs, each model's samples must line up
# one-to-one with the raw corpus (same corpus, conversation and turn at every position).
for split in SPLITS[1:]:
    reference_samples = data[split]
    for model in MODELS:
        model_samples = processed_data[model][split]
        # zip() stops at the shortest input, so a truncated or oversized output file
        # would otherwise pass the element-wise comparison unnoticed.
        assert len(model_samples) == len(reference_samples), (
            f'{model} / {split}: expected {len(reference_samples)} samples, found {len(model_samples)}'
        )
        assert all(
            is_same_sample(reference, processed)
            for reference, processed in zip(reference_samples, model_samples)
        ), f'{model} / {split}: samples are not aligned with the raw corpus'

### Preprocessing

In [34]:
# TF-IDF vocabulary and weights, fitted once on the responses of every split of the raw corpus.
# The processed outputs hold the same samples (see the alignment asserts), so the raw data suffices.
tfidf = TfidfVectorizer(
    tokenizer=word_tokenize,
    # scikit-learn documents `stop_words` as 'english', a list or None; newer releases
    # validate the argument type and reject a set, so a (sorted, deterministic) list is passed.
    stop_words=sorted(set(stopwords.words('english')) | set(string.punctuation)),
    max_df=0.95,
    min_df=2,
).fit([sample['response'] for samples in data.values() for sample in samples])



## Generator model analysis

### Single corpora

In [None]:
# Perplexity (mean and standard deviation) of each model on each split, one line per corpus.
for model, splits in processed_data.items():
    for split, samples in splits.items():
        print(model, split.capitalize())
        for corpus in DATA_SETS:
            ppl = np.array([sample['ppl'] for sample in samples if sample['corpus'] == corpus])
            print(f'\t{corpus}', f'{ppl.mean():.2f}', f'{ppl.std():.2f}')

### All corpora

In [None]:
# Perplexity (mean and standard deviation) of each model on each split, over all corpora together.
for model, splits in processed_data.items():
    for split, samples in splits.items():
        print(model, split.capitalize())
        ppl = np.array([sample['ppl'] for sample in samples])
        print('\tTotal', f'{ppl.mean():.2f}', f'{ppl.std():.2f}')

### LaTeX

In [None]:
# Emit the perplexity results as LaTeX table rows: one row per corpus, then a rule and a
# final row over all corpora. Columns follow the model / split iteration order.
for corpus in DATA_SETS:
    values = []
    for model, splits in processed_data.items():
        for split, samples in splits.items():
            ppl = np.array([sample['ppl'] for sample in samples if sample['corpus'] == corpus])
            # '\\pm' spells the LaTeX macro explicitly instead of relying on an invalid '\p' escape
            values.append(f'{ppl.mean():.2f} \\pm {ppl.std():.2f}')
    print(f'\\mathbf{{{corpus}}}', *values, sep=' & ', end='\\\\\n')
print('\\midrule')
values = []
for model, splits in processed_data.items():
    for split, samples in splits.items():
        ppl = np.array([sample['ppl'] for sample in samples])
        values.append(f'{ppl.mean():.2f} \\pm {ppl.std():.2f}')
# The aggregate row gets its own label; the loop variable `corpus` still holds the last data set here.
print('\\mathbf{Total}', *values, sep=' & ', end='\\\\\n')

## Latent model analysis

### Single corpora

In [None]:
# Latent code analysis, per corpus.
# For every corpus / model / split / distribution:
#   * sample a latent code per response N_REPETITIONS times and measure the fuzzy Jaccard
#     similarity between the TF-IDF content of every pair of codes;
#   * accumulate the TF-IDF weight of each word under the most probable (argmax) code.
latent_results = dict()

# Row / column indices of the strict upper triangle, i.e. every unordered pair (i, j) with i < j.
# Indexing with this tuple of arrays selects exactly those entries; a *list* of index sequences
# is treated by current NumPy as an array index (row selection) and would average the wrong values.
pair_rows, pair_cols = np.triu_indices(N_LATENTS, k=1)

for corpus in DATA_SETS:
    latent_results[corpus] = dict()
    for model, splits in processed_data.items():
        latent_results[corpus][model] = dict()
        for split, samples in splits.items():
            latent_results[corpus][model][split] = dict()
            corpus_samples = [sample for sample in samples if sample['corpus'] == corpus]
            X = tfidf.transform([sample['response'] for sample in corpus_samples])
            # Cumulative TF-IDF weight of each vocabulary term over the whole corpus subset
            X_total = np.asarray(X.sum(axis=0))
            term_idxs = X_total.ravel().nonzero()[0]
            # inverse_transform returns one array of terms per input row; a single row is passed,
            # so the terms aligned with `term_idxs` are in the first (and only) element.
            terms = tfidf.inverse_transform(X_total)[0]
            for distribution in DISTRIBUTIONS:
                probs = torch.tensor([sample[distribution] for sample in corpus_samples])
                z_pred = torch.argmax(probs, dim=1).numpy()
                X_z = np.array([np.asarray(X[z_pred == i].sum(axis=0)).ravel() for i in range(N_LATENTS)])
                # Diagonal stays at 1 (a code compared with itself)
                res_mat = np.ones((N_REPETITIONS, N_LATENTS, N_LATENTS))
                for repetition in range(N_REPETITIONS):
                    # squeeze(1) keeps a 1-D vector even when there is a single sample
                    z = torch.multinomial(probs, 1).squeeze(1).numpy()
                    # Slice the rows of each code once instead of once per pair
                    X_by_code = [X[z == i] for i in range(N_LATENTS)]
                    for i, j in zip(pair_rows, pair_cols):
                        res_mat[repetition, i, j] = res_mat[repetition, j, i] = fuzzy_jaccard_similarity(X_by_code[i], X_by_code[j])
                # Mean off-diagonal similarity of each repetition
                scores = res_mat[:, pair_rows, pair_cols].mean(axis=1)
                latent_results[corpus][model][split][DISTRIBUTION_MAP[distribution]] = {
                    'avg': scores.mean(),
                    'std': scores.std(),
                    'mat': res_mat.mean(axis=0),
                    'counts': [Counter(dict(zip(terms, X_z_i[term_idxs]))) for X_z_i in X_z]
                }

for corpus, corpus_results in latent_results.items():
    for model, model_results in corpus_results.items():
        for split, split_results in model_results.items():
            print(model, split.capitalize(), corpus)
            for distribution, distribution_results in split_results.items():
                print(f'\t{distribution}', f'{distribution_results["avg"]:.2f}', f'{distribution_results["std"]:.2f}')

### All corpora

In [None]:
# Latent code analysis over all corpora together, stored under the 'Total' key of latent_results.
corpus = 'Total'

# Row / column indices of the strict upper triangle, i.e. every unordered pair (i, j) with i < j.
# Indexing with this tuple of arrays selects exactly those entries; a *list* of index sequences
# is treated by current NumPy as an array index (row selection) and would average the wrong values.
pair_rows, pair_cols = np.triu_indices(N_LATENTS, k=1)

latent_results[corpus] = dict()
for model, splits in processed_data.items():
    latent_results[corpus][model] = dict()
    for split, samples in splits.items():
        latent_results[corpus][model][split] = dict()
        X = tfidf.transform([sample['response'] for sample in samples])
        # Cumulative TF-IDF weight of each vocabulary term over all samples
        X_total = np.asarray(X.sum(axis=0))
        term_idxs = X_total.ravel().nonzero()[0]
        # inverse_transform returns one array of terms per input row; a single row is passed,
        # so the terms aligned with `term_idxs` are in the first (and only) element.
        terms = tfidf.inverse_transform(X_total)[0]
        for distribution in DISTRIBUTIONS:
            # Everything derived from the distribution is computed inside this loop, so the
            # Prior and Posterior entries are each built from their own probabilities.
            probs = torch.tensor([sample[distribution] for sample in samples])
            z_pred = torch.argmax(probs, dim=1).numpy()
            X_z = np.array([np.asarray(X[z_pred == i].sum(axis=0)).ravel() for i in range(N_LATENTS)])
            # Diagonal stays at 1 (a code compared with itself); the shape must be a single tuple
            res_mat = np.ones((N_REPETITIONS, N_LATENTS, N_LATENTS))
            for repetition in range(N_REPETITIONS):
                # squeeze(1) keeps a 1-D vector even when there is a single sample
                z = torch.multinomial(probs, 1).squeeze(1).numpy()
                # Slice the rows of each code once instead of once per pair
                X_by_code = [X[z == i] for i in range(N_LATENTS)]
                for i, j in zip(pair_rows, pair_cols):
                    res_mat[repetition, i, j] = res_mat[repetition, j, i] = fuzzy_jaccard_similarity(X_by_code[i], X_by_code[j])
            # Mean off-diagonal similarity of each repetition
            scores = res_mat[:, pair_rows, pair_cols].mean(axis=1)
            latent_results[corpus][model][split][DISTRIBUTION_MAP[distribution]] = {
                'avg': scores.mean(),
                'std': scores.std(),
                'mat': res_mat.mean(axis=0),
                'counts': [Counter(dict(zip(terms, X_z_i[term_idxs]))) for X_z_i in X_z]
            }

for model, model_results in latent_results[corpus].items():
    for split, split_results in model_results.items():
        print(model, split.capitalize(), corpus)
        for distribution, distribution_results in split_results.items():
            print(f'\t{distribution}', f'{distribution_results["avg"]:.2f}', f'{distribution_results["std"]:.2f}')

### LaTeX

In [None]:
# Emit the latent analysis results as LaTeX table rows, one row per entry of latent_results.
# Columns follow the model / split / distribution iteration order.
for corpus, corpus_results in latent_results.items():
    values = [
        # Double quotes inside the single-quoted f-string keep this valid before Python 3.12;
        # '\\pm' spells the LaTeX macro explicitly instead of relying on an invalid '\p' escape.
        f'{distribution_results["avg"]:.2f} \\pm {distribution_results["std"]:.2f}'
        for model_results in corpus_results.values()
        for split_results in model_results.values()
        for distribution_results in split_results.values()
    ]
    # A single rule separates the aggregate row from the per-corpus rows
    if corpus == 'Total':
        print('\\midrule')
    print(f'\\mathbf{{{corpus}}}', *values, sep=' & ', end='\\\\\n')

### Visualisations

#### All corpora

In [None]:
# Heat map of the average pairwise similarity between latent codes (test split, posterior),
# one figure per selected corpus and model.
code_labels = [str(code + 1) for code in range(N_LATENTS)]
for corpus in ['Total', 'HOPE', 'Counselling and Psychotherapy Transcripts Volume II']:
    for model in MODELS:
        fig, ax = plt.subplots(figsize=(6, 6))
        sns.heatmap(
            latent_results[corpus][model]['test']['Posterior']['mat'],
            vmin=0.,
            vmax=1.,
            annot=True,
            xticklabels=code_labels,
            yticklabels=code_labels,
            cmap='Blues',
            cbar=False,
            ax=ax,
        )
        ax.set_xlabel('Latent code')
        ax.set_ylabel('Latent code')
        plt.show()

        # To export the figure:
        # fig.savefig(f"test_jaccard_cm_{model.lower().replace('-', '_')}_{corpus.lower().replace(' ', '_')}.pdf", bbox_inches='tight')

In [None]:
# Bar charts of the highest-weighted words of every latent code (test split, posterior),
# one figure per selected corpus and model, with one panel per code.
n_cols = max(1, N_LATENTS // 4)
# Ceiling division with integers only: enough rows to host every latent code
n_rows = (N_LATENTS + n_cols - 1) // n_cols
for corpus in ['Total', 'HOPE', 'Counselling and Psychotherapy Transcripts Volume II']:
    for model in MODELS:
        counts = latent_results[corpus][model]['test']['Posterior']['counts']
        # squeeze=False keeps `axes` two-dimensional whatever the grid size
        fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(24, 24), squeeze=False)
        # axes.flat walks the grid row by row, so panel order matches latent code order
        for z_idx, ax in enumerate(axes.flat):
            if z_idx >= N_LATENTS:
                # Spare panel when the grid has more cells than latent codes
                ax.axis('off')
                continue
            top_words = counts[z_idx].most_common(TOP_WORDS)
            # most_common yields (word, weight) pairs: words label the bars, weights set their length
            words, weights = zip(*top_words) if top_words else ((), ())
            ax.barh(list(words), list(weights), linewidth=1., edgecolor='0')
            ax.set_title(f'Latent code: {z_idx + 1}')
            ax.set_xlabel('Cumulative TF-IDF')
            ax.set_ylabel('Word')
        plt.tight_layout()
        plt.show()

        # To export the figure:
        # fig.savefig(f"test_top_words_{model.lower().replace('-', '_')}_{corpus.lower().replace(' ', '_')}.pdf", bbox_inches='tight')