# Evaluation

## Preliminaries

### Imports

In [1]:
import sys
import os
sys.path.append('./src')

In [2]:
import pickle
import bz2

In [3]:
import pandas as pd

In [4]:
from dldlm.chatbot_api.chatbot import DLDLMChatbot

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import numpy as np

In [6]:
import torch

In [7]:
import random

### Constants

In [8]:
# Directory (relative to the notebook's working directory) from which the evaluation corpus is read
# and into which the generated evaluation samples are written
DATA_PATH = '../resources/data/cache/'

In [9]:
# Columns of the frame holding one row per dialogue turn
DF_COLUMNS = [
    'Split',
    'Corpus',
    'Conversation ID',
    'Turn IDX',
    'Speaker',
    'Utterance',
]
# Columns of the evaluation frame: the turn metadata, then the textual context, the candidate
# response and the label identifying where that response comes from
OUT_DF_COLUMNS = [*DF_COLUMNS[:-1], 'Context', 'Utterance', 'Label']

In [10]:
# Decoding settings handed to both chatbots through `generate_kwargs`.
# Alternative configurations kept for reference (swap one in to use it):
# GENERATE_KWARGS = {'top_p': 1.0, 'top_k': 0, 'temperature': 0.95, 'do_sample': True}
# GENERATE_KWARGS = {'top_p': 0.95, 'top_k': 0, 'temperature': 1.0, 'do_sample': True}
# GENERATE_KWARGS = {'top_p': 1.0, 'top_k': 4, 'temperature': 1.0, 'do_sample': False, 'penalty_alpha': 0.6}
# Active configuration:
GENERATE_KWARGS = dict(
    top_p=1.0,
    top_k=0,
    temperature=0.7,
    do_sample=True,
)

In [11]:
# Upper bound on the number of conversations kept by the random subsampling step
N_SAMPLES = 20

In [12]:
# Single seed used for the Python, NumPy and PyTorch random number generators
RANDOM_SEED = 2307

### Random seed

In [13]:
# Seed the Python, NumPy and PyTorch random number generators with the same value so that
# the randomised steps below start from a known state
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f30ea4ba3f0>

## Data

### Load data

In [14]:
# Read the bz2-compressed pickle holding the evaluation samples.
# CAUTION: unpickling executes arbitrary code, so only load cache files produced by this project.
with bz2.open(os.path.join(DATA_PATH, 'evaluation_corpus_test.pbz2'), 'rb') as f:
    data = pickle.load(f)

### Convert to Data Frame

In [15]:
# Keep only the samples of the HOPE corpus, one row per turn, ordered by conversation and then by
# position of the turn inside the conversation. The key tuple below lists the sample fields in the
# same order as DF_COLUMNS.
df = pd.DataFrame(
    [
        tuple(
            sample[key]
            for key in ('split', 'corpus', 'conversation_idx', 'turn_idx', 'speaker', 'response')
        )
        for sample in data
        if sample['corpus'] == 'HOPE'
    ],
    columns=DF_COLUMNS,
).sort_values(by=['Conversation ID', 'Turn IDX'])

### Randomly subsample conversations

In [16]:
# Draw a random subset of conversations: shuffle the distinct identifiers in place (this uses the
# seeded global NumPy generator) and keep the first N_SAMPLES of them
conversation_ids = df['Conversation ID'].unique()
np.random.shuffle(conversation_ids)
conversation_ids = conversation_ids[:N_SAMPLES]

# Retain only the turns that belong to one of the selected conversations
df = df.loc[df['Conversation ID'].isin(conversation_ids)]

## Models

In [17]:
# Base model used as the baseline in the comparison, loaded from the `dldlm` model directory and
# decoding with the shared generation settings
chatbot = DLDLMChatbot(
    '../resources/models/dldlm',
    max_context_len=256,
    max_response_len=128,
    generate_kwargs=GENERATE_KWARGS
)
# Run on the GPU when one is available; fall back to the CPU instead of failing on machines
# without CUDA
chatbot.nn_model = chatbot.nn_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [18]:
# Fine-tuned model under evaluation, loaded from the `therabot` model directory with the same
# length limits and generation settings as the baseline
therabot = DLDLMChatbot(
    '../resources/models/therabot',
    max_context_len=256,
    max_response_len=128,
    generate_kwargs=GENERATE_KWARGS
)
# Run on the GPU when one is available; fall back to the CPU instead of failing on machines
# without CUDA
therabot.nn_model = therabot.nn_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

## Evaluation data preparation

### Response generation

In [19]:
# Accumulator for the evaluation rows; each entry is a tuple laid out as OUT_DF_COLUMNS
out_data = []

In [20]:
# Iterate over dialogues
# Start from an empty accumulator so that re-running this cell does not append duplicate rows
out_data = []

# For every selected dialogue pick one reference therapist turn, then store the original response
# together with one response generated by each model for the same dialogue history
for i, (dialogue_id, dialogue) in enumerate(df.groupby('Conversation ID', sort=False)):
    # Sampling window: 15%-25% of the dialogue for even-numbered dialogues, 45%-55% for odd ones
    # NOTE(review): the bounds are derived from the number of rows but compared with 'Turn IDX';
    # this presumes turn indices are contiguous and zero-based -- confirm
    s_idx = int(len(dialogue) * (0.15 if i % 2 == 0 else 0.45))
    e_idx = int(len(dialogue) * (0.25 if i % 2 == 0 else 0.55))
    candidate_turns = dialogue[
        (dialogue['Speaker'] == 'Therapist') & (dialogue['Turn IDX'] >= s_idx) & (dialogue['Turn IDX'] < e_idx)
    ]
    # A short dialogue can have no therapist turn inside the window, and sampling from an empty
    # selection raises; skip such dialogues explicitly instead of aborting the whole run
    if candidate_turns.empty:
        print(f"Skipping conversation {dialogue_id}: no therapist turn in window [{s_idx}, {e_idx})")
        continue
    reference_turn = candidate_turns.sample(1).iloc[0]
    # Turn metadata
    split = reference_turn['Split']
    corpus = reference_turn['Corpus']
    turn_idx = reference_turn['Turn IDX']
    # Turns preceding the reference turn (selected once, used for both history and context)
    preceding_turns = dialogue[dialogue['Turn IDX'] < turn_idx]
    # Dialogue history given to the models: the bare utterances, in order
    history = preceding_turns['Utterance'].tolist()
    # Context shown to the annotators: speaker-prefixed turns followed by a placeholder for the
    # therapist response being evaluated
    context = '\n'.join(
        f"{speaker}: {utterance}"
        for speaker, utterance in zip(preceding_turns['Speaker'], preceding_turns['Utterance'])
    ) + '\n' + 'Therapist: ...'
    # Original response
    response = reference_turn['Utterance']
    # Generate response with base model
    response_baseline = chatbot(history)
    # Generate response with fine-tuned model
    response_therabot = therabot(history)
    # Add original and generated responses to output data
    out_data.append((split, corpus, dialogue_id, turn_idx, 'Therapist', context, response, 'Ground truth'))
    out_data.append((split, corpus, dialogue_id, turn_idx, 'Therapist', context, response_baseline, 'DLDLM'))
    out_data.append((split, corpus, dialogue_id, turn_idx, 'Therapist', context, response_therabot, 'TheraBot'))

In [21]:
# Collect the accumulated rows (three per evaluated dialogue: ground truth and the two models)
# into a single frame
out_df = pd.DataFrame(out_data, columns=OUT_DF_COLUMNS)

### Display data

In [22]:
# Print every evaluated dialogue as it would be presented for evaluation: the context once,
# followed by the candidate responses in random order
for _, candidates in out_df.groupby('Conversation ID', sort=False):
    # Random order for the alternative responses
    shuffled = candidates.sample(frac=1)
    print('--------')
    # The context is printed a single time, taken from the first shuffled row
    print(shuffled.iloc[0]['Context'])
    for utterance in shuffled['Utterance']:
        print('\n')
        print(f"Therapist: {utterance}")
    print('--------')

--------
Therapist: Hi Angela, how are you doing today ?
Patient: I am all right,
Therapist: you, all right ?
Patient: Yeah. There's been some changes in my life but um, yeah.
Therapist: Good changes ?
Patient: Uhh Not really.
Therapist: Not really.
Patient: Yeah.
Therapist: What kind of changes ?
Patient: You know, just, you know, uh, you know, I lost my job. So, things are kinda you know, dominal effects from that. So,
Therapist: so there's been consequences from those in your job.
Patient: Yeah, I mean, obviously, that's, you know, you have a job. So you can take care of certain things. So, yeah,
Therapist: like what things ?
Patient: like you know like paying your bills and that's pretty much it. I guess that's really why most people have a job.
Therapist: See, you've been unable to pay your bills.
Patient: Yeah, yeah, definitely.
Therapist: How's your mood overall?
Patient: I mean, what can I say I lost my job. I'm seeing you because I have a drinking problem. So things are not th

### Serialise data

In [23]:
# Write the evaluation samples to a CSV file in DATA_PATH, without the index column.
# The commented-out lines are alternative output file names (presumably one per decoding
# configuration tried in GENERATE_KWARGS -- confirm); only the last line is active.
# out_df.to_csv(os.path.join(DATA_PATH, 'human_evaluation_samples_temp.csv'), index=False)
# out_df.to_csv(os.path.join(DATA_PATH, 'human_evaluation_samples_nucl.csv'), index=False)
# out_df.to_csv(os.path.join(DATA_PATH, 'human_evaluation_samples_cont.csv'), index=False)
out_df.to_csv(os.path.join(DATA_PATH, 'human_evaluation_samples_alt.csv'), index=False)