# Evaluation

## Preliminaries

### Imports

In [None]:
import sys
import os
sys.path.append('./src')

In [None]:
import pickle
import bz2

In [None]:
import pandas as pd

In [None]:
from dldlm.chatbot_api.chatbot import DLDLMChatbot

In [None]:
import numpy as np

In [None]:
import torch

In [None]:
import random

### Constants

In [None]:
# Directory the pickled evaluation corpus is read from and the generated samples CSV is written to
DATA_PATH = '../resources/data/cache/'

In [None]:
# Columns of the turn-level data frame built from the corpus samples
DF_COLUMNS = [
    'Split',
    'Corpus',
    'Conversation ID',
    'Turn IDX',
    'Speaker',
    'Utterance',
]
# Columns of the exported samples: the turn identifiers above (everything but the utterance)
# followed by the fields describing each candidate response
OUT_DF_COLUMNS = [
    *DF_COLUMNS[:-1],
    'Context',
    'Last message',
    'Response',
    'Model',
]

In [None]:
# Decoding options handed to both chatbots through their `generate_kwargs` argument
GENERATE_KWARGS = {
    'top_p': 1.0,
    'top_k': 0,
    'temperature': 0.7,
    'do_sample': True,
}

In [None]:
# Number of HOPE therapist turns drawn from the corpus to build the evaluation set
N_SAMPLES = 100

In [None]:
# Seed shared by the Python, NumPy and PyTorch random number generators
RANDOM_SEED = 2307

### Random seed

In [None]:
# Seed the Python, NumPy and PyTorch random number generators with the same value
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(RANDOM_SEED)

## Data

### Load data

In [None]:
# Read the bz2-compressed pickle 'evaluation_corpus_test.pbz2' from the data directory
# NOTE: unpickling can execute arbitrary code, only load cache files from a trusted source
corpus_path = os.path.join(DATA_PATH, 'evaluation_corpus_test.pbz2')
with bz2.BZ2File(corpus_path, 'r') as corpus_file:
    data = pickle.load(corpus_file)

### Convert to Data Frame

In [None]:
# Sample fields to extract, listed in the same order as the DF_COLUMNS they populate
turn_fields = ('split', 'corpus', 'conversation_idx', 'turn_idx', 'speaker', 'response')
# One row per HOPE sample; samples from any other corpus are discarded
hope_rows = [
    tuple(sample[field] for field in turn_fields)
    for sample in data
    if sample['corpus'] == 'HOPE'
]
df = pd.DataFrame(hope_rows, columns=DF_COLUMNS)
# Order the turns by conversation and by position inside the conversation
df = df.sort_values(by=['Conversation ID', 'Turn IDX'])

### Randomly sample therapist turns

In [None]:
# Candidate turns: every HOPE sample whose response is attributed to the therapist
therapist_samples = [
    sample
    for sample in data
    if sample['corpus'] == 'HOPE' and sample.get('speaker') == 'Therapist'
]
# Draw without replacement so that no turn ends up twice in the assessment set
# (`random.choices` samples with replacement and can return the same turn repeatedly);
# the sample size is capped at the number of candidates, since `random.sample` raises
# a `ValueError` when asked for more items than the population holds
eval_data = random.sample(therapist_samples, k=min(N_SAMPLES, len(therapist_samples)))

## Models

In [None]:
# Baseline chatbot loaded from the 'dldlm_pretraining' model directory
chatbot = DLDLMChatbot(
    '../resources/models/dldlm_pretraining',
    None,
    max_context_len=256,
    max_response_len=128,
    generate_kwargs=GENERATE_KWARGS,
)
# Move the underlying network to the GPU (a CUDA device is required)
baseline_device = torch.device('cuda')
chatbot.nn_model = chatbot.nn_model.to(baseline_device)

In [None]:
# Chatbot loaded from the 'therapy_dldlm' model directory, configured like the baseline
therabot = DLDLMChatbot(
    '../resources/models/therapy_dldlm',
    None,
    max_context_len=256,
    max_response_len=128,
    generate_kwargs=GENERATE_KWARGS,
)
# Move the underlying network to the GPU (a CUDA device is required)
therabot_device = torch.device('cuda')
therabot.nn_model = therabot.nn_model.to(therabot_device)

## Evaluation data preparation

### Response generation

In [None]:
# Accumulator for the output rows (one tuple per candidate response, laid out as OUT_DF_COLUMNS)
out_data = []

In [None]:
# Iterate over the sampled therapist turns; each one yields three output rows
# (ground truth, baseline model response, fine-tuned model response)
for sample in eval_data:
    # Turns of the sampled conversation up to and including the reference turn, in dialogue order
    dialogue = df[
        (df['Conversation ID'] == sample['conversation_idx']) & (df['Turn IDX'] <= sample['turn_idx'])
    ].sort_values(by=['Turn IDX'])
    # Turns preceding the reference turn, selected once and shared by history and context
    past_turns = dialogue[dialogue['Turn IDX'] < sample['turn_idx']]
    # Prepare dialogue history
    history = past_turns['Utterance'].tolist()
    # Last message before the response (empty when the reference turn opens the conversation)
    message = history[-1] if history else ''
    # Prepare context: speaker-tagged transcript followed by a placeholder for the therapist turn
    context = '\n'.join(
        f'{speaker}: {utterance}'
        for speaker, utterance in zip(past_turns['Speaker'], past_turns['Utterance'])
    ) + '\n' + 'Therapist: ...'
    context = context.strip()
    # Prepare original response: the sample carries the reference utterance itself
    # (it is the value the 'Utterance' column of the data frame was built from)
    original_response = sample['response']
    # Generate response with base model
    response_baseline = chatbot(history)
    # Generate response with fine-tuned model
    response = therabot(history)
    # Add original and generated responses to output data
    shared_fields = (
        sample['split'],
        sample['corpus'],
        sample['conversation_idx'],
        sample['turn_idx'],
        'Therapist',
        context,
        message,
    )
    out_data.extend(
        (*shared_fields, candidate, model_name)
        for candidate, model_name in (
            (original_response, 'Ground truth'),
            (response_baseline, 'DLDLM'),
            (response, 'Therapy-DLDLM'),
        )
    )

In [None]:
# Gather the collected rows into a data frame with the output column layout
out_df = pd.DataFrame(out_data, columns=OUT_DF_COLUMNS)

### Serialise data

In [None]:
# Write the samples next to the cached corpus, without the data frame index column
samples_path = os.path.join(DATA_PATH, 'empathy_assessment_samples.csv')
out_df.to_csv(samples_path, index=False)