# Evaluation

## Preliminaries

### Imports

In [1]:
import sys
import os
sys.path.append('./src')

In [2]:
import pickle
import bz2

In [3]:
import pandas as pd

In [4]:
from dldlm.chatbot_api.chatbot import DLDLMChatbot

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import numpy as np

In [6]:
import torch

In [7]:
import random

### Constants

In [8]:
DATA_PATH = '../resources/data/cache/'

In [9]:
DF_COLUMNS = ['Split', 'Corpus', 'Conversation ID', 'Turn IDX', 'Speaker', 'Utterance']
OUT_DF_COLUMNS = ['Split', 'Corpus', 'Conversation ID', 'Turn IDX', 'Speaker', 'Context', 'Last message', 'Response', 'Model']

In [10]:
GENERATE_KWARGS = {'top_p': 1.0, 'top_k': 0, 'temperature': 0.7, 'do_sample': True}

In [11]:
N_SAMPLES = 100

In [12]:
RANDOM_SEED = 2307

### Random seed

In [13]:
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f4bbe3a63f0>

## Data

### Load data

In [14]:
with bz2.BZ2File(os.path.join(DATA_PATH, 'evaluation_corpus_test.pbz2'), 'r') as f:
    data = pickle.load(f)

### Convert to Data Frame

In [15]:
df = pd.DataFrame(
    [
        (
            sample['split'],
            sample['corpus'],
            sample['conversation_idx'],
            sample['turn_idx'],
            sample['speaker'],
            sample['response']
        )
        for sample in data
        if sample['corpus'] == 'HOPE'
    ],
    columns=DF_COLUMNS
).sort_values(by=['Conversation ID', 'Turn IDX'])

### Randomly sample conversations

In [16]:
eval_data = random.choices([sample for sample in data if sample['corpus'] == 'HOPE' and sample.get('speaker') == 'Therapist'], k=N_SAMPLES)

## Models

In [17]:
chatbot = DLDLMChatbot(
    '../resources/models/dldlm_pretraining',
    None,
    max_context_len=256,
    max_response_len=128,
    generate_kwargs=GENERATE_KWARGS
)

In [18]:
therabot = DLDLMChatbot(
  '../resources/models/therapy_dldlm',
  None,
  max_context_len=256,
  max_response_len=128,
  generate_kwargs=GENERATE_KWARGS
)

## Evaluation data preparation

### Responses generation

In [19]:
out_data = []

In [20]:
# Iterate over dialogues
for sample in eval_data:
    #
    dialogue = df[(df['Conversation ID'] == sample['conversation_idx']) & (df['Turn IDX'] <= sample['turn_idx'])].sort_values(by=['Turn IDX'])
    # Prepare dialogue history
    history = [turn['Utterance'] for _, turn in dialogue[dialogue['Turn IDX'] < sample['turn_idx']].iterrows()]
    message = history[-1] if len(history) >= 1 else ''
    # Prepare context
    context = '\n'.join(
        f"{turn['Speaker']}: {turn['Utterance']}"
        for _, turn in dialogue[dialogue['Turn IDX'] < sample['turn_idx']].iterrows()
    ) + '\n' + 'Therapist: ...'
    context = context.strip()
    # Prepare original response
    original_response = sample['response']
    # Generate response with base model
    response_baseline = chatbot(history)
    # Generate response with fine-tuned model
    response = therabot(history)
    # Add original and generated responses to output data
    out_data.append((sample['split'], sample['corpus'], sample['conversation_idx'], sample['turn_idx'], 'Therapist', context, f'Patient: {message}', f'Therapist: {original_response}', 'Ground truth'))
    out_data.append((sample['split'], sample['corpus'], sample['conversation_idx'], sample['turn_idx'], 'Therapist', context, f'Patient: {message}', f'Therapist: {response_baseline}', 'DLDLM'))
    out_data.append((sample['split'], sample['corpus'], sample['conversation_idx'], sample['turn_idx'], 'Therapist', context, f'Patient: {message}', f'Therapist: {response}', 'Therapy-DLDLM'))

Token indices sequence length is longer than the specified maximum sequence length for this model (1589 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1589 > 1024). Running this sequence through the model will result in indexing errors


In [21]:
out_df = pd.DataFrame(out_data, columns=OUT_DF_COLUMNS)
out_df

Unnamed: 0,Split,Corpus,Conversation ID,Turn IDX,Speaker,Context,Last message,Response,Model
0,test,HOPE,10,14,Therapist,"Therapist: All right, Chris, so I need to talk...","Patient: Well, supermom.","Therapist: Yeah. So the issue again, is, I can...",Ground truth
1,test,HOPE,10,14,Therapist,"Therapist: All right, Chris, so I need to talk...","Patient: Well, supermom.",Therapist: Sonny.,DLDLM
2,test,HOPE,10,14,Therapist,"Therapist: All right, Chris, so I need to talk...","Patient: Well, supermom.",Therapist: All right.,Therapy-DLDLM
3,test,HOPE,20,2,Therapist,Therapist: What's going on? What are you feeli...,Patient: I'm feeling disgust,Therapist: Disgust about what.,Ground truth
4,test,HOPE,20,2,Therapist,Therapist: What's going on? What are you feeli...,Patient: I'm feeling disgust,Therapist: Really? Why?,DLDLM
...,...,...,...,...,...,...,...,...,...
295,test,HOPE,37,22,Therapist,"Therapist: Hello, Susan.\nPatient: Hello\nTher...",Patient: okay. Okay.,"Therapist: Okay, so like a little confession. ...",DLDLM
296,test,HOPE,37,22,Therapist,"Therapist: Hello, Susan.\nPatient: Hello\nTher...",Patient: okay. Okay.,Therapist: So I have a lot of things that I ne...,Therapy-DLDLM
297,test,HOPE,7,10,Therapist,Therapist: How are you? Not well?\nPatient: No...,"Patient: Yeah. I was, I thought, you know, I e...","Therapist: and that's not gonna happen. No, no...",Ground truth
298,test,HOPE,7,10,Therapist,Therapist: How are you? Not well?\nPatient: No...,"Patient: Yeah. I was, I thought, you know, I e...",Therapist: Yeah. You should talk to the manage...,DLDLM


### Serialise data

In [22]:
out_df.to_csv(os.path.join(DATA_PATH, 'therapy_dldlm_empathy_assessment_samples.csv'), index=False)