# Data exploration

## Preliminaries

### Imports

In [None]:
import os

In [None]:
import numpy as np
import pandas as pd

In [None]:
% matplitlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
from transformers import AutoTokenizer

In [None]:
from programmable_chatbot.corpus import CORPORA

In [None]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

### Constants

In [None]:
# Directory holding the raw corpora (relative path); each corpus is read from the
# sub-directory named after its corpus id.
RAW_DATA_PATH = '../resources/data/raw'

In [None]:
# Names of the data splits; every corpus is loaded once per split and the
# statistics cells iterate over them in this order.
SPLITS = ('train', 'validation', 'test')

In [None]:
# NOTE(review): not referenced by any cell visible in this notebook -- presumably
# meant for reproducible sampling; confirm it is still needed.
RANDOM_SEED = 2307

In [None]:
# Extra keyword arguments forwarded to the corpus constructor, keyed by corpus id.
# Corpora that are not listed here are constructed without extra arguments.
CORPUS_KWARGS = {
    'Counseling_and_Psychotherapy_Transcripts_Volume_II': {'holdout': 50}
}

In [None]:
# Column names of the per-utterance data frame `df`; the order matches the tuples
# assembled when that data frame is built.
DF_COLUMNS = ['corpus_id', 'split', 'dialogue_id', 'utterance_id', 'speaker', 'n_tokens']

In [None]:
# Corpus ids grouped by domain; each group is reported by its own statistics loop.
OPEN_DOMAIN_DATA = ('dailydialog', 'empatheticdialogues', 'personachat', 'wizard_of_wikipedia', 'IEMOCAP_full_release', 'Topical-Chat-master')
THERAPY_DATA = ('Counseling_and_Psychotherapy_Transcripts_Volume_II', 'HOPE_WSDM_2022', 'Empathy-Mental-Health-master', 'Counsel_Chat')

In [None]:
# When true, the statistics cells append a LaTeX "\pm" term after each printed mean.
UNCERTAINTY = False

In [None]:
# Data frame columns for the DailyDialog plots. The entries after 'Split' are turned
# into metadata keys with get_metadata_column_id: LOCAL labels are read from each
# utterance, GLOBAL labels from each dialogue.
COLUMNS_DD_LOCAL = ['Split', 'Emotion', 'Dialogue act']
COLUMNS_DD_GLOBAL = ['Split', 'Topic']

In [None]:
# Data frame columns for the IEMOCAP plots; the last three are the emotion dimensions.
COLUMNS_IEMOCAP = ['Split', 'Emotion', 'Valence', 'Activation', 'Dimension'.replace('Dimension', 'Dominance')]
# Category display orders for valence, activation and dominance, listed from the
# highest level down to the lowest.
COL_ORDER_I_V = list(map(str.capitalize, reversed(['very negative', 'negative', 'neutral', 'positive', 'very positive'])))
COL_ORDER_I_A = list(map(str.capitalize, reversed(['very low', 'low', 'medium', 'high', 'very high'])))
COL_ORDER_I_D = list(map(str.capitalize, reversed(['very weak', 'weak', 'medium', 'strong', 'very strong'])))
# Same order as COLUMNS_IEMOCAP[-3:], so the two can be zipped together.
COL_ORDER_I_LIST = [COL_ORDER_I_V, COL_ORDER_I_A, COL_ORDER_I_D]

In [None]:
# Data frame columns for the HOPE plots; the entries after 'Split' are read from
# each utterance via get_metadata_column_id.
COLUMNS_HOPE = ['Split', 'Speaker', 'Dialogue act', 'Dialogue act category']
# Order of the 'Speaker' hue levels in the HOPE count plots.
HUE_ORDER_HOPE = ['Therapist', 'Patient']

In [None]:
# Data frame columns for the EPITOME plots; one panel is drawn per entry after 'Split'.
COLUMNS_EPITOME = ['Split', 'Emotional reaction', 'Exploration', 'Interpretation']
# Category display order shared by all EPITOME panels, strongest level first.
COL_ORDER_E = list(map(str.capitalize, reversed(['no communication', 'weak communication', 'strong communication'])))

### Helper functions

In [None]:
def preprocess_metadata(s: str) -> str:
    """Return *s* as a display label.

    The first character is upper-cased and the rest lower-cased, then every
    underscore is turned into a space.
    """
    capitalised = s.capitalize()
    return ' '.join(capitalised.split('_'))

def get_metadata_column_id(s: str) -> str:
    """Return the metadata key for display label *s*.

    The label is lower-cased and every space is turned into an underscore.
    """
    lowered = s.lower()
    return '_'.join(lowered.split(' '))

### Global variables

In [None]:
# Pretrained 'gpt2' tokenizer; it is handed to every corpus constructor when the
# data sets are loaded.
gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')

## Data

Load data sets into dictionary.

In [None]:
def _load_split(corpus_id, split):
    """Instantiate the corpus class registered under *corpus_id* for *split* and return its ``data``."""
    corpus = CORPORA[corpus_id](
        os.path.join(RAW_DATA_PATH, corpus_id),
        split,
        gpt2_tokenizer,
        **CORPUS_KWARGS.get(corpus_id, dict())
    )
    return corpus.data


# Nested mapping: corpus id -> split name -> loaded data, for every registered corpus.
data = {
    corpus_id: {split: _load_split(corpus_id, split) for split in SPLITS}
    for corpus_id in CORPORA
}

Extract per-utterance information (corpus, split, dialogue and utterance index, speaker, token count) and convert it to a data frame.

In [None]:
def _iter_utterance_rows(corpora):
    """Yield one tuple per utterance in *corpora*, with fields ordered like DF_COLUMNS."""
    for corpus, splits in corpora.items():
        for split, dialogues in splits.items():
            for dialogue_idx, dialogue in enumerate(dialogues):
                for utterance_idx, utterance in enumerate(dialogue['utterances']):
                    # Token count comes from NLTK's word tokenizer; the speaker may be absent (None).
                    n_tokens = len(word_tokenize(utterance['text']))
                    yield corpus, split, dialogue_idx, utterance_idx, utterance.get('speaker'), n_tokens


df = pd.DataFrame(list(_iter_utterance_rows(data)), columns=DF_COLUMNS)

### Exploration

Compute data set stats

In [None]:
def _format_mean_std(values):
    """Format the mean of *values* with one decimal, followed by a LaTeX plus-minus std term when UNCERTAINTY is set."""
    cell = f"{values.mean():.1f}"
    if UNCERTAINTY:
        cell += f" \\pm {values.std():.1f}"
    return cell


# Print one LaTeX table row (cells joined by ' & ') per split / open-domain corpus pair.
# NOTE(review): the replacement fields of the original f-strings were empty, which is a
# SyntaxError. The statistics chosen here (number of dialogues, utterances per dialogue,
# tokens per utterance) are a reconstruction from the columns of `df` -- confirm them
# against the intended table layout.
for split in SPLITS:
    for corpus in OPEN_DOMAIN_DATA:
        tmp_df = df[(df['split'] == split) & (df['corpus_id'] == corpus)]
        # Number of utterances in each dialogue of this corpus/split.
        utterances_per_dialogue = tmp_df.groupby('dialogue_id').size()
        print(
            f"{len(utterances_per_dialogue):d}",
            _format_mean_std(utterances_per_dialogue),
            _format_mean_std(tmp_df['n_tokens']),
            sep=' & '
        )

In [None]:
def _format_mean_std(values):
    """Format the mean of *values* with one decimal, followed by a LaTeX plus-minus std term when UNCERTAINTY is set."""
    cell = f"{values.mean():.1f}"
    if UNCERTAINTY:
        cell += f" \\pm {values.std():.1f}"
    return cell


# Print one LaTeX table row (cells joined by ' & ') per split / therapy corpus pair.
# NOTE(review): the replacement fields of the original f-strings were empty, which is a
# SyntaxError. The statistics chosen here (number of dialogues, utterances per dialogue,
# tokens per utterance overall / for the therapist / for the patient) are a
# reconstruction from the columns of `df` -- confirm them against the intended table.
for split in SPLITS:
    for corpus in THERAPY_DATA:
        tmp_df = df[(df['split'] == split) & (df['corpus_id'] == corpus)]
        tmp_df_t = tmp_df[tmp_df['speaker'] == 'therapist']
        tmp_df_p = tmp_df[tmp_df['speaker'] == 'patient']
        # Number of utterances in each dialogue of this corpus/split.
        utterances_per_dialogue = tmp_df.groupby('dialogue_id').size()
        print(
            f'{len(utterances_per_dialogue):d}',
            _format_mean_std(utterances_per_dialogue),
            _format_mean_std(tmp_df['n_tokens']),
            _format_mean_std(tmp_df_t['n_tokens']),
            _format_mean_std(tmp_df_p['n_tokens']),
            sep=' & '
        )

### Visualisation

#### DailyDialog

In [None]:
# One row per DailyDialog utterance: the split name followed by the utterance-level
# labels named in COLUMNS_DD_LOCAL (looked up through their metadata keys).
tmp_df = pd.DataFrame(
    [
        (part,) + tuple(turn[get_metadata_column_id(column)] for column in COLUMNS_DD_LOCAL[1:])
        for part, conversations in data['dailydialog'].items()
        for conversation in conversations
        for turn in conversation['utterances']
    ],
    columns=COLUMNS_DD_LOCAL
)

In [None]:
# Count plot of the DailyDialog emotion labels, one bar per split, on a log-scaled axis.
# NOTE(review): the original `figsize` was an empty tuple, which matplotlib cannot
# unpack into (width, height); (8, 4) inches is a placeholder -- adjust as needed.
fig = plt.figure(figsize=(8, 4))
sns.countplot(data=tmp_df, x='Emotion', hue='Split', linewidth=1., edgecolor='0')
plt.yscale('log')
plt.ylim([1, 1.e5])
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Write the figure currently bound to `fig` to 'ddemo.pdf' (relative path), using a tight bounding box.
fig.savefig('ddemo.pdf', bbox_inches='tight')

In [None]:
# Count plot of the DailyDialog dialogue-act labels, one bar per split, on a log-scaled axis.
# NOTE(review): the original cell only contained a dangling `fig =` (a SyntaxError);
# this body mirrors the emotion plot of the same data frame and is a reconstruction --
# confirm column, figure size and axis limits.
fig = plt.figure(figsize=(8, 4))
sns.countplot(data=tmp_df, x='Dialogue act', hue='Split', linewidth=1., edgecolor='0')
plt.yscale('log')
plt.ylim([1, 1.e5])
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Write the figure currently bound to `fig` to 'ddda.pdf' (relative path), using a tight bounding box.
fig.savefig('ddda.pdf', bbox_inches='tight')

In [None]:
# One row per DailyDialog dialogue: the split name followed by the dialogue-level
# labels named in COLUMNS_DD_GLOBAL (looked up through their metadata keys).
tmp_df = pd.DataFrame(
    [
        (part,) + tuple(conversation[get_metadata_column_id(column)] for column in COLUMNS_DD_GLOBAL[1:])
        for part, conversations in data['dailydialog'].items()
        for conversation in conversations
    ],
    columns=COLUMNS_DD_GLOBAL
)

In [None]:
# Count plot of the DailyDialog topic labels, one bar per split, on a log-scaled axis.
# NOTE(review): the original cell only contained a dangling `fig =` (a SyntaxError);
# this body mirrors the other DailyDialog count plots and is a reconstruction --
# confirm figure size and axis limits.
fig = plt.figure(figsize=(8, 4))
sns.countplot(data=tmp_df, x='Topic', hue='Split', linewidth=1., edgecolor='0')
plt.yscale('log')
plt.ylim([1, 1.e5])
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Write the figure currently bound to `fig` to 'ddtopic.pdf' (relative path), using a tight bounding box.
fig.savefig('ddtopic.pdf', bbox_inches='tight')

#### IEMOCAP

In [None]:
# One row per IEMOCAP utterance: the split name followed by the utterance-level
# labels named in COLUMNS_IEMOCAP (looked up through their metadata keys).
tmp_df = pd.DataFrame(
    [
        (part,) + tuple(turn[get_metadata_column_id(column)] for column in COLUMNS_IEMOCAP[1:])
        for part, conversations in data['IEMOCAP_full_release'].items()
        for conversation in conversations
        for turn in conversation['utterances']
    ],
    columns=COLUMNS_IEMOCAP
)


In [None]:
# Count plot of the categorical IEMOCAP emotion labels, one bar per split, on a log-scaled axis.
# NOTE(review): the original cell only contained a dangling `fig =` (a SyntaxError);
# this body mirrors the DailyDialog emotion plot and is a reconstruction -- confirm
# figure size and axis limits.
fig = plt.figure(figsize=(8, 4))
sns.countplot(data=tmp_df, x='Emotion', hue='Split', linewidth=1., edgecolor='0')
plt.yscale('log')
plt.ylim([1, 1.e5])
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Write the figure currently bound to `fig` to 'iemocapemocat.pdf' (relative path), using a tight bounding box.
fig.savefig('iemocapemocat.pdf', bbox_inches='tight')

In [None]:
# One horizontal count plot per IEMOCAP emotion dimension (the last three columns),
# split-coloured, sharing a log-scaled count axis.
# NOTE(review): the original `figsize` was an empty tuple; (15, 4) inches is a placeholder.
fig, axes = plt.subplots(
    nrows=1,
    ncols=3,
    figsize=(15, 4),
    sharex=True
)

for i, (label, label_order) in enumerate(zip(COLUMNS_IEMOCAP[-3:], COL_ORDER_I_LIST)):
    sns.countplot(
        data=tmp_df, hue='Split', y=label, order=label_order, ax=axes[i], linewidth=1., edgecolor='0', orient='h'
    )
    axes[i].set_title(f'Dimension: {label}')
    axes[i].set_xscale('log')
    # With a single row, `axes` is one-dimensional, so panels are addressed as axes[i].
    axes[i].set_xlim([1, 1e5])
    axes[i].set_xlabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Write the figure currently bound to `fig` to 'iemocapemodim.pdf' (relative path), using a tight bounding box.
fig.savefig('iemocapemodim.pdf', bbox_inches='tight')

#### HOPE

In [None]:
# One row per HOPE utterance: the split name followed by the utterance-level labels
# named in COLUMNS_HOPE (looked up through their metadata keys).
tmp_df = pd.DataFrame(
    [
        (split, *(utterance[get_metadata_column_id(label)] for label in COLUMNS_HOPE[1:]))
        for split, dialogues in data['HOPE_WSDM_2022'].items()
        for dialogue in dialogues
        for utterance in dialogue['utterances']
    ],
    # The rows are built from COLUMNS_HOPE, so the frame must be labelled with it too
    # (the IEMOCAP column list has a different length and different names).
    columns=COLUMNS_HOPE
)

In [None]:
# One horizontal count plot of HOPE dialogue-act categories per split, coloured by
# speaker, sharing both axes; counts are log-scaled.
# NOTE(review): the original `figsize` was an empty tuple; (15, 6) inches is a placeholder.
# NOTE(review): HUE_ORDER_HOPE uses capitalised speaker labels while the statistics cell
# filters `speaker` on lower-case 'therapist'/'patient' -- confirm which casing HOPE emits.
hope_splits = tmp_df['Split'].unique()
fig, axes = plt.subplots(
    nrows=1,
    ncols=len(hope_splits),
    figsize=(15, 6),
    sharex=True,
    sharey=True,
    # Always return a 2-D axes array so indexing also works when only one split is present.
    squeeze=False
)

for i, split in enumerate(hope_splits):
    # countplot accepts only one of x / y; y alone gives horizontal bars per category.
    sns.countplot(
        data=tmp_df[tmp_df['Split'] == split], hue='Speaker', y='Dialogue act category', hue_order=HUE_ORDER_HOPE,
        ax=axes[0][i], linewidth=1., edgecolor='0', orient='h'
    )
    axes[0][i].set_title(f'Split: {split}')
    axes[0][i].set_xscale('log')
    axes[0][i].set_xlim([1, 1e5])
    axes[0][i].set_xlabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Write the figure currently bound to `fig` to 'hopedac.pdf' (relative path), using a tight bounding box.
fig.savefig('hopedac.pdf', bbox_inches='tight')

In [None]:
# One horizontal count plot of HOPE dialogue acts per split, coloured by speaker,
# sharing both axes; counts are log-scaled.
# NOTE(review): the original cell only contained a dangling `fig =` (a SyntaxError);
# this body follows the layout of the dialogue-act-category plot and is a
# reconstruction -- confirm column, figure size and axis limits.
hope_splits = tmp_df['Split'].unique()
fig, axes = plt.subplots(
    nrows=1,
    ncols=len(hope_splits),
    figsize=(15, 6),
    sharex=True,
    sharey=True,
    # Always return a 2-D axes array so indexing also works when only one split is present.
    squeeze=False
)

for i, split in enumerate(hope_splits):
    sns.countplot(
        data=tmp_df[tmp_df['Split'] == split], hue='Speaker', y='Dialogue act', hue_order=HUE_ORDER_HOPE,
        ax=axes[0][i], linewidth=1., edgecolor='0', orient='h'
    )
    axes[0][i].set_title(f'Split: {split}')
    axes[0][i].set_xscale('log')
    axes[0][i].set_xlim([1, 1e5])
    axes[0][i].set_xlabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Write the figure currently bound to `fig` to 'hopeda.pdf' (relative path), using a tight bounding box.
fig.savefig('hopeda.pdf', bbox_inches='tight')

#### EPITOME

In [None]:
# One row per EPITOME dialogue: the split name followed by the labels named in
# COLUMNS_EPITOME, read from the dialogue's last utterance.
tmp_df = pd.DataFrame(
    [
        (split, *(dialogue['utterances'][-1][get_metadata_column_id(label)] for label in COLUMNS_EPITOME[1:]))
        for split, dialogues in data['Empathy-Mental-Health-master'].items()
        for dialogue in dialogues
    ],
    # The rows are built from COLUMNS_EPITOME, so the frame must be labelled with it too
    # (the IEMOCAP column list has a different length and different names).
    columns=COLUMNS_EPITOME
)

In [None]:
# One horizontal count plot per EPITOME label column (everything after 'Split'),
# split-coloured, sharing a log-scaled count axis.
# NOTE(review): the original `figsize` was an empty tuple; (15, 4) inches is a placeholder.
fig, axes = plt.subplots(
    nrows=1,
    ncols=len(COLUMNS_EPITOME[1:]),
    figsize=(15, 4),
    sharex=True
)

for i, label in enumerate(COLUMNS_EPITOME[1:]):
    sns.countplot(
        data=tmp_df, hue='Split', y=label, order=COL_ORDER_E, ax=axes[i], linewidth=1., edgecolor='0', orient='h'
    )
    axes[i].set_title(label)
    axes[i].set_xscale('log')
    # With a single row of several panels, `axes` is one-dimensional, so panels are addressed as axes[i].
    axes[i].set_xlim([1, 1e5])
    axes[i].set_xlabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Write the figure currently bound to `fig` to 'epitomeemp.pdf' (relative path), using a tight bounding box.
fig.savefig('epitomeemp.pdf', bbox_inches='tight')