# Data exploration

## Preliminaries

### Imports

In [None]:
import os

In [None]:
import numpy as np
import pandas as pd

In [None]:
% matplitlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
from transformers import AutoTokenizer

In [None]:
from programmable_chatbot.corpus import CORPORA

In [None]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

### Constants

In [None]:
# Root directory of the raw corpora; each corpus is read from the sub-directory named after its id
RAW_DATA_PATH = '../resources/data/raw'

In [None]:
# Data set partitions that are loaded for every corpus and reported in the statistics
SPLITS = ('train', 'validation', 'test')

In [None]:
# NOTE(review): not referenced in the visible part of this notebook -- presumably a seed for reproducibility, confirm
RANDOM_SEED = 2307

In [None]:
# Extra keyword arguments forwarded to the corpus constructor, keyed by corpus id
# (corpora without an entry get no extra arguments)
CORPUS_KWARGS = {
    'Counseling_and_Psychotherapy_Transcripts_Volume_II': {'holdout': 50}
}

In [None]:
# Column names of the per-utterance data frame built from the loaded corpora
DF_COLUMNS = ['corpus_id', 'split', 'dialogue_id', 'utterance_id', 'speaker', 'n_tokens']

In [None]:
# Corpus ids covered by the open-domain statistics and by the therapy statistics, respectively
OPEN_DOMAIN_DATA = ('dailydialog', 'empatheticdialogues', 'personachat', 'wizard_of_wikipedia', 'IEMOCAP_full_release', 'Topical-Chat-master')
THERAPY_DATA = ('Counseling_and_Psychotherapy_Transcripts_Volume_II', 'HOPE_WSDM_2022', 'Empathy-Mental-Health-master', 'Counsel_Chat')

In [None]:
# When True, every printed statistic is followed by an additional " \pm ..." part
UNCERTAINTY = False

In [None]:
# NOTE(review): the constants in the following cells are all empty lists and are not
# referenced in the visible part of this notebook -- they look like placeholders, confirm
COLUMNS_DD = []

In [None]:
COLUMNS_IEMOCAP = []
# NOTE(review): the same name is bound three times in a row; each assignment simply
# overwrites the previous one -- the names were presumably meant to differ, confirm
COL_ORDER_ = []
COL_ORDER_ = []
COL_ORDER_ = []
COL_ORDER_LIST = []

In [None]:
COLUMNS_HOPE = []

In [None]:
COLUMNS_EPITOME = []
# NOTE(review): rebinds COL_ORDER_ once more
COL_ORDER_ = []


### Global variables

In [None]:
# Pretrained 'gpt2' tokenizer; it is handed to every corpus constructor when the data is loaded
gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')

## Data

Load data sets into dictionary.

In [None]:
def _load_corpus_splits(corpus_id):
    """Return a dict mapping each split in SPLITS to the `.data` of the loaded corpus."""
    corpus_cls = CORPORA[corpus_id]
    # Every corpus lives in the sub-directory of the raw data folder named after its id
    corpus_dir = os.path.join(RAW_DATA_PATH, corpus_id)
    # Corpus-specific constructor arguments (none for corpora without an entry)
    extra_kwargs = CORPUS_KWARGS.get(corpus_id, dict())
    return {
        split: corpus_cls(corpus_dir, split, gpt2_tokenizer, **extra_kwargs).data
        for split in SPLITS
    }


# Nested mapping: corpus id -> split -> loaded data
data = {corpus_id: _load_corpus_splits(corpus_id) for corpus_id in CORPORA}

Extract per-utterance information (corpus, split, dialogue and utterance index, speaker, token count) and convert it to a data frame.

In [None]:
# One record per utterance: where it comes from (corpus, split, position of the dialogue
# within the split, position of the utterance within the dialogue), who spoke it
# (None when the utterance has no 'speaker' entry) and its NLTK word-token count
utterance_records = [
    (
        corpus_name,
        split_name,
        dialogue_idx,
        utterance_idx,
        utt.get('speaker'),
        len(word_tokenize(utt['text'])),
    )
    for corpus_name, corpus_splits in data.items()
    for split_name, split_dialogues in corpus_splits.items()
    for dialogue_idx, dlg in enumerate(split_dialogues)
    for utterance_idx, utt in enumerate(dlg['utterances'])
]
df = pd.DataFrame(utterance_records, columns=DF_COLUMNS)

### Exploration

Compute the data set statistics and print them as ` & `-separated table rows.

In [None]:
# Print one ' & '-separated table row per split and open-domain corpus.
# NOTE(review): every f-string replacement field was empty in the original (a
# SyntaxError), so the statistics below are a reconstruction -- number of dialogues,
# utterances per dialogue and word tokens per utterance; confirm against the intended table.
for split in SPLITS:
    for corpus in OPEN_DOMAIN_DATA:
        tmp_df = df[(df['split'] == split) & (df['corpus_id'] == corpus)]
        # Number of utterances of every dialogue in this corpus/split
        turns_per_dialogue = tmp_df.groupby('dialogue_id').size()
        n_tokens = tmp_df['n_tokens']
        # Raw f-strings keep the backslash of "\pm" literal instead of relying on an invalid escape sequence
        print(
            f"{len(turns_per_dialogue):d}",
            f"{turns_per_dialogue.mean():.1f}" + (rf" \pm {turns_per_dialogue.std():.1f}" if UNCERTAINTY else ""),
            f"{n_tokens.mean():.1f}" + (rf" \pm {n_tokens.std():.1f}" if UNCERTAINTY else ""),
            sep=' & '
        )

In [None]:
# Print one ' & '-separated table row per split and therapy corpus.
# NOTE(review): every f-string replacement field was empty in the original (a
# SyntaxError), so the statistics below are a reconstruction -- number of dialogues,
# utterances per dialogue, word tokens per utterance (all speakers), word tokens per
# therapist utterance and word tokens per patient utterance; confirm against the intended table.
for split in SPLITS:
    for corpus in THERAPY_DATA:
        tmp_df = df[(df['split'] == split) & (df['corpus_id'] == corpus)]
        tmp_df_t = tmp_df[tmp_df['speaker'] == 'therapist']
        tmp_df_p = tmp_df[tmp_df['speaker'] == 'patient']
        # Number of utterances of every dialogue in this corpus/split
        turns_per_dialogue = tmp_df.groupby('dialogue_id').size()
        n_tokens = tmp_df['n_tokens']
        n_tokens_t = tmp_df_t['n_tokens']
        n_tokens_p = tmp_df_p['n_tokens']
        # Raw f-strings keep the backslash of "\pm" literal instead of relying on an invalid escape sequence
        print(
            f'{len(turns_per_dialogue):d}',
            f"{turns_per_dialogue.mean():.1f}" + (rf" \pm {turns_per_dialogue.std():.1f}" if UNCERTAINTY else ""),
            f"{n_tokens.mean():.1f}" + (rf" \pm {n_tokens.std():.1f}" if UNCERTAINTY else ""),
            f"{n_tokens_t.mean():.1f}" + (rf" \pm {n_tokens_t.std():.1f}" if UNCERTAINTY else ""),
            f"{n_tokens_p.mean():.1f}" + (rf" \pm {n_tokens_p.std():.1f}" if UNCERTAINTY else ""),
            sep=' & '
        )

### Visualisation

#### DailyDialog

In [None]:
# NOTE(review): the right-hand side of every `fig =` assignment in this section is
# missing, so these cells cannot run as written -- the plotting code must be restored
fig =

In [None]:
# NOTE(review): the target is just '.pdf' (no file stem); every savefig call in this
# notebook uses this same path, so each call would overwrite the previous output
fig.savefig('.pdf', bbox_inches='tight')

In [None]:
fig =

In [None]:
fig.savefig('.pdf', bbox_inches='tight')

In [None]:
fig =

In [None]:
fig.savefig('.pdf', bbox_inches='tight')

#### IEMOCAP

In [None]:
# NOTE(review): the right-hand side of every `fig =` assignment in this section is
# missing, so these cells cannot run as written -- the plotting code must be restored
fig =

In [None]:
# NOTE(review): the target is just '.pdf' (no file stem); every savefig call in this
# notebook uses this same path, so each call would overwrite the previous output
fig.savefig('.pdf', bbox_inches='tight')

In [None]:
fig =

In [None]:
fig.savefig('.pdf', bbox_inches='tight')

#### HOPE

In [None]:
# NOTE(review): the right-hand side of every `fig =` assignment in this section is
# missing, so these cells cannot run as written -- the plotting code must be restored
fig =

In [None]:
# NOTE(review): the target is just '.pdf' (no file stem); every savefig call in this
# notebook uses this same path, so each call would overwrite the previous output
fig.savefig('.pdf', bbox_inches='tight')

In [None]:
fig =

In [None]:
fig.savefig('.pdf', bbox_inches='tight')

#### EPITOME

In [None]:
# NOTE(review): the right-hand side of this `fig =` assignment is missing, so the
# cell cannot run as written -- the plotting code must be restored
fig =

In [None]:
# NOTE(review): the target is just '.pdf' (no file stem); every savefig call in this
# notebook uses this same path, so each call would overwrite the previous output
fig.savefig('.pdf', bbox_inches='tight')