# Inspect sample-level dataframes

As a sanity check, we load a sample-level dataframe, list its columns, and pretty-print a few randomly drawn samples.

In [1]:
import os
import sys
import ast
import numpy as np
import pandas as pd

## Pretty printer

In [2]:
# ANSI SGR escape sequences used by the pretty printer, keyed by the role of
# the text being colored. 'end' resets the terminal to its default style and
# must follow every colored span.
colors = dict(
    human_response='\033[92m',   # bright green
    model_response='\033[91m',   # bright red
    utterance_index='\033[93m',  # bright yellow
    label_a='\033[95m',          # bright magenta
    label_b='\033[96m',          # bright cyan
    end='\033[0m',               # reset
)


def break_long_utterance(utterance: str, max_chars: int) -> str:
    """Hard-wrap ``utterance`` into chunks of at most ``max_chars`` characters.

    The text is cut every ``max_chars`` characters regardless of word
    boundaries. Every chunk after the first is placed on a new line that is
    indented by eight spaces, so continuation lines align under the
    utterance text printed by ``pretty_print_prompt``.

    Args:
        utterance: The text to wrap. An empty string yields an empty string.
        max_chars: Maximum number of characters per line; must be positive.

    Returns:
        The wrapped text, with no trailing line break.

    Raises:
        ValueError: If ``max_chars`` is not a positive integer.
    """
    if max_chars <= 0:
        raise ValueError(
            f"max_chars must be a positive integer, got {max_chars!r}"
        )
    continuation = '\n' + ' ' * 8
    # Slice fixed-width chunks and join once instead of growing a string
    # character by character.
    return continuation.join(
        utterance[start:start + max_chars]
        for start in range(0, len(utterance), max_chars)
    )


def pretty_print_prompt(
        df_row: pd.Series,
        model_name: str = None,
        max_chars: int = 80,
):
    """Print one sample (dialogue prompt, human response, model responses) in color.

    The row must provide the string fields ``prompt``, ``human_response`` and
    ``corpus``. Newlines inside ``prompt`` and ``human_response`` are stored
    as the two-character sequence backslash + ``n``. Each prompt line and the
    human response are expected to start with a two-character speaker label
    (e.g. ``A:``); any label other than ``A:`` is colored as speaker B.

    Model responses are read from every column named ``model_response_r<k>``
    (``<k>`` a non-negative integer) that holds a non-empty string. They are
    printed in ascending ``k`` and labelled ``SAMPLE k+1``; empty or missing
    (NaN) responses are skipped. Model responses are shown with the human
    response's speaker label and color.

    Args:
        df_row: One row of the sample-level dataframe; ``df_row.name`` is
            printed as the sample index.
        model_name: Name shown (upper-cased) in the model response headers;
            ``MODEL`` is shown when it is ``None``.
        max_chars: Maximum number of characters per printed line of an
            utterance, passed to ``break_long_utterance``.
    """
    escaped_newline = '\\n'
    prompt = df_row['prompt'].replace(escaped_newline, '\n').strip()
    human_response = df_row['human_response'].replace(escaped_newline, '\n').strip()

    corpus_name = df_row['corpus']
    sample_index = df_row.name
    print(f"--- EXCERPT [{sample_index}] FROM [{corpus_name.upper()}] CORPUS ---")

    # split the prompt into speaker labels and (wrapped) utterances
    prompt_lines = prompt.split('\n')
    speaker_labels = [line[:2] for line in prompt_lines]
    utterances = [
        break_long_utterance(line[2:].strip(), max_chars)
        for line in prompt_lines
    ]

    # print prompt
    for number, (label, utterance) in enumerate(zip(speaker_labels, utterances), start=1):
        label_color = colors['label_a' if label == 'A:' else 'label_b']
        print(colors['utterance_index'] + f"[{number}]  " + colors['end'], end='')
        print(label_color + label + colors['end'] + ' ', end='')
        print(label_color + utterance + colors['end'])

    # print human response
    last_utt_string = f"[{len(prompt_lines) + 1}] "
    print(colors['utterance_index'] + last_utt_string + colors['end'], end='')
    print(colors['human_response'] + '[HUMAN]' + colors['end'])
    # Split off the speaker label BEFORE wrapping, as is done for the prompt
    # lines; wrapping first could insert a line break inside or right after
    # the label and counted the label towards the first line's width.
    human_label = human_response[:2]
    human_text = break_long_utterance(human_response[2:].strip(), max_chars)
    human_color = colors['label_a' if human_label == 'A:' else 'label_b']
    spaces_before = ' ' * len(last_utt_string)
    print(spaces_before + human_color + human_label + colors['end'] + ' ', end='')
    print(human_color + human_text + colors['end'])

    # collect the non-empty model responses together with their sample number,
    # so that an empty/NaN response never shifts which column is printed
    column_prefix = 'model_response_r'
    model_responses = []
    for col in df_row.index:
        if not isinstance(col, str):
            continue
        col_name = col.strip()
        if not col_name.startswith(column_prefix):
            continue
        suffix = col_name[len(column_prefix):]
        if not suffix.isdigit():
            continue
        value = df_row[col]
        if isinstance(value, str) and value:
            model_responses.append((int(suffix), value))
    if not model_responses:
        return
    model_responses.sort(key=lambda item: item[0])

    # print model responses
    header_tag = 'MODEL' if model_name is None else model_name.upper()
    for response_idx, model_response in model_responses:
        header = (
            colors['utterance_index'] + last_utt_string + colors['end']
            + colors['model_response']
            + f"[{header_tag} - SAMPLE {response_idx + 1}]"
            + colors['end']
        )
        model_response = model_response.replace(escaped_newline, ' ').strip()
        model_response = human_label + ' ' + model_response
        model_response = break_long_utterance(model_response, max_chars)
        print(header)
        print(spaces_before + human_color + model_response + colors['end'])


## Inspection

In [3]:
df_path = '../../data/samples_mini_gpt2_genq_constr_ppl_ol_pmi.tsv'

# read the dataframe (tab-separated, first line is the header,
# rows are indexed by the 'sample_index' column)
df = pd.read_csv(df_path, sep='\t', header=0, index_col='sample_index')
print(f'Rows: {len(df)}')
print(f'Columns: {len(df.columns)}')
print(f'Index: {df.index.name}')

# print every column name with the value it takes in the first row
print('\nCOLUMNS:')
columns = df.columns
for column in columns:
    example_value = df[column].iloc[0]
    # Only strings are truncated; numeric values are printed as they are.
    # An explicit type check replaces a bare `except: pass`, which would
    # also have hidden unrelated errors.
    if isinstance(example_value, str) and len(example_value) > 60:
        example_value = example_value[:60] + '...'
    print(f'[{column}]: {example_value}')

Rows: 10
Columns: 175
Index: sample_index

COLUMNS:
[turns_in_diag]: 88
[first_utt_idx_in_diag]: 23
[human_response]: A:Yeah.
[prompt]: B:Oh, for sure.. Oh, yeah.\nA:So, I've had this dog now for,...
[corpus]: switchboard
[model_response_r0]: Uh.. but, you know,. a pet. I mean, I sometimes have, um, pe...
[attribs_r0]: -0.8380187153816223|-0.19676153361797333|-0.1018861606717109...
[attribs_tag_r0]: -0.23460747301578522|-0.39912521839141846|-0.287821799516677...
[model_response_r1]: Ah, now, uh.. I've been thinking about that.\n
[attribs_r1]: 0.10311585664749146|-0.2600320279598236|-0.44083258509635925...
[attribs_tag_r1]: -0.6929109692573547|0.0055464101023972034|-0.922957122325897...
[model_response_r2]: Uh..\n
[attribs_r2]: 1.0|-0.0011756449239328504|-0.028232935816049576|-0.00375789...
[attribs_tag_r2]: -0.46264511346817017|0.19447404146194458|0.2580307722091675|...
[model_response_r3]: Right . . . it would lay down on me and that's, it. and, uhâ€¦...
[attribs_r3]: 0.06948333233594

In [4]:
corpus = 'switchboard'
model_name = 'gpt2'
n_samples = 2

# keep only the rows of the selected corpus
df_corpus = df[df['corpus'] == corpus]
# DataFrame.sample (without replacement) raises ValueError when asked for
# more rows than are available, so never request more than the corpus has.
df_samples = df_corpus.sample(min(n_samples, len(df_corpus)))

# every sampled row already belongs to `corpus`, so no further filtering is needed
for row_idx, row in df_samples.iterrows():
    pretty_print_prompt(row, model_name, max_chars=60)
    print()

--- EXCERPT [670] FROM [SWITCHBOARD] CORPUS ---
[93m[1]  [0m[95mA:[0m [95mHuh. Well the, the, uh, clubs that we've got around here are
         kind of expensive. but, uh, it's well worth it.. You can go
         down and shoot up against a sand bag and some targets and h
        ave, uh, competition.. Uh, I've never really joined a club b
        ecause I haven't got the time. Not because I haven't got the
         desire.. Uh, there's an annual membership fee, uh, that's, 
        that's fairly high.. Plus there's an initiation fee that you
         have to pay because of first time, uh, uh, member.. So the 
        whole process can cost you a hundred fifty dollars to join t
        he club.. Plus you have to pay for the, the ammunition in an
        y tournaments that you would join in.[0m
[93m[2]  [0m[96mB:[0m [96mYeah.. Well, that's what my husband,. when he was in that gu
        n club where he was doing that tell if it's a cough or somet
        hing was. It, you kn