# AnnoMI 데이터셋을 T5 학습에 필요한 형태로 전처리하기
- 가상환경 t5

In [13]:
import json

def _load_json(path):
    """Read and parse one JSON file.

    The encoding is pinned to UTF-8 so the result does not depend on the
    platform's locale default.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# The complete dataset plus its `high` and `low` subsets
full = _load_json('../preprocessed_data/AnnoMI-full_v5.0.json')
high = _load_json('../preprocessed_data/AnnoMI-full_v5.0_high.json')
low = _load_json('../preprocessed_data/AnnoMI-full_v5.0_low.json')

In [14]:
# Number of entries in each loaded split, in the order full / high / low
for split in (full, high, low):
    print(len(split))

133
110
23


## Window size에 따른 데이터 수 확인
- 같은 speaker가 연속으로 나오는 경우도 잘 처리함 (ex. dialogue = high[10]['dialogue'])

In [3]:
# window_size: number of dialogue-history utterances given as input
def get_subsets(dialogue, window_size, last_speaker, allow_shorter_initial_subsets=False):
    """Collect the context windows that end on an utterance by ``last_speaker``.

    Each window is a slice of ``dialogue`` holding up to ``window_size``
    preceding utterances followed by the utterance spoken by ``last_speaker``.

    Args:
        dialogue: Sequence of utterance dicts; each must have a 'speaker' key.
        window_size: Number of history utterances before the final one.
        last_speaker: Speaker value the final utterance of a window must have.
        allow_shorter_initial_subsets: When true, windows cut short by the
            start of the dialogue are kept instead of dropped.

    Returns:
        List of slices of ``dialogue``, in dialogue order.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    subsets = []
    for end, utterance in enumerate(dialogue):
        if utterance['speaker'] != last_speaker:
            continue

        start = max(0, end - window_size)
        subset = dialogue[start:end + 1]

        # A window is shorter than window_size + 1 only when it was clipped at
        # the start of the dialogue (start == 0), so the flag alone decides
        # whether such a window is kept.
        if len(subset) == window_size + 1 or allow_shorter_initial_subsets:
            subsets.append(subset)

    return subsets

In [4]:
# Count how many examples each window size yields
window_min = 1
window_max = 8

for last_speaker in ['therapist', 'client']:
    for window_size in range(window_min, window_max + 1):
        # Total number of windows over every dialogue in the `high` split
        count = sum(
            len(get_subsets(session['dialogue'], window_size, last_speaker))
            for session in high
        )
        print(f"predicting {last_speaker}'s label (window size: {window_size}): {count}")

predicting therapist's label (window size: 1): 4346
predicting therapist's label (window size: 2): 4329
predicting therapist's label (window size: 3): 4236
predicting therapist's label (window size: 4): 4219
predicting therapist's label (window size: 5): 4126
predicting therapist's label (window size: 6): 4109
predicting therapist's label (window size: 7): 4016
predicting therapist's label (window size: 8): 3999
predicting client's label (window size: 1): 4381
predicting client's label (window size: 2): 4288
predicting client's label (window size: 3): 4271
predicting client's label (window size: 4): 4178
predicting client's label (window size: 5): 4161
predicting client's label (window size: 6): 4068
predicting client's label (window size: 7): 4051
predicting client's label (window size: 8): 3959


## 데이터셋 전처리 & 저장

In [15]:
# Upper-case the first character of a string
def capitalize(s):
    """Return ``s`` with its first character upper-cased.

    The rest of the string is left untouched. A falsy value (such as the
    empty string) is returned as-is.
    """
    return s[0].upper() + s[1:] if s else s

# Fine-grained display names for therapist label keys
_finegrained_therapist_names = {
    'reflection_simple': 'Simple Reflection',
    'reflection_complex': 'Complex Reflection',
    'question_open': 'Open Question',
    'question_closed': 'Closed Question',
    'input_advice': 'Advice',
    'input_information': 'Information',
    'input_negotiation': 'Negotiation',
    'input_options': 'Options',
    'other': 'Other',
}

# Client labels, collapsed into a binary Change / Not Change classification.
# The earlier three-way mapping is kept here for reference:
#     'change': 'Change',
#     'neutral': 'Neutral',
#     'sustain': 'Sustain'
_finegrained_client_names = {
    'change': 'Change',
    'neutral': 'Not Change',
    'sustain': 'Not Change',
}

# Therapist entries come first, followed by the client entries
label_name_finegrained = {**_finegrained_therapist_names, **_finegrained_client_names}

# Coarse display names: therapist label keys are grouped into
# Reflection / Question / Input / Other.
# Client labels are collapsed into a binary Change / Not Change classification;
# the earlier three-way mapping is kept here for reference:
#     'change': 'Change',
#     'neutral': 'Neutral',
#     'sustain': 'Sustain'
label_name_coarse = {
    # Therapist
    **dict.fromkeys(('reflection_simple', 'reflection_complex'), 'Reflection'),
    **dict.fromkeys(('question_open', 'question_closed'), 'Question'),
    **dict.fromkeys(
        ('input_advice', 'input_information', 'input_negotiation', 'input_options'),
        'Input',
    ),
    'other': 'Other',
    # Client
    'change': 'Change',
    **dict.fromkeys(('neutral', 'sustain'), 'Not Change'),
}

In [6]:
# %%time
# # AnnoMI-full_v3.1 & finegrained/coarse 사용했을 때
# import pandas as pd

# window_min = 1
# window_max = 8

# filename_list = []

# for last_speaker in ['therapist', 'client']: # last_speaker: predict할 speaker
#     for window_size in range(window_min, window_max + 1):
#         for include_label in [True, False]:
#             for granularity in ['finegrained', 'coarse']:
#                 if granularity == 'finegrained':
#                     label_name = label_name_finegrained
#                 elif granularity == 'coarse':
#                     label_name = label_name_coarse
            
#                 source = []
#                 target = []
            
#                 for dialogue in high: # high 데이터만 사용        
#                     subsets = get_subsets(dialogue['dialogue'], window_size, last_speaker)
#                     for subset in subsets: 
                        
#                         # Source Text (Input)
#                         source_text = f"Predict next {last_speaker}'s dialogue act: "

#                         for utterance in subset[:-1]: # 마지막 utterance는 label로 사용
#                             # Label
#                             if include_label:
#                                 source_text += f"[{capitalize(utterance['speaker'])}: "
#                                 for label in utterance[f"{utterance['speaker']}_label"]:
#                                     source_text += label_name[label]
#                                     source_text += ", "
#                                 source_text = source_text[:-2]
#                                 source_text += '] '
#                             else:
#                                 source_text += f"[{capitalize(utterance['speaker'])}] "

#                             # Utterance
#                             source_text += utterance['utterance']
#                             source_text += ' '
                        
#                         source.append(source_text.strip())

#                         # Target Text (Output)
#                         target_text = f"[{capitalize(subset[-1]['speaker'])}: "
#                         for label in subset[-1][f"{subset[-1]['speaker']}_label"]:
#                             target_text += label_name[label]
#                             target_text += ", "
#                         target_text = target_text[:-2]
#                         target_text += ']'
                        
#                         target.append(target_text)
                        
#                 df = pd.DataFrame({'source_text': source, 'target_text': target})
                
#                 # pred-client, inputlabel-False이면 coarse/finegrained 의미 없음 
#                 # (therapist label이 한 번도 등장 안 해서 똑같은 데이터임)
#                 if last_speaker == 'client' and include_label == False:
#                     filename = f'AnnoMI_dataset-high_pred-{last_speaker}_window-{window_size}_inputlabel-{include_label}_none_{len(df)}.csv'
#                 else:
#                     filename = f'AnnoMI_dataset-high_pred-{last_speaker}_window-{window_size}_inputlabel-{include_label}_{granularity}_{len(df)}.csv'

#                 if filename not in filename_list:
#                     filename_list.append(filename)
#                     df.to_csv(f't5_dataset/{filename}', index=False)
#                     print(f'Saved {filename}')

Saved AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-True_finegrained_4347.csv
Saved AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-True_coarse_4347.csv
Saved AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-False_finegrained_4347.csv
Saved AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-False_coarse_4347.csv
Saved AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-True_finegrained_4330.csv
Saved AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-True_coarse_4330.csv
Saved AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-False_finegrained_4330.csv
Saved AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-False_coarse_4330.csv
Saved AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-True_finegrained_4237.csv
Saved AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-True_coarse_4237.csv
Saved AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-False_finegrained_4237.csv
Saved AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-Fa

In [16]:
%%time
# Build the T5 datasets from AnnoMI-full_v5.0 using the integrated labels
import os

import pandas as pd

window_min = 1
window_max = 8

output_dir = 't5_dataset'
# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before the first file is written.
os.makedirs(output_dir, exist_ok=True)

filename_list = []


def _build_source_text(history, last_speaker, include_label):
    """Render the history utterances as the model input string.

    The result is the task prompt followed by each utterance, prefixed with
    "[Speaker: label]" when ``include_label`` is true and "[Speaker]" otherwise.
    """
    pieces = [f"Predict next {last_speaker}'s dialogue act:"]
    for utterance in history:
        speaker = capitalize(utterance['speaker'])
        if include_label:
            tag = f"[{speaker}: {utterance['label']}]"
        else:
            tag = f"[{speaker}]"
        pieces.append(tag + ' ' + utterance['utterance'])
    # Joining once avoids rebuilding the string for every utterance.
    return ' '.join(pieces).strip()


def _build_target_text(utterance):
    """Render the utterance to predict as "[Speaker: label]"."""
    return f"[{capitalize(utterance['speaker'])}: {utterance['label']}]"


for last_speaker in ['therapist', 'client']:  # last_speaker: the speaker whose label is predicted
    for window_size in range(window_min, window_max + 1):
        for include_label in [True, False]:
            for granularity in ['integrated']:

                source = []
                target = []

                for dialogue in high:  # only the `high` split is used
                    for subset in get_subsets(dialogue['dialogue'], window_size, last_speaker):
                        # The last utterance supplies the target label; the
                        # utterances before it form the input history.
                        source.append(_build_source_text(subset[:-1], last_speaker, include_label))
                        target.append(_build_target_text(subset[-1]))

                df = pd.DataFrame({'source_text': source, 'target_text': target})

                # When predicting the client without input labels, the label
                # granularity makes no difference (no therapist label ever
                # appears in the text), so those files are tagged 'none'.
                if last_speaker == 'client' and not include_label:
                    filename = f'AnnoMI_dataset-high_pred-{last_speaker}_window-{window_size}_inputlabel-{include_label}_none_{len(df)}.csv'
                else:
                    filename = f'AnnoMI_dataset-high_pred-{last_speaker}_window-{window_size}_inputlabel-{include_label}_{granularity}_{len(df)}.csv'

                # Write each distinct filename only once
                if filename not in filename_list:
                    filename_list.append(filename)
                    df.to_csv(f'{output_dir}/{filename}', index=False)
                    print(f'Saved {filename}')

Saved AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-True_integrated_4346.csv
Saved AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-False_integrated_4346.csv
Saved AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-True_integrated_4329.csv
Saved AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-False_integrated_4329.csv
Saved AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-True_integrated_4236.csv
Saved AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-False_integrated_4236.csv
Saved AnnoMI_dataset-high_pred-therapist_window-4_inputlabel-True_integrated_4219.csv
Saved AnnoMI_dataset-high_pred-therapist_window-4_inputlabel-False_integrated_4219.csv
Saved AnnoMI_dataset-high_pred-therapist_window-5_inputlabel-True_integrated_4126.csv
Saved AnnoMI_dataset-high_pred-therapist_window-5_inputlabel-False_integrated_4126.csv
Saved AnnoMI_dataset-high_pred-therapist_window-6_inputlabel-True_integrated_4109.csv
Saved AnnoMI_dataset-high_pred-therapist_window-6

In [17]:
# Number of entries currently in filename_list
len(filename_list)

32

## Window size에 따른 sequence length 확인
- Window size 8까지 가도 sequence length 512 넘어가는 거 생각보다 별로 없음 (별로 영향 안 미칠듯)

In [11]:
# filename_list = ['AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-True_finegrained_4347.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-True_coarse_4347.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-False_finegrained_4347.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-False_coarse_4347.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-True_finegrained_4330.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-True_coarse_4330.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-False_finegrained_4330.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-False_coarse_4330.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-True_finegrained_4237.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-True_coarse_4237.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-False_finegrained_4237.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-False_coarse_4237.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-4_inputlabel-True_finegrained_4220.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-4_inputlabel-True_coarse_4220.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-4_inputlabel-False_finegrained_4220.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-4_inputlabel-False_coarse_4220.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-5_inputlabel-True_finegrained_4127.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-5_inputlabel-True_coarse_4127.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-5_inputlabel-False_finegrained_4127.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-5_inputlabel-False_coarse_4127.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-6_inputlabel-True_finegrained_4110.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-6_inputlabel-True_coarse_4110.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-6_inputlabel-False_finegrained_4110.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-6_inputlabel-False_coarse_4110.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-7_inputlabel-True_finegrained_4017.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-7_inputlabel-True_coarse_4017.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-7_inputlabel-False_finegrained_4017.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-7_inputlabel-False_coarse_4017.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-8_inputlabel-True_finegrained_4000.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-8_inputlabel-True_coarse_4000.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-8_inputlabel-False_finegrained_4000.csv',
#                  'AnnoMI_dataset-high_pred-therapist_window-8_inputlabel-False_coarse_4000.csv',
#                  'AnnoMI_dataset-high_pred-client_window-1_inputlabel-True_finegrained_4382.csv',
#                  'AnnoMI_dataset-high_pred-client_window-1_inputlabel-True_coarse_4382.csv',
#                  'AnnoMI_dataset-high_pred-client_window-1_inputlabel-False_none_4382.csv',
#                  'AnnoMI_dataset-high_pred-client_window-2_inputlabel-True_finegrained_4289.csv',
#                  'AnnoMI_dataset-high_pred-client_window-2_inputlabel-True_coarse_4289.csv',
#                  'AnnoMI_dataset-high_pred-client_window-2_inputlabel-False_none_4289.csv',
#                  'AnnoMI_dataset-high_pred-client_window-3_inputlabel-True_finegrained_4272.csv',
#                  'AnnoMI_dataset-high_pred-client_window-3_inputlabel-True_coarse_4272.csv',
#                  'AnnoMI_dataset-high_pred-client_window-3_inputlabel-False_none_4272.csv',
#                  'AnnoMI_dataset-high_pred-client_window-4_inputlabel-True_finegrained_4179.csv',
#                  'AnnoMI_dataset-high_pred-client_window-4_inputlabel-True_coarse_4179.csv',
#                  'AnnoMI_dataset-high_pred-client_window-4_inputlabel-False_none_4179.csv',
#                  'AnnoMI_dataset-high_pred-client_window-5_inputlabel-True_finegrained_4162.csv',
#                  'AnnoMI_dataset-high_pred-client_window-5_inputlabel-True_coarse_4162.csv',
#                  'AnnoMI_dataset-high_pred-client_window-5_inputlabel-False_none_4162.csv',
#                  'AnnoMI_dataset-high_pred-client_window-6_inputlabel-True_finegrained_4069.csv',
#                  'AnnoMI_dataset-high_pred-client_window-6_inputlabel-True_coarse_4069.csv',
#                  'AnnoMI_dataset-high_pred-client_window-6_inputlabel-False_none_4069.csv',
#                  'AnnoMI_dataset-high_pred-client_window-7_inputlabel-True_finegrained_4052.csv',
#                  'AnnoMI_dataset-high_pred-client_window-7_inputlabel-True_coarse_4052.csv',
#                  'AnnoMI_dataset-high_pred-client_window-7_inputlabel-False_none_4052.csv',
#                  'AnnoMI_dataset-high_pred-client_window-8_inputlabel-True_finegrained_3960.csv',
#                  'AnnoMI_dataset-high_pred-client_window-8_inputlabel-True_coarse_3960.csv',
#                  'AnnoMI_dataset-high_pred-client_window-8_inputlabel-False_none_3960.csv']

In [11]:
# Names of the dataset CSV files to analyse, rebuilt from the row count each
# file name carries (position 0 in each tuple corresponds to window size 1).
_rows_per_window = (
    ('therapist', (4346, 4329, 4236, 4219, 4126, 4109, 4016, 3999)),
    ('client', (4381, 4288, 4271, 4178, 4161, 4068, 4051, 3959)),
)

filename_list = []
for _speaker, _row_counts in _rows_per_window:
    for _window, _rows in enumerate(_row_counts, start=1):
        for _with_label in (True, False):
            # Client files without input labels are tagged 'none' instead of 'integrated'
            _tag = 'none' if _speaker == 'client' and not _with_label else 'integrated'
            filename_list.append(
                f'AnnoMI_dataset-high_pred-{_speaker}_window-{_window}'
                f'_inputlabel-{_with_label}_{_tag}_{_rows}.csv'
            )

In [12]:
%%time
import numpy as np
import pandas as pd
from transformers import T5Tokenizer

def count_length(tokenizer, text):
    """Return the size of dimension 1 of the ``input_ids`` the tokenizer produces for ``text``.

    ``tokenizer`` is called with ``return_tensors='pt'``; dimension 1 is
    presumably the token (sequence) axis of a (batch, sequence) tensor -
    TODO confirm against the tokenizer documentation.
    """
    encoded = tokenizer(text, return_tensors='pt')
    return encoded['input_ids'].size(1)

tokenizer = T5Tokenizer.from_pretrained('t5-base')


def _length_stats(lengths):
    """Return (mean, standard deviation, number of lengths above 512)."""
    return np.mean(lengths), np.std(lengths), sum(length > 512 for length in lengths)


for filename in filename_list:
    df = pd.read_csv(f't5_dataset/{filename}')

    # Token count of every input and every target text in this file
    source_length = [count_length(tokenizer, text) for text in df['source_text']]
    target_length = [count_length(tokenizer, text) for text in df['target_text']]

    source_mean, source_std_dev, source_over_512 = _length_stats(source_length)
    target_mean, target_std_dev, target_over_512 = _length_stats(target_length)

    print(f"** {filename} **")
    print(f'[Source Text] Mean: {round(source_mean, 2)} | Std Dev: {round(source_std_dev, 2)} | Over 512: {source_over_512} out of {len(df)}')
    print(f'[Target Text] Mean: {round(target_mean, 2)} | Std Dev: {round(target_std_dev, 2)} | Over 512: {target_over_512} out of {len(df)}')
    print()


** AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-True_integrated_4346.csv **
[Source Text] Mean: 41.75 | Std Dev: 29.04 | Over 512: 0 out of 4346
[Target Text] Mean: 8.75 | Std Dev: 0.44 | Over 512: 0 out of 4346

** AnnoMI_dataset-high_pred-therapist_window-1_inputlabel-False_integrated_4346.csv **
[Source Text] Mean: 39.0 | Std Dev: 29.12 | Over 512: 0 out of 4346
[Target Text] Mean: 8.75 | Std Dev: 0.44 | Over 512: 0 out of 4346



Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors


** AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-True_integrated_4329.csv **
[Source Text] Mean: 73.25 | Std Dev: 40.58 | Over 512: 1 out of 4329
[Target Text] Mean: 8.75 | Std Dev: 0.44 | Over 512: 0 out of 4329

** AnnoMI_dataset-high_pred-therapist_window-2_inputlabel-False_integrated_4329.csv **
[Source Text] Mean: 67.04 | Std Dev: 40.52 | Over 512: 1 out of 4329
[Target Text] Mean: 8.75 | Std Dev: 0.44 | Over 512: 0 out of 4329

** AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-True_integrated_4236.csv **
[Source Text] Mean: 103.89 | Std Dev: 52.1 | Over 512: 1 out of 4236
[Target Text] Mean: 8.75 | Std Dev: 0.44 | Over 512: 0 out of 4236

** AnnoMI_dataset-high_pred-therapist_window-3_inputlabel-False_integrated_4236.csv **
[Source Text] Mean: 94.93 | Std Dev: 52.07 | Over 512: 1 out of 4236
[Target Text] Mean: 8.75 | Std Dev: 0.44 | Over 512: 0 out of 4236

** AnnoMI_dataset-high_pred-therapist_window-4_inputlabel-True_integrated_4219.csv **
[Source Text] Mean: 