# AnnoMI의 Gold Label & MI Classifier로 예측한 Label 통합
- 가상환경 t5

In [1]:
import json

# Load the three v4.0 dataset files (the complete set plus the "high" and
# "low" files, as named on disk).
# The encoding is pinned to UTF-8 so that reading the JSON does not depend on
# the platform's locale default.
with open('../preprocessed_data/AnnoMI-full_v4.0.json', 'r', encoding='utf-8') as f:
    full = json.load(f)
with open('../preprocessed_data/AnnoMI-full_v4.0_high.json', 'r', encoding='utf-8') as f:
    high = json.load(f)
with open('../preprocessed_data/AnnoMI-full_v4.0_low.json', 'r', encoding='utf-8') as f:
    low = json.load(f)

In [2]:
# Number of dialogues in each loaded dataset, one count per line.
print(len(full), len(high), len(low), sep='\n')

133
110
23


## Label별 개수 EDA

In [3]:
# Gold labels from AnnoMI
from collections import Counter

# Flatten every gold label attached to a therapist utterance in `high`;
# an utterance contributes one entry per label in its label list.
labels = [
    gold
    for session in high
    for turn in session['dialogue']
    if turn['speaker'] == 'therapist'
    for gold in turn[f"{turn['speaker']}_label"]
]

print(len(labels))

# Label frequencies, most frequent first.
counter = Counter(labels).most_common()
counter

4440


[('other', 1462),
 ('question_open', 726),
 ('reflection_simple', 642),
 ('reflection_complex', 624),
 ('question_closed', 521),
 ('input_information', 319),
 ('input_advice', 69),
 ('input_negotiation', 50),
 ('input_options', 27)]

In [21]:
# Labels predicted by the MI classifier
from collections import Counter

# One predicted label per therapist utterance in `high`.
labels = [
    turn['predicted_label']
    for session in high
    for turn in session['dialogue']
    if turn['speaker'] == 'therapist'
]

print(len(labels))

# Label frequencies, most frequent first.
counter = Counter(labels).most_common()
counter

4440


[('Other', 1233),
 ('Open Question', 831),
 ('Closed Question', 595),
 ('Give Information', 448),
 ('Complex Reflection', 362),
 ('Simple Reflection', 322),
 ('Affirm', 270),
 ('Advise without Permission', 117),
 ('Support', 87),
 ('Confront', 57),
 ('Self-Disclose', 40),
 ('Emphasize Autonomy', 33),
 ('Direct', 29),
 ('Advise with Permission', 12),
 ('Warn', 4)]

## AnnoMI의 Gold Label & MI Classifier로 예측한 Label 통합
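- AnnoMI의 gold label을 우선 적용 (question_open, question_closed, reflection_simple, reflection_complex, input_information, input_advice)
- gold label이 위에 해당하지 않으면, MI classifier의 predicted label 중 선택한 일부 label(Affirm, Give Information, Advise with Permission, Advise without Permission)을 적용
- 나머지는 모두 Other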

In [3]:
# Converted names follow MITI naming as closely as possible.

# Therapist labels taken from the AnnoMI gold annotation.
_gold_therapist_names = {
    'reflection_simple': 'Simple Reflection',
    'reflection_complex': 'Complex Reflection',
    'question_open': 'Open Question',
    'question_closed': 'Closed Question',
    'input_advice': 'Advise',
    'input_information': 'Give Information',
}

# Therapist labels taken from the MI classifier's prediction.
# Both "Advise" variants collapse into a single 'Advise' label.
_predicted_therapist_names = {
    'Affirm': 'Affirm',
    'Give Information': 'Give Information',
    'Advise with Permission': 'Advise',
    'Advise without Permission': 'Advise',
}

# Client labels, reduced to a binary Change / Not Change classification.
_client_names = {
    'change': 'Change',
    'neutral': 'Not Change',
    'sustain': 'Not Change',
}

# Single lookup table: source label -> integrated label name.
label_name_integrated = {
    **_gold_therapist_names,
    **_predicted_therapist_names,
    **_client_names,
}

def capitalize(s):
    """Return `s` with its first character upper-cased.

    The remainder of the string is left exactly as it is (unlike
    ``str.capitalize``, nothing is lower-cased). A falsy `s`, such as an
    empty string, is returned unchanged.
    """
    if s:
        head, tail = s[0], s[1:]
        return head.upper() + tail
    return s

In [4]:
# full: collapse the source annotations into a single `label` per utterance
gold_kept = ('question_closed', 'question_open', 'reflection_simple', 'reflection_complex', 'input_information', 'input_advice')
predicted_kept = ('Affirm', 'Give Information', 'Advise with Permission', 'Advise without Permission')

for session in full:
    for turn in session['dialogue']:

        # Client: map the first client label through the table, then drop the source field.
        if turn['speaker'] != 'therapist':
            turn['label'] = label_name_integrated[turn['client_label'][0]]
            del turn['client_label']
            continue

        # Therapist: the first gold label wins when it is one of the kept ones;
        # otherwise fall back to a kept predicted label; anything else is 'Other'.
        gold = turn['therapist_label'][0]
        if gold in gold_kept:
            turn['label'] = label_name_integrated[gold]
        elif turn['predicted_label'] in predicted_kept:
            turn['label'] = label_name_integrated[turn['predicted_label']]
        else:
            turn['label'] = 'Other'

        # Remove the source annotation fields now that `label` is set.
        for stale_key in ('therapist_label', 'predicted_label', 'converted_label'):
            del turn[stale_key]

In [5]:
# high: collapse the source annotations into a single `label` per utterance
gold_kept = ('question_closed', 'question_open', 'reflection_simple', 'reflection_complex', 'input_information', 'input_advice')
predicted_kept = ('Affirm', 'Give Information', 'Advise with Permission', 'Advise without Permission')

for session in high:
    for turn in session['dialogue']:

        # Client: map the first client label through the table, then drop the source field.
        if turn['speaker'] != 'therapist':
            turn['label'] = label_name_integrated[turn['client_label'][0]]
            del turn['client_label']
            continue

        # Therapist: the first gold label wins when it is one of the kept ones;
        # otherwise fall back to a kept predicted label; anything else is 'Other'.
        gold = turn['therapist_label'][0]
        if gold in gold_kept:
            turn['label'] = label_name_integrated[gold]
        elif turn['predicted_label'] in predicted_kept:
            turn['label'] = label_name_integrated[turn['predicted_label']]
        else:
            turn['label'] = 'Other'

        # Remove the source annotation fields now that `label` is set.
        for stale_key in ('therapist_label', 'predicted_label', 'converted_label'):
            del turn[stale_key]

In [6]:
# low: collapse the source annotations into a single `label` per utterance
gold_kept = ('question_closed', 'question_open', 'reflection_simple', 'reflection_complex', 'input_information', 'input_advice')
predicted_kept = ('Affirm', 'Give Information', 'Advise with Permission', 'Advise without Permission')

for session in low:
    for turn in session['dialogue']:

        # Client: map the first client label through the table, then drop the source field.
        if turn['speaker'] != 'therapist':
            turn['label'] = label_name_integrated[turn['client_label'][0]]
            del turn['client_label']
            continue

        # Therapist: the first gold label wins when it is one of the kept ones;
        # otherwise fall back to a kept predicted label; anything else is 'Other'.
        gold = turn['therapist_label'][0]
        if gold in gold_kept:
            turn['label'] = label_name_integrated[gold]
        elif turn['predicted_label'] in predicted_kept:
            turn['label'] = label_name_integrated[turn['predicted_label']]
        else:
            turn['label'] = 'Other'

        # Remove the source annotation fields now that `label` is set.
        for stale_key in ('therapist_label', 'predicted_label', 'converted_label'):
            del turn[stale_key]

In [7]:
# Dialogue counts after label integration, one count per line.
for dataset in (full, high, low):
    print(len(dataset))

133
110
23


In [8]:
# Write each integrated dataset to its v5.0 file, tab-indented.
v5_outputs = (
    ('../preprocessed_data/AnnoMI-full_v5.0.json', full),
    ('../preprocessed_data/AnnoMI-full_v5.0_high.json', high),
    ('../preprocessed_data/AnnoMI-full_v5.0_low.json', low),
)

for out_path, dataset in v5_outputs:
    with open(out_path, 'w') as f:
        json.dump(dataset, f, indent='\t')

## 통합한 데이터셋 Label별 개수 EDA

In [9]:
import json

# Reload the three v5.0 files from disk so the EDA cells below run on the
# data as it was saved.
# The encoding is pinned to UTF-8 so that reading the JSON does not depend on
# the platform's locale default.
with open('../preprocessed_data/AnnoMI-full_v5.0.json', 'r', encoding='utf-8') as f:
    full = json.load(f)
with open('../preprocessed_data/AnnoMI-full_v5.0_high.json', 'r', encoding='utf-8') as f:
    high = json.load(f)
with open('../preprocessed_data/AnnoMI-full_v5.0_low.json', 'r', encoding='utf-8') as f:
    low = json.load(f)

In [10]:
# full: distribution of the integrated therapist labels
from collections import Counter

# One integrated label per therapist utterance.
labels = [
    turn['label']
    for session in full
    for turn in session['dialogue']
    if turn['speaker'] == 'therapist'
]

print(len(labels))

# Label frequencies, most frequent first.
counter = Counter(labels).most_common()
counter

4881


[('Other', 1407),
 ('Open Question', 821),
 ('Simple Reflection', 660),
 ('Complex Reflection', 637),
 ('Closed Question', 567),
 ('Give Information', 457),
 ('Affirm', 166),
 ('Advise', 166)]

In [11]:
# high: distribution of the integrated therapist labels
from collections import Counter

# One integrated label per therapist utterance.
labels = [
    turn['label']
    for session in high
    for turn in session['dialogue']
    if turn['speaker'] == 'therapist'
]

print(len(labels))

# Label frequencies, most frequent first.
counter = Counter(labels).most_common()
counter

4440


[('Other', 1305),
 ('Open Question', 726),
 ('Simple Reflection', 642),
 ('Complex Reflection', 624),
 ('Closed Question', 521),
 ('Give Information', 380),
 ('Affirm', 151),
 ('Advise', 91)]

In [12]:
# low: distribution of the integrated therapist labels
from collections import Counter

# One integrated label per therapist utterance.
labels = [
    turn['label']
    for session in low
    for turn in session['dialogue']
    if turn['speaker'] == 'therapist'
]

print(len(labels))

# Label frequencies, most frequent first.
counter = Counter(labels).most_common()
counter

441


[('Other', 102),
 ('Open Question', 95),
 ('Give Information', 77),
 ('Advise', 75),
 ('Closed Question', 46),
 ('Simple Reflection', 18),
 ('Affirm', 15),
 ('Complex Reflection', 13)]