In [1]:
import json
from save_json_dataset import save_dataset

In [2]:
'''
This function processes a single data file in JSON format to extract and format information.
The function may need to be adjusted to fit the requirements of your project.
'''

def json_to_prompt(file_path):
    """Build one prompt/label record from a single labelled-call JSON file.

    The file must contain a JSON object. Each entry of its ``utterances``
    list is rendered as a ``Speaker <speaker>: <text>`` line, and the label
    fields (``urgencyLevel``, ``disasterLarge``, ``disasterMedium``,
    ``symptom``, ``triage``) are written into the prompt text and also
    returned as separate keys. Missing label fields fall back to ``''``
    (``[]`` for ``symptom``).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A dict with the keys ``prompt``, ``urgencyLevel``, ``disasterLarge``,
        ``disasterMedium``, ``symptoms`` and ``triage``, or ``None`` when the
        file could not be read, parsed or formatted (the error is printed).
    """
    try:
        # Decode explicitly instead of relying on the platform default
        # encoding. 'utf-8-sig' reads plain UTF-8 and also skips a leading
        # BOM, which json.load would otherwise reject.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)

        # One "Speaker <id>: <text>" line per utterance, each newline-terminated.
        dialogue = ''.join(
            f"Speaker {utterance['speaker']}: {utterance['text']}\n"
            for utterance in data.get('utterances', [])
        )

        urgency_level = data.get('urgencyLevel', '')
        disaster_large = data.get('disasterLarge', '')
        disaster_medium = data.get('disasterMedium', '')
        # The input key is 'symptom' (singular); the output key is 'symptoms'.
        symptoms = data.get('symptom', [])
        triage = data.get('triage', '')

        # NOTE(review): the task sentence says "disease ... classification"
        # while the output labels say "Disaster ..."; the wording is left
        # unchanged here -- confirm which one is intended.
        row = {
            'prompt':f"""\
Task: Please label the following 119 emergency call conversation with urgency level, disease large classification, disease medium classification, symptoms, and triage.

Dialogue:
{dialogue}

Output:
Urgency Level: {urgency_level}
Disaster Large: {disaster_large}
Disaster Medium: {disaster_medium}
Symptoms: {symptoms}
Triage: {triage}
""",
            'urgencyLevel': urgency_level,
            'disasterLarge': disaster_large,
            'disasterMedium': disaster_medium,
            'symptoms': symptoms,
            'triage': triage
        }

        return row

    except Exception as e:
        # Best effort: one unreadable or malformed file is reported and
        # yields None instead of raising.
        print(f'Error processing file {file_path}: {e}')
        return None

In [3]:
'''
This function handles the entire set of files assigned to a single process in the multiprocessing setup.
'''

def merge_sample(file_paths):
    """Run ``json_to_prompt`` on every path in ``file_paths``.

    Returns a list with one entry per input path, in input order. Each entry
    is whatever ``json_to_prompt`` returned for that path, so entries for
    files that failed to process are ``None``.
    """
    records = []
    for path in file_paths:
        records.append(json_to_prompt(path))
    return records

In [4]:
# Directory passed to save_dataset as base_dir, together with the '.json'
# file extension.
dir_path = '/hdd/data/088.위급상황_음성-음향_고도화_119_지능형_신고접수_음성_인식_데이터/3.개방데이터/1.데이터/Training/2.라벨링데이터'

# save_dataset comes from the project-local save_json_dataset module; it is
# given merge_sample as the per-process function, a process count of 12 and
# 'prompts.json' as the save path.
save_dataset(
    base_dir=dir_path,
    file_extension='.json',
    processes=12,
    save_path='prompts.json',
    process_function=merge_sample,
)