In [1]:
# 필요한 라이브러리 가져오기
import os
import json
import pandas as pd

# JSON 데이터 처리

In [2]:
def process_tokens(json_directory, output_path='tokens.csv'):
    """Collect eojeol-level rows from every JSON file in a directory and save them as CSV.

    Each ``*.json`` file directly inside ``json_directory`` is parsed and handed
    to ``extract_tokenizer_data``; the rows it returns are concatenated and
    written to ``output_path`` under the columns Region, Eojeol, Standard and
    IsDialect.

    Args:
        json_directory: Directory scanned (non-recursively) for ``.json`` files.
        output_path: Destination CSV path. Defaults to ``'tokens.csv'`` in the
            current working directory, which is what earlier versions always
            wrote.

    Returns:
        A status string naming the processed directory.
    """
    data_for_tokenizer = []

    # os.listdir() yields entries in arbitrary order; sorting keeps the row
    # order of the CSV identical from run to run.
    for filename in sorted(os.listdir(json_directory)):
        if not filename.endswith('.json'):
            continue

        # 'utf-8-sig' drops a leading byte-order mark if the file has one.
        with open(os.path.join(json_directory, filename), 'r', encoding='utf-8-sig') as file:
            json_data = json.load(file)

        # The document is fully parsed, so the file handle is already closed
        # while the rows are being extracted.
        data_for_tokenizer.extend(extract_tokenizer_data(json_data))

    # Naming the columns explicitly guarantees a header row even when no
    # rows were collected.
    df_tokenizer = pd.DataFrame(data_for_tokenizer, columns=['Region', 'Eojeol', 'Standard', 'IsDialect'])
    df_tokenizer.to_csv(output_path, index=False, encoding='utf-8-sig')

    return f"Processed all token JSON files in {json_directory}"

In [3]:
def process_sentences(directory, output_path='dialect_standard_pairs.csv'):
    """Build a CSV of (region, dialect sentence, standard sentence) rows from JSON files.

    Every ``*.json`` file directly inside ``directory`` must contain a
    ``metadata.category`` string and an ``utterance`` list whose items carry
    ``dialect_form`` and ``standard_form``. The region is the first two
    characters of the part of ``category`` that precedes the first ``' > '``.

    Args:
        directory: Directory scanned (non-recursively) for ``.json`` files.
        output_path: Destination CSV path. Defaults to
            ``'dialect_standard_pairs.csv'`` in the current working directory.

    Returns:
        None. A confirmation line is printed once the CSV has been written.

    Raises:
        KeyError: If a file lacks one of the keys listed above.
    """
    columns = ['region', 'dialect_form', 'standard_form']
    pairs = []

    # os.listdir() yields entries in arbitrary order; sorting keeps the row
    # order of the CSV identical from run to run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue

        # 'utf-8-sig' drops a leading byte-order mark if the file has one.
        with open(os.path.join(directory, filename), 'r', encoding='utf-8-sig') as file:
            data = json.load(file)

        # One region per file, taken from the file-level metadata.
        region = data['metadata']['category'].split(' > ')[0][0:2]

        pairs.extend(
            {
                'region': region,
                'dialect_form': utterance['dialect_form'],
                'standard_form': utterance['standard_form'],
            }
            for utterance in data['utterance']
        )

    # Explicit columns keep the header row present even when no pairs were
    # found; without them an empty frame is written with no header at all.
    df = pd.DataFrame(pairs, columns=columns)
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"CSV file '{output_path}' has been created.")

In [4]:
def extract_tokenizer_data(json_data):
    """Flatten every ``eojeolList`` found in a parsed JSON document into rows.

    All list-valued top-level entries of ``json_data`` are scanned; each dict
    item may carry an ``eojeolList`` whose dict elements become one row each.

    Args:
        json_data: Parsed JSON document (a dict).

    Returns:
        A list of ``[region, eojeol, standard, is_dialect]`` rows. ``region`` is
        the first two characters of the part of ``metadata.category`` before
        the first ``' > '``, or ``''`` when that metadata is absent.
        ``is_dialect`` is 1 when the element's ``isDialect`` value is truthy,
        else 0. Missing ``eojeol`` / ``standard`` values default to ``''``.
    """
    # The region depends only on file-level metadata, so derive it once
    # instead of once per eojeol.
    region = ''
    metadata = json_data.get('metadata')
    if isinstance(metadata, dict) and isinstance(metadata.get('category'), str):
        region = metadata['category'].split(' > ')[0][0:2]

    rows = []
    for entry in json_data.values():
        if not isinstance(entry, list):
            continue
        for item in entry:
            # Lists of plain values (strings, numbers) have no eojeolList.
            if not isinstance(item, dict):
                continue
            # `or []` also covers an explicit null stored under the key.
            for eojeol_info in item.get('eojeolList') or []:
                if not isinstance(eojeol_info, dict):
                    continue
                rows.append([
                    region,
                    eojeol_info.get('eojeol', ''),
                    eojeol_info.get('standard', ''),
                    1 if eojeol_info.get('isDialect', False) else 0,
                ])
    return rows

In [5]:
# Directory holding the dialect JSON files. Set the DIALECT_DATA_DIR
# environment variable to run the notebook on another machine; the original
# local path is kept as the fallback so existing runs behave the same.
directory = os.environ.get(
    'DIALECT_DATA_DIR',
    '/Users/taemmini/Projects/HUFS_Dialect_Analysis/Datasets',
)

In [6]:
# Write tokens.csv from every JSON file in `directory`; the returned status
# string is displayed as this cell's output.
process_tokens(directory)

'Processed all token JSON files in /Users/taemmini/Projects/HUFS_Dialect_Analysis/Datasets'

In [7]:
# Write dialect_standard_pairs.csv from the same directory; the function
# prints a confirmation line and returns None.
process_sentences(directory)

CSV file 'dialect_standard_pairs.csv' has been created.


- 생성된 CSV 파일의 전처리는 Preprocessing 단계에서 진행할 예정입니다.