In [None]:
import json
import re
import os

In [1]:
def extract_clean_answer(response_text):
    """
    Extract the canonical answer from a raw model response.

    Tuned for ChatGPT models; it also runs on llama2 output, but the
    results there are less reliable.

    The response is checked against these formats, in this order:

    1. P1 with a dot:     "(A). Very Accurate"    -> "(<letter>). <label>"
    2. P2:                "1 = Very much like me" -> "<digit> = <label>"
    3. P3:                the whole response is one digit 1-5 -> "<digit>"
    4. P1 without a dot:  "(A) Very Accurate"     -> "(<letter>). <label>"
       (fallback for long free-text answers)

    Formats 1, 2 and 4 are matched case-insensitively anywhere in the
    text, and the matched letter/label are returned with the casing they
    had in the response.

    Returns None when response_text is falsy or no format matches.
    """
    if not response_text:
        return None

    p1_labels = (
        r'(Very Accurate|Moderately Accurate|Neither Accurate Nor Inaccurate'
        r'|Moderately Inaccurate|Very Inaccurate)'
    )
    # Alternation takes the first branch that matches, so the longer
    # "Not like me at all" has to be listed before its prefix "Not like me";
    # otherwise "5 = Not like me at all" is cut down to "5 = Not like me".
    p2_labels = (
        r'(Very much like me|Like me|Neither like nor unlike me'
        r'|Not like me at all|Not like me)'
    )

    # P1: "(A). Very Accurate"
    match = re.search(r'\(([A-E])\)\.\s*' + p1_labels, response_text, flags=re.IGNORECASE)
    if match:
        return f"({match.group(1)}). {match.group(2)}"

    # P2: "1 = Very much like me"
    match = re.search(r'([1-5])\s*=\s*' + p2_labels, response_text, flags=re.IGNORECASE)
    if match:
        return f"{match.group(1)} = {match.group(2)}"

    # P3: the response consists of a single digit (surrounding whitespace allowed)
    match = re.fullmatch(r'\s*([1-5])\s*', response_text)
    if match:
        return match.group(1)

    # Long text: take the first "(A) Very Accurate" occurrence (no dot required)
    match = re.search(r'\(([A-E])\)\s*' + p1_labels, response_text, flags=re.IGNORECASE)
    if match:
        return f"({match.group(1)}). {match.group(2)}"

    return None

In [None]:
def extract_clean_answer(response_text):
    """
    Extract the canonical answer from a raw llama2 response.

    llama2 frequently ignores the requested answer template, so this
    variant tries a long list of formats observed in its output. If the
    ChatGPT-oriented function of the same name was defined earlier,
    running this definition replaces it.

    The text is first stripped of emoji/unusual symbols and lowercased,
    then checked in this order (first hit wins):

    * "4/5"                                    -> "4"
    * P1: "(A). Very Accurate", "(A) Very Accurate", "A. Very Accurate",
      "Option A - Very Accurate"               -> "(A). Very accurate"
    * P2: "1 = Very much like me", "1 - Very much like me"
                                               -> "1 = Very much like me"
    * P3: "score 5", "scored 5", "agree (5)", "(5)", a lone digit 1-5
                                               -> "5"
    * P1 label with no letter: "very accurate" -> "Very accurate"

    Labels are rebuilt with str.capitalize(), i.e. only the first
    character is upper-cased.

    Returns None when response_text is falsy or nothing matches.
    """
    if not response_text:
        return None

    # Drop emoji and other unusual symbols: only word characters, whitespace
    # and ( ) . = - / survive. ':' is not in the keep-list, so "Score: 5"
    # reaches the matchers below as "score 5".
    response_text = re.sub(r'[^\w\s\(\)\.\=\-/]', '', response_text)

    # All patterns below are written in lower case
    text = response_text.lower()

    p1_scale = (
        r'(very accurate|moderately accurate|neither accurate nor inaccurate'
        r'|moderately inaccurate|very inaccurate)'
    )
    # Alternation takes the first branch that matches, so "not like me at all"
    # has to come before its prefix "not like me"; otherwise the answer
    # "5 = not like me at all" is truncated to "5 = Not like me".
    p2_scale = (
        r'(very much like me|like me|neither like nor unlike me'
        r'|not like me at all|not like me)'
    )

    # Ratings written as a fraction of five: 2/5, 4/5, 5/5 (seen for P2 and P3)
    match = re.search(r'\b([1-5])\s*/\s*5\b', text)
    if match:
        return match.group(1)

    # P1: the ways llama2 writes "letter + label", most specific first
    p1_prefixes = (
        r'\(([a-e])\)\.\s*',                # (A). Very Accurate
        r'\(([a-e])\)\s*',                  # (A) Very Accurate
        r'\b([a-e])\.\s*',                  # A. Very Accurate
        r'option\s*\(?([a-e])\)?\s*-\s*',   # Option (A) - Very Accurate
    )
    for prefix in p1_prefixes:
        match = re.search(prefix + p1_scale, text)
        if match:
            return f"({match.group(1).upper()}). {match.group(2).capitalize()}"

    # P2: "1 = Very much like me" and "1 - Very much like me"
    for separator in ('=', '-'):
        match = re.search(r'([1-5])\s*' + separator + r'\s*' + p2_scale, text)
        if match:
            return f"{match.group(1)} = {match.group(2).capitalize()}"

    # P3: bare scores. "agree (5)" is tried before the generic "(5)" because
    # the generic pattern matches everything the specific one does and would
    # otherwise make it unreachable.
    p3_patterns = (
        r'score[:\s]+([1-5])',     # Score: 5
        r'scored[:\s]+([1-5])',    # scored: 5
        r'agree\s*\(([1-5])\)',    # Agree (5)
        r'\(([1-5])\)',            # (5)
        r'\b([1-5])\b',            # any stand-alone digit 1-5
    )
    for pattern in p3_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)

    # P1 label without any letter: "very accurate"
    match = re.search(p1_scale, text)
    if match:
        return match.group(1).capitalize()

    return None

In [None]:
def create_clean_files(source_files, target_folder):
    """
    Write slimmed-down copies of the source JSON files into target_folder.

    Each output file keeps the source file's base name (with a .json
    extension) and contains a list of objects holding only the 'label'
    and 'processed_response' values of every response.

    Two input layouts are understood:
      * a dict with a 'responses' list;
      * a list of dicts, each optionally carrying its own 'responses' list
        (these lists are concatenated in order).

    Any other layout is reported on stdout and the file is skipped.
    target_folder is created if it does not exist.
    """
    os.makedirs(target_folder, exist_ok=True)

    for src_file in source_files:
        with open(src_file, 'r', encoding='utf-8') as src:
            payload = json.load(src)

        if isinstance(payload, dict) and 'responses' in payload:
            records = payload['responses']
        elif isinstance(payload, list):
            # Flatten the per-item 'responses' lists into a single sequence
            records = [
                record
                for item in payload
                for record in item.get('responses', [])
            ]
        else:
            print(f"Неизвестный формат данных в {src_file}")
            continue

        # Missing keys become None (null in the output JSON)
        slim_records = [
            {
                'label': record.get('label'),
                'processed_response': record.get('processed_response'),
            }
            for record in records
        ]

        stem, _ext = os.path.splitext(os.path.basename(src_file))
        out_path = os.path.join(target_folder, f'{stem}.json')

        with open(out_path, 'w', encoding='utf-8') as dst:
            json.dump(slim_records, dst, ensure_ascii=False, indent=2)


In [None]:
def clean_processed_responses(folder_path):
    """
    Normalise 'processed_response' in every .json file of folder_path.

    Each file is expected to hold a list of entries. Every entry is
    rewritten to just 'label' and 'processed_response', where the latter
    is whatever extract_clean_answer() returns for the original value.
    Entries that cannot be parsed are kept with processed_response set to
    None, and a warning with the original text is printed. Files are
    overwritten in place.
    """
    json_names = (name for name in os.listdir(folder_path) if name.endswith('.json'))

    for name in json_names:
        path = os.path.join(folder_path, name)

        with open(path, 'r', encoding='utf-8') as src:
            records = json.load(src)

        cleaned = []
        for record in records:
            label = record.get('label')
            processed_response = record.get('processed_response')
            answer = extract_clean_answer(processed_response)

            # Unparseable entries are reported but still written out (as None)
            if answer is None:
                print(f"Warning: Couldn't parse response: {processed_response}")

            cleaned.append({'label': label, 'processed_response': answer})

        with open(path, 'w', encoding='utf-8') as dst:
            json.dump(cleaned, dst, ensure_ascii=False, indent=2)

# Input files to process, one per prompt variant (P1/P2/P3, suffix "o" or "R").
# NOTE(review): 'название' is Russian for "name" and looks like a placeholder
# for the real file prefix — confirm before running.
source_files = [
    'название_P1o.json',
    'название_P1R.json',
    'название_P2o.json',
    'название_P2R.json',
    'название_P3o.json',
    'название_P3R.json'
]

# Folder that receives the reduced copies; they are then cleaned in place
target_folder = 'cleaned_data'

# Step 1: keep only label + processed_response from each source file
create_clean_files(source_files, target_folder)

# Step 2: normalise processed_response in every .json file of the target folder
clean_processed_responses(target_folder)
