In [119]:
import pandas as pd
import numpy as np
import json
import os

In [120]:
LLM_MODEL_TO_REWRITE = "gemini"

In [121]:
def get_data(file_name):
    """Load and return the parsed JSON content of ``<file_name>.json``.

    Args:
        file_name: Path of the file WITHOUT the ``.json`` extension; the
            extension is appended here.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # JSON text is UTF-8 by specification; decoding explicitly avoids relying
    # on the platform's locale encoding (which breaks non-ASCII text on
    # systems whose default is not UTF-8).
    with open(f"{file_name}.json", encoding="utf-8") as f:
        data = json.load(f)
    return data


def extract_text_from(item, count_incomplete):
    """Extract the rewritten text from one raw model response.

    Behaviour depends on the module-level ``LLM_MODEL_TO_REWRITE``:

    * ``"gpt"``: the response is returned untouched.
    * otherwise: the content of the first ``\\boxed{...}`` is returned. If
      there is no ``\\boxed{`` at all, a ``"gemini"`` response that contains
      ``**`` is returned with every ``**`` removed; any other response is
      reported as unusable by returning ``None``.

    The closing brace is located by tracking brace depth, so payloads that
    themselves contain balanced ``{...}`` pairs are returned whole instead of
    being cut at the first ``}``.

    Args:
        item: Raw response text.
        count_incomplete: Running count of responses whose ``\\boxed{`` was
            never closed.

    Returns:
        A ``(text_or_None, count_incomplete)`` tuple. The counter is
        incremented when the boxed payload has no matching closing brace; in
        that case everything after ``\\boxed{`` is returned.
    """
    if LLM_MODEL_TO_REWRITE == "gpt":
        return item, count_incomplete

    _, marker_found, tail = item.partition("\\boxed{")
    if not marker_found:
        if "**" in item and LLM_MODEL_TO_REWRITE == "gemini":
            return item.replace("**", ""), count_incomplete
        return None, count_incomplete

    # Walk the text after the opening brace and stop at the brace that closes
    # it; nested "{...}" pairs raise and lower the depth without ending the
    # payload.
    depth = 1
    for pos, char in enumerate(tail):
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return tail[:pos], count_incomplete

    # The box was never closed (e.g. the response was cut off): keep all the
    # text we have and flag it as incomplete.
    return tail, count_incomplete + 1


def store_updated_data(file_name, new_data):
    """Write ``new_data`` as 4-space-indented JSON to ``<file_name>_cleaned.json``.

    Args:
        file_name: Path of the source file without its extension; the
            ``_cleaned.json`` suffix is appended here.
        new_data: JSON-serialisable object to persist.
    """
    target_path = f"{file_name}_cleaned.json"
    serialized = json.dumps(new_data, indent=4)
    with open(target_path, "w") as out_file:
        out_file.write(serialized)


def get_updated_results(data):
    """Run ``extract_text_from`` over every entry of ``data`` and summarise.

    Args:
        data: Mapping of key -> raw response text.

    Returns:
        A ``(results, keys_with_errors)`` tuple. ``results`` maps each key to
        its extracted text; ``keys_with_errors`` lists, in iteration order,
        the keys for which nothing could be extracted (the extractor returned
        ``None``) or for which extraction raised.

    Side effects:
        Prints a line pair for every key whose extraction raised, followed by
        a summary of item, result, error and incomplete counts.
    """
    keys_with_errors = []
    results = {}
    count_incomplete = 0
    for key, item in data.items():
        try:
            extracted, count_incomplete = extract_text_from(item, count_incomplete)
        except Exception as e:
            # One malformed entry must not abort the whole run: report it,
            # record the key so it shows up in the error count, and move on.
            print("Error with key", key)
            print(e)
            keys_with_errors.append(key)
            continue

        if extracted is None:
            keys_with_errors.append(key)
        else:
            results[key] = extracted
    print("Number of items in data:", len(data))
    print("Number of items in results:", len(results))
    print("Number of keys with errors:", len(keys_with_errors))
    print("Number of incomplete items:", count_incomplete)
    print("-----------------------------------")

    return results, keys_with_errors


def run(file_name):
    """Clean one response file end to end: load, extract, save.

    Args:
        file_name: Path of the input file without its ``.json`` extension.

    Returns:
        A ``(data, results, keys_with_errors)`` tuple: the loaded input, the
        extracted results that were written out, and the keys for which
        extraction produced nothing.
    """
    raw_responses = get_data(file_name)
    extraction = get_updated_results(raw_responses)
    cleaned, failed_keys = extraction
    store_updated_data(file_name, cleaned)
    return raw_responses, cleaned, failed_keys

In [122]:
# Clean every response file of the selected model in each listed directory.
# The loop variable is `data_dir` rather than `dir` so the builtin `dir()`
# stays usable in later cells of the same kernel session.
for data_dir in ["wassa_individual"]:
    print("Extracting the data from", data_dir)
    model_suffix = f"{LLM_MODEL_TO_REWRITE}.json"
    # Match on the file-name ending (not a substring) and sort so the
    # processing order does not depend on the order os.listdir returns.
    files = sorted(file for file in os.listdir(data_dir) if file.endswith(model_suffix))
    for file in files:
        print(file)
        # splitext drops only the final ".json", so names that contain extra
        # dots are kept intact.
        file_name = os.path.splitext(file)[0]
        file_name = os.path.join(data_dir, file_name)
        data, results, keys_with_errors = run(file_name)
    print("-" * 50)

Extracting the data from wassa_individual
wassa_rewritten_rephrase_gemini.json
Number of items in data: 567
Number of items in results: 567
Number of keys with errors: 0
Number of incomplete items: 14
-----------------------------------
wassa_rewritten_syntax_grammar_gemini.json
Number of items in data: 557
Number of items in results: 557
Number of keys with errors: 0
Number of incomplete items: 27
-----------------------------------
--------------------------------------------------
