In [1]:
import json
import os
import altair as alt

def transform_to_custom_spec(vega_lite_spec):
    vega_lite_json = json.loads(vega_lite_spec)
    
    if "spec" in vega_lite_json:
        if "encoding" in vega_lite_json["spec"]:
            spec_contents = vega_lite_json.pop("spec")
            vega_lite_json.update(spec_contents)
        else:
            raise ValueError('The "spec" field must contain an "encoding" field.')

    # Check and process "facet"
    if "facet" in vega_lite_json:
        if "row" in vega_lite_json["facet"] or "column" in vega_lite_json["facet"]:
            if "row" in vega_lite_json["facet"]:
                vega_lite_json["encoding"]["row"] = vega_lite_json["facet"]["row"]
            if "column" in vega_lite_json["facet"]:
                vega_lite_json["encoding"]["column"] = vega_lite_json["facet"]["column"]
            del vega_lite_json["facet"]
        else:
            raise ValueError('The "facet" field must contain either "row" or "column".')

    custom_spec = {"view": []}

    # Set up the view item
    view_item = {"coordinates": "cartesian", "mark": [], "scale": []}


    # Extract and handle the mark type
    mark = vega_lite_json.get("mark", {})
    mark_type = mark if isinstance(mark, str) else mark.get("type", None)

    if mark_type:
        mark_item = {"type": mark_type, "encoding": []}
        view_item["mark"].append(mark_item)
        
    

    # Handle encoding and scales
    encoding = vega_lite_json.get("encoding", {})
    encoded_fields = set()
    
    if len(encoding) > 3:
        raise ValueError(f"Encoding has length {len(encoding)}")

    for channel, details in encoding.items():
        if (details["field"] in ("q1", "q2") and details["type"] != "quantitative")\
            or (details["field"] == "n" and details["type"] not in ("ordinal", "nominal")):
            raise TypeError(f'Wrong variable type {details["type"]} for {details["field"]}')
            
        
        if channel not in ("x", "y", "color", "size", "row", "column", "shape"):
            raise ValueError(f"Encountered unrecognized channel: {channel}")
            
        if channel in ("row", "column"):
            f = details.get("field", "")
            if f == "":
                raise ValueError("can't find variable to facet on")
            view_item["facet"] = []
            view_item["facet"].append({"channel": channel, "field": f})
            encoded_fields.add(f)
            continue
            
            
        if mark_type:
            try:
                channel_item = {"channel": channel, "field": details["field"]}
            except Exception as e:
                raise ValueError(e)
            mark_item["encoding"].append(channel_item)
            encoded_fields.add(details["field"])

        # Determine scale type
        var_type = details["type"]
        scale_type = details.get("scale", {}).get("type", "")

        scale_item = {"channel": channel, "type": scale_type}
        if var_type == "quantitative":
            if scale_type not in ("", "linear"):
                raise ValueError(f"Invalid scale type for {var_type} type.")
            scale_item = {"channel": channel, "type": "linear"}
            zero = str(details.get("scale", {}).get("zero", True)).lower()
            # modified
            if zero == "true":
                scale_item["zero"] = zero
        elif var_type in ("ordinal", "nominal"):
            if "zero" in details.get("scale", {}):
                raise ValueError(f"'zero' field is not valid for {var_type} type.")
            if scale_type not in ("", "categorical", "ordinal"):
                raise ValueError(f"Invalid scale type for {var_type} type.")

            if channel == "color":
                scale_item = {"channel": channel, "type": "categorical"}
            elif channel in ("x", "y", "shape"):
                scale_item = {"channel": channel, "type": "ordinal"}
            

        view_item["scale"].append(scale_item)

    # Check if all required fields are present
    required_fields = {'q1', 'q2', 'n'}
    missing_fields = required_fields - encoded_fields
    if missing_fields:
        raise ValueError(f"Required field(s) not encoded: {', '.join(missing_fields)}")

    # Add the view item to custom spec
    custom_spec["view"].append(view_item)

    return json.dumps(custom_spec, indent=4)

# # Example Vega-Lite spec
# vega_lite_spec = '''
# {
#     "description" : { "id" : 0 },
#     "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
#     "mark": "point",
#     "encoding": {
#       "color": {"field": "q1", "type": "quantitative"},
#       "y": {"field": "q2", "type": "quantitative"},
#       "x": {"field": "n", "type": "nominal"}
#     }
# }
# '''

# # Transform and print the custom chart spec
# try:
#     custom_chart_spec = transform_to_custom_spec(vega_lite_spec)
#     print(custom_chart_spec)
# except ValueError as e:
#     print(f"Error: {e}")


In [2]:
def extract_json_from_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()

    # Find the position of the last 'content:'
    last_content_pos = text.rfind('content:')
    if last_content_pos == -1:
        print("No 'content:' found in the file.")
        return None

    # Find the position of the first '{' after the last 'content:'
    first_brace_pos = text.find('{', last_content_pos)

    if first_brace_pos == -1:
        print("No '{' found after the last 'content:'.")
        return None

    # Find the position of the last '}' in the file
    last_brace_pos = text.rfind('}')
    
    if last_brace_pos == -1:
        print("No closing '}' found in the file.")
        return None

    # Extract the JSON string
    json_str = text[first_brace_pos:last_brace_pos + 1]
    # Parse the JSON string into a Python object
    try:
        json_obj = json.loads(json_str)
        return json_obj
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None

In [3]:
def extract_required_fields(num):
    file_path = f"./common_partial_spec/kim2018/{num}.txt"

    # Check if the file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Read and parse the JSON file
    with open(file_path, 'r') as file:
        content = json.load(file)

    # Required fields
    required_fields = ['task', 'number_rows', 'field']

    # Dictionary to store the required fields and their values
    fields_values = {}

    # Check if each required field is present and add to the dictionary
    for field in required_fields:
        if field not in content:
            raise ValueError(f"Required field '{field}' not found in the file {file_path}")
        fields_values[field] = content[field]

    # Return the dictionary of required fields and the entire content
    return fields_values, content

In [4]:
def compare_dicts(d1, d2):
    if d1.keys() != d2.keys():
        return False

    for key in d1:
        if isinstance(d1[key], list) and isinstance(d2[key], list):
            # If lists, sort or convert to sets before comparing
            if all(isinstance(item, dict) for item in d1[key]):
                # If list of dicts, recursively compare
                if not all(any(compare_dicts(item1, item2) for item2 in d2[key]) for item1 in d1[key]):
                    return False
            elif set(d1[key]) != set(d2[key]):
                return False
        elif isinstance(d1[key], dict) and isinstance(d2[key], dict):
            # If dicts, recursively compare
            if not compare_dicts(d1[key], d2[key]):
                return False
        elif d1[key] != d2[key]:
            return False

    return True

In [5]:
def compare_json_lists(list1, list2):
    """
    Compare two lists of JSON objects, returning the common elements.
    """
    common = []
    for item1 in list1:
        for item2 in list2:
            if item1 == item2:
                common.append(item1)
                break
    return common
    

def extract_common_chart_properties(chart1, chart2):
    """
    Extract common chart properties from two chart specifications.
    """
    if chart1['task'] != chart2['task'] or chart1['number_rows'] != chart2['number_rows']:
        raise ValueError("Task and number of rows must be the same in both charts")

    if 'field' in chart1 and 'field' in chart2:
        if chart1['field'] != chart2['field']:
            raise ValueError("Fields must be the same in both charts")

    common_properties = {
        'task': chart1['task'],
        'number_rows': chart1['number_rows'],
        'field': chart1['field']
    }

    if chart1['view'][0]['coordinates'] == chart2['view'][0]['coordinates']:
        common_properties['view'] = {'coordinates': chart1['view'][0]['coordinates']}
    
    if chart1['view'][0]['mark'][0]['type'] == chart2['view'][0]['mark'][0]['type']:
        common_properties['view']['mark'] = [{'type': chart1['view'][0]['mark'][0]['type']}]

    
    encoding_common = compare_json_lists(chart1['view'][0]['mark'][0].get('encoding', []), chart2['view'][0]['mark'][0].get('encoding', []))
    if encoding_common:
        common_properties['view']['mark'][0] = common_properties['view']['mark'][0] | {"encoding":encoding_common}

    scale_common = compare_json_lists(chart1['view'][0].get('scale', []), chart2['view'][0].get('scale', []))
    if scale_common:
        common_properties['view']['scale'] = scale_common

    if 'facet' in chart1['view'][0] and 'facet' in chart2['view'][0] and chart1['view'][0]['facet'] == chart2['view'][0]['facet']:
        common_properties['view']['facet'] = chart1['view'][0]['facet']

    return common_properties

def are_json_objects_equivalent(json_str1, json_str2):
    try:
        # Parse the JSON strings into Python dictionaries
        dict1 = json.loads(json_str1)
        dict2 = json.loads(json_str2)

        # Compare the dictionaries
        return dict1 == dict2
    except json.JSONDecodeError:
        # If either string is not valid JSON, return False
        print("JSONDecodeError")
        return False



In [6]:
'''
This block extracts the vl code from gpt response and writes to another folder
'''
def write_vl_to_dict_results(source_dir, destination_dir):
    bad_response = 0
    successful_writes = 0
    # Create the destination directory if it doesn't exist
    os.makedirs(destination_dir, exist_ok=True)
    # Iterate through each file in the source directory
    for filename in os.listdir(source_dir):
        # Check if the file is a .txt file
        if filename.endswith('.txt'):
            #first get data properties

            underscore_index = filename.find('_')

            # Check if underscore was found
            if underscore_index == -1:
                raise ValueError("no _ found")

            # Extract the substring to the left of the underscore
            left_substring = filename[:underscore_index]

            summary_stats, common_partial_spec = extract_required_fields(left_substring)
            common_partial_spec["view"] = [common_partial_spec["view"]]





            source_file = os.path.join(source_dir, filename)
            destination_file = os.path.join(destination_dir, filename)

            content = extract_json_from_file(source_file)["chart"]


            content_str = str(content).replace("'", '"')
            content_str = content_str.replace("True", '"true"').replace("False", '"false"').replace("None", '"None"')


            try:
                custom_chart_spec = transform_to_custom_spec(content_str)
                custom_chart_spec = {**summary_stats, **(json.loads(custom_chart_spec))}

                common = extract_common_chart_properties(custom_chart_spec, common_partial_spec)
                common["view"] = [common["view"]]

                # inject fake data to check if valid vl spec
                spec_with_data = content
                spec_with_data['data'] = {"values": {"q1": 1, "q2": 2, "n": "N"}}
                alt.Chart.from_dict(spec_with_data)

                if not compare_dicts(common, common_partial_spec):
                    bad_response += 1
                    print(filename, "partial spec violated")
                    with open(destination_file, 'w') as file:
                        file.write("error")
                else:
                    with open(destination_file, 'w') as file:
                        file.write(str(custom_chart_spec).replace("'", '"'))
                    successful_writes += 1
                
                
            except Exception as e:
                bad_response += 1
                print(filename, f"Error: {str(e)[:30]}")
                with open(destination_file, 'w') as file:
                    file.write("error")
                    
    print("Valid Rate:", successful_writes/(successful_writes+bad_response))
    print("Bad response count:", bad_response)
    print("Successful writes:", successful_writes)
    print("TXT files processed successfully.")


In [7]:
source_dir = './gpt_responses/kim2018_responses/gpt-4-0125-preview_0'
destination_dir = './to_dict_gpt_responses/kim2018_responses/gpt-4-0125-preview_0'
write_vl_to_dict_results(source_dir, destination_dir)

964_gpt-4-0125-preview_0.txt Error: Required field(s) not encoded:
214_gpt-4-0125-preview_0.txt Error: Required field(s) not encoded:
553_gpt-4-0125-preview_0.txt Error: Required field(s) not encoded:
348_gpt-4-0125-preview_0.txt Error: Required field(s) not encoded:
59_gpt-4-0125-preview_0.txt Error: Required field(s) not encoded:
803_gpt-4-0125-preview_0.txt partial spec violated
218_gpt-4-0125-preview_0.txt Error: Required field(s) not encoded:
582_gpt-4-0125-preview_0.txt Error: Required field(s) not encoded:
935_gpt-4-0125-preview_0.txt Error: Encoding has length 4
235_gpt-4-0125-preview_0.txt Error: Encoding has length 4
877_gpt-4-0125-preview_0.txt Error: Required field(s) not encoded:
231_gpt-4-0125-preview_0.txt Error: Encoding has length 4
29_gpt-4-0125-preview_0.txt partial spec violated
990_gpt-4-0125-preview_0.txt Error: Required field(s) not encoded:
411_gpt-4-0125-preview_0.txt Error: Required field(s) not encoded:
105_gpt-4-0125-preview_0.txt Error: Encoding has length 

In [8]:
source_dir = './gpt_responses/kim2018_responses/gpt-4-0613_0'
destination_dir = './to_dict_gpt_responses/kim2018_responses/gpt-4-0613_0'
write_vl_to_dict_results(source_dir, destination_dir)

660_gpt-4-0613_0.txt Error: Encoding has length 4
582_gpt-4-0613_0.txt Error: 'zero' field is not valid for 
564_gpt-4-0613_0.txt Error: Encoding has length 4
97_gpt-4-0613_0.txt Error: Required field(s) not encoded:
427_gpt-4-0613_0.txt partial spec violated
699_gpt-4-0613_0.txt Error: 'zero' field is not valid for 
845_gpt-4-0613_0.txt Error: 'zero' field is not valid for 
208_gpt-4-0613_0.txt Error: Required field(s) not encoded:
132_gpt-4-0613_0.txt Error: 'zero' field is not valid for 
1031_gpt-4-0613_0.txt Error: Encoding has length 4
1054_gpt-4-0613_0.txt Error: 'zero' field is not valid for 
1075_gpt-4-0613_0.txt Error: 'zero' field is not valid for 
449_gpt-4-0613_0.txt partial spec violated
477_gpt-4-0613_0.txt partial spec violated
977_gpt-4-0613_0.txt Error: Encoding has length 4
310_gpt-4-0613_0.txt Error: Required field(s) not encoded:
749_gpt-4-0613_0.txt Error: Required field(s) not encoded:
194_gpt-4-0613_0.txt Error: Encoding has length 4
1014_gpt-4-0613_0.txt Error: 

In [9]:
source_dir = './gpt_responses/kim2018_responses/gpt-3.5-turbo-0125_0'
destination_dir = './to_dict_gpt_responses/kim2018_responses/gpt-3.5-turbo-0125_0'
write_vl_to_dict_results(source_dir, destination_dir)

683_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
119_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
1106_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
702_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
735_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
140_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
1087_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
630_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
607_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
39_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
1003_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
669_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
221_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
454_gpt-3.5-turbo-0125_0.txt Error: Additional properties are not 
951_gpt-3.5-turbo-0125_0.txt Error: Additional properties ar