In [1]:
import json
import os
import sys
import numpy as np


# Handles serialization of common numpy datatypes
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts common numpy values to built-in Python types.

    Pass it as ``cls=NpEncoder`` to ``json.dumps`` / ``json.dump``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        numpy booleans, integers and floating-point scalars become ``bool``,
        ``int`` and ``float``; arrays become (nested) lists.  Anything else is
        delegated to the base class, which raises ``TypeError``.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it a numpy boolean would fall through to the TypeError path.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


def md(markdown: str):
    """Show ``markdown`` as rendered Markdown in the notebook output.

    NOTE(review): neither ``display`` nor ``Markdown`` is imported in the
    cells shown here; presumably both come from ``IPython.display`` --
    confirm, otherwise calling this raises ``NameError``.
    """
    rendered = Markdown(markdown)
    display(rendered)


def pprint(obj):
    """Display ``obj`` as an indented JSON code block.

    The object is serialized with ``NpEncoder`` (two-space indent), wrapped
    in a fenced ``json`` block and handed to ``md`` for rendering.
    """
    serialized = json.dumps(obj, indent=2, cls=NpEncoder)
    md("```json\n" + serialized + "\n```")

In [2]:
# Make the directory above the current working directory importable so the
# package imports that follow can resolve (presumably a local checkout that
# lives one level up -- confirm the notebook is started from the expected folder).
cwd = os.getcwd()

# Assuming the target directory is one level up from the current working directory
parent_dir = os.path.dirname(cwd)

# Add the parent directory to sys.path only once, so re-running this cell
# does not keep appending duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Now you can import your custom package
import draco 
from draco import dict_to_facts
from draco.data_utils import pairs_to_vec
import pandas as pd

In [3]:
import clingo

def parse_fact_to_symbol(fact_str):
    """Parse one fact string into whatever ``clingo.parse_term`` returns.

    Every trailing ``.`` is stripped from ``fact_str`` before parsing.
    """
    term_text = fact_str.rstrip('.')
    return clingo.parse_term(term_text)

def convert_list_to_symbols(fact_list):
    """Return a list with ``parse_fact_to_symbol`` applied to each fact string, in order."""
    symbols = []
    for fact in fact_list:
        symbols.append(parse_fact_to_symbol(fact))
    return symbols

In [4]:
def compare_dicts(a, b):
    """Compare two dictionaries, treating lists as unordered.

    Returns ``False`` when either argument is not a dict, when the key sets
    differ, or when any pair of values differs.  Values are compared
    recursively: dicts with ``compare_dicts``, lists with ``compare_lists``
    (order-insensitive), everything else with ``==``.
    """
    if not isinstance(a, dict) or not isinstance(b, dict) or len(a) != len(b):
        return False

    for key, a_value in a.items():
        if key not in b:
            return False
        if not _values_match(a_value, b[key]):
            return False

    return True


def compare_lists(a, b):
    """Compare two lists, ignoring order (at every nesting level).

    Both lists are sorted with ``custom_sort_key`` and then compared
    element by element.  Because the key is order-insensitive for nested
    lists, elements that are equal up to list order end up at the same
    position in both sorted lists.
    """
    if len(a) != len(b):
        return False

    a_sorted = sorted(a, key=custom_sort_key)
    b_sorted = sorted(b, key=custom_sort_key)

    return all(_values_match(a_item, b_item) for a_item, b_item in zip(a_sorted, b_sorted))


def _values_match(a_value, b_value):
    """Compare two values, recursing into dicts and (unordered) lists."""
    if isinstance(a_value, dict) and isinstance(b_value, dict):
        return compare_dicts(a_value, b_value)
    if isinstance(a_value, list) and isinstance(b_value, list):
        return compare_lists(a_value, b_value)
    return a_value == b_value


def custom_sort_key(item):
    """Generate a totally ordered sort key for JSON-like values.

    The key is a tuple that starts with a type rank, so values of different
    types never get compared with each other directly (which would raise
    ``TypeError``).  Dicts and lists are reduced recursively; list elements
    are sorted inside the key so that two lists differing only in order
    produce the same key.

    Note: the key is only meant to be used for sorting; unlike before, it is
    always a tuple, including for plain scalars.
    """
    if item is None:
        return (0,)
    if isinstance(item, (bool, int, float)):
        # All numbers share one rank so that 1, 1.0 and True still sort together.
        return (1, item)
    if isinstance(item, str):
        return (2, item)
    if isinstance(item, dict):
        pairs = sorted((custom_sort_key(k), custom_sort_key(v)) for k, v in item.items())
        return (3, tuple(pairs))
    if isinstance(item, list):
        return (4, tuple(sorted(custom_sort_key(v) for v in item)))
    # Fallback for any other type: order by type name, then by repr.
    return (5, type(item).__name__, repr(item))

In [5]:
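# Scratch sample of serialized fact lists (its double quotes are swapped for single quotes below); `check` is not used by any cell shown here.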
check = '["attribute(task,root,value).", "attribute(number_rows,root,30).", "entity(field,root,0).", "attribute((field,name),0,n).", "attribute((field,type),0,string).", "attribute((field,entropy),0,1000).", "attribute((field,unique),0,10).", "entity(field,root,1).", "attribute((field,name),1,q1).", "attribute((field,type),1,number).", "attribute((field,entropy),1,1305).", "attribute((field,unique),1,30).", "attribute((field,interesting),1,true).", "entity(field,root,2).", "attribute((field,name),2,q2).", "attribute((field,type),2,number).", "attribute((field,entropy),2,1299).", "attribute((field,unique),2,30).", "entity(view,root,3).", "attribute((view,coordinates),3,cartesian).", "entity(mark,3,4).", "attribute((mark,type),4,point).", "entity(encoding,4,5).", "attribute((encoding,channel),5,x).", "attribute((encoding,field),5,n).", "entity(encoding,4,6).", "attribute((encoding,channel),6,y).", "attribute((encoding,field),6,q2).", "entity(encoding,4,7).", "attribute((encoding,channel),7,color).", "attribute((encoding,field),7,q1).", "entity(scale,3,8).", "attribute((scale,channel),8,x).", "attribute((scale,type),8,ordinal).", "entity(scale,3,9).", "attribute((scale,channel),9,y).", "attribute((scale,type),9,).", "entity(scale,3,10).", "attribute((scale,channel),10,color).", "attribute((scale,type),10,linear).", "attribute((scale,zero),10,true)."], "negative": ["attribute(task,root,value).", "attribute(number_rows,root,30).", "entity(field,root,0).", "attribute((field,name),0,n).", "attribute((field,type),0,string).", "attribute((field,entropy),0,1000).", "attribute((field,unique),0,10).", "entity(field,root,1).", "attribute((field,name),1,q1).", "attribute((field,type),1,number).", "attribute((field,entropy),1,1305).", "attribute((field,unique),1,30).", "attribute((field,interesting),1,true).", "entity(field,root,2).", "attribute((field,name),2,q2).", "attribute((field,type),2,number).", "attribute((field,entropy),2,1299).", "attribute((field,unique),2,30).", 
"entity(view,root,3).", "attribute((view,coordinates),3,cartesian).", "entity(mark,3,4).", "attribute((mark,type),4,point).", "entity(encoding,4,5).", "attribute((encoding,channel),5,size).", "attribute((encoding,field),5,q1).", "entity(encoding,4,6).", "attribute((encoding,channel),6,y).", "attribute((encoding,field),6,q2).", "entity(encoding,4,7).", "attribute((encoding,channel),7,x).", "attribute((encoding,field),7,n).", "entity(scale,3,8).", "attribute((scale,channel),8,size).", "attribute((scale,type),8,linear).", "attribute((scale,zero),8,true).", "entity(scale,3,9).", "attribute((scale,channel),9,y).", "attribute((scale,type),9,linear).", "attribute((scale,zero),9,true).", "entity(scale,3,10).", "attribute((scale,channel),10,x).", "attribute((scale,type),10,ordinal)."]'
check = check.replace('"', "'")

def _read_chart_payloads(pairs_path):
    """Return the JSON text of every non-blank line of an original-pairs file.

    The first 9 characters of each line are dropped before the JSON object
    (presumably a label such as ``positive:`` / ``negative:`` -- TODO confirm
    against the data files).
    """
    prefix_len = 9
    payloads = []
    with open(pairs_path, 'r') as file:
        for line in file:
            chart = line.strip()  # Remove leading/trailing whitespace, including newlines
            if not chart:
                continue
            payloads.append(chart[prefix_len:])
    return payloads


def write_dicts_to_json_training_file(model, temperature):
    """Build the training-pair file from model responses and the original chart pairs.

    Reads three module-level names that must be set before the call:
    ``optimal_completion_dir`` (directory of ``<num>_*.txt`` response files),
    ``original_pairs_dir`` (directory of ``<num>_pos_first.txt`` pair files)
    and ``training_data_path`` (output file, overwritten).

    For every ``.txt`` response whose content is not ``error``, the response
    JSON (with its ``field`` entry replaced by the one from the last chart of
    the matching pairs file) is used as the "positive" chart and each of the
    original charts as a "negative" one.  An original chart that equals the
    response (per ``compare_dicts``) is dropped and counted as removed.

    Output format: ``"[\\n"``, then one JSON object per line, each followed by
    ``",\\n"``, then ``"\\n]"``.  The comma after the last record is left in
    place on purpose -- the notebook cells that call this function strip it
    afterwards -- so the file is not valid JSON until that step has run.

    Args:
        model: currently unused; kept so existing calls keep working.
        temperature: currently unused; kept so existing calls keep working.

    Raises:
        ValueError: if a response filename has no ``_``, if the response
            mentions neither ``value`` nor ``summary``, or if the matching
            pairs file contains no charts.
    """
    removed, err, processed = 0, 0, 0

    with open(training_data_path, 'w') as training_file:
        training_file.write("[\n")

        for filename in os.listdir(optimal_completion_dir):
            # Skip stray entries (e.g. notebook checkpoint folders) before
            # trying to derive a pair number from them.
            if not filename.endswith('.txt'):
                continue
            processed += 1

            underscore_index = filename.find('_')
            if underscore_index == -1:
                raise ValueError("no _ found")
            num = filename[:underscore_index]

            with open(os.path.join(optimal_completion_dir, filename), 'r') as file:
                content = file.read()
            if content.strip() == "error":
                err += 1
                continue

            # The task is inferred from the raw response text.
            if "value" in content:
                task = "value"
            elif "summary" in content:
                task = "summary"
            else:
                raise ValueError("wrong task type")

            payloads = _read_chart_payloads(
                os.path.join(original_pairs_dir, f"{num}_pos_first.txt")
            )
            if not payloads:
                raise ValueError(f"no charts found for {num}")

            # Each use gets its own parse so the objects never share state.
            dicts_to_compare = [json.loads(payload) for payload in payloads]
            pairs = [dict_to_facts(json.loads(payload)) for payload in payloads]
            data_facts = json.loads(payloads[-1])["field"]

            content_json = json.loads(content)
            content_json["field"] = data_facts
            positive_facts = dict_to_facts(content_json)

            # Decide which of the first two original charts duplicate the
            # response *before* removing anything; deleting by index one at a
            # time shifts the list and hits the wrong (or a missing) element.
            is_duplicate = [compare_dicts(chart, content_json) for chart in dicts_to_compare[:2]]
            removed += sum(is_duplicate)
            kept_pairs = [
                pair
                for index, pair in enumerate(pairs)
                if not (index < len(is_duplicate) and is_duplicate[index])
            ]

            for pair in kept_pairs[:2]:
                data_to_append = json.dumps({
                    "task": task,
                    "positive": positive_facts,
                    "negative": pair,
                    "significant": ""
                })
                training_file.write(data_to_append + ",\n")

        training_file.write("\n]")

    total = 2 * (processed - err)
    # Avoid dividing by zero when there was nothing to process.
    percent = removed / total if total else 0.0
    print(f"Total {removed} chart pairs removed, {percent} removed out of {total}")

In [7]:
# Inputs and output for this run.  write_dicts_to_json_training_file reads
# optimal_completion_dir, original_pairs_dir and training_data_path as
# module-level names (gpt_responses_dir is only mentioned in its commented-out code).
gpt_responses_dir = "./gpt_responses"
optimal_completion_dir = "./to_dict_gpt_responses/kim2018_responses/gpt-4-0125-preview_0"
original_pairs_dir = "../rank/data/example_pairs_to_rank/kim2018"
training_data_path = "./training_data/kim2018/gpt-4-0125-preview_0_training.json"
write_dicts_to_json_training_file("gpt-4-0125-preview", 0)
try:
    # Read the content of the file
    with open(training_data_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # The writer leaves a comma after the final record.  Remove it only when
    # it sits directly before the closing bracket: taking "the last comma in
    # the file" would delete a comma inside the last record if this cell is
    # re-run on a file that has already been corrected.
    last_comma_index = -1
    stripped = content.rstrip()
    if stripped.endswith(']'):
        before_bracket = stripped[:-1].rstrip()
        if before_bracket.endswith(','):
            last_comma_index = len(before_bracket) - 1

    # Removing the trailing comma
    if last_comma_index != -1:
        content = content[:last_comma_index] + content[last_comma_index + 1:]

    # Writing the corrected content back to the file
    with open(training_data_path, 'w', encoding='utf-8') as file:
        file.write(content)

    print("The file has been successfully corrected and saved.")
except (OSError, UnicodeError) as e:
    print(f"An error occurred: {e}")

Total 474 chart pairs removed, 0.22170252572497662 removed out of 2138
The file has been successfully corrected and saved.


In [8]:
# Inputs and output for this run.  write_dicts_to_json_training_file reads
# optimal_completion_dir, original_pairs_dir and training_data_path as
# module-level names (gpt_responses_dir is only mentioned in its commented-out code).
gpt_responses_dir = "./gpt_responses"
optimal_completion_dir = "./to_dict_gpt_responses/kim2018_responses/altair_gpt-4-0125-preview_0"
original_pairs_dir = "../rank/data/example_pairs_to_rank/kim2018"
training_data_path = "./training_data/kim2018/altair_gpt-4-0125-preview_0_training.json"
write_dicts_to_json_training_file("altair_gpt-4-0125-preview", 0)
try:
    # Read the content of the file
    with open(training_data_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # The writer leaves a comma after the final record.  Remove it only when
    # it sits directly before the closing bracket: taking "the last comma in
    # the file" would delete a comma inside the last record if this cell is
    # re-run on a file that has already been corrected.
    last_comma_index = -1
    stripped = content.rstrip()
    if stripped.endswith(']'):
        before_bracket = stripped[:-1].rstrip()
        if before_bracket.endswith(','):
            last_comma_index = len(before_bracket) - 1

    # Removing the trailing comma
    if last_comma_index != -1:
        content = content[:last_comma_index] + content[last_comma_index + 1:]

    # Writing the corrected content back to the file
    with open(training_data_path, 'w', encoding='utf-8') as file:
        file.write(content)

    print("The file has been successfully corrected and saved.")
except (OSError, UnicodeError) as e:
    print(f"An error occurred: {e}")

Total 504 chart pairs removed, 0.22971741112123975 removed out of 2194
The file has been successfully corrected and saved.


In [9]:
# Inputs and output for this run.  write_dicts_to_json_training_file reads
# optimal_completion_dir, original_pairs_dir and training_data_path as
# module-level names (gpt_responses_dir is only mentioned in its commented-out code).
gpt_responses_dir = "./gpt_responses"
optimal_completion_dir = "./to_dict_gpt_responses/kim2018_responses/gpt-4-0613_0"
original_pairs_dir = "../rank/data/example_pairs_to_rank/kim2018"
training_data_path = "./training_data/kim2018/gpt-4-0613_0_training.json"
write_dicts_to_json_training_file("gpt-4-0613", 0)
try:
    # Read the content of the file
    with open(training_data_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # The writer leaves a comma after the final record.  Remove it only when
    # it sits directly before the closing bracket: taking "the last comma in
    # the file" would delete a comma inside the last record if this cell is
    # re-run on a file that has already been corrected.
    last_comma_index = -1
    stripped = content.rstrip()
    if stripped.endswith(']'):
        before_bracket = stripped[:-1].rstrip()
        if before_bracket.endswith(','):
            last_comma_index = len(before_bracket) - 1

    # Removing the trailing comma
    if last_comma_index != -1:
        content = content[:last_comma_index] + content[last_comma_index + 1:]

    # Writing the corrected content back to the file
    with open(training_data_path, 'w', encoding='utf-8') as file:
        file.write(content)

    print("The file has been successfully corrected and saved.")
except (OSError, UnicodeError) as e:
    print(f"An error occurred: {e}")

Total 464 chart pairs removed, 0.22700587084148727 removed out of 2044
The file has been successfully corrected and saved.


In [10]:
# Inputs and output for this run.  write_dicts_to_json_training_file reads
# optimal_completion_dir, original_pairs_dir and training_data_path as
# module-level names (gpt_responses_dir is only mentioned in its commented-out code).
gpt_responses_dir = "./gpt_responses"
optimal_completion_dir = "./to_dict_gpt_responses/kim2018_responses/altair_gpt-4-0613_0"
original_pairs_dir = "../rank/data/example_pairs_to_rank/kim2018"
training_data_path = "./training_data/kim2018/altair_gpt-4-0613_0_training.json"
write_dicts_to_json_training_file("altair_gpt-4-0613", 0)
try:
    # Read the content of the file
    with open(training_data_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # The writer leaves a comma after the final record.  Remove it only when
    # it sits directly before the closing bracket: taking "the last comma in
    # the file" would delete a comma inside the last record if this cell is
    # re-run on a file that has already been corrected.
    last_comma_index = -1
    stripped = content.rstrip()
    if stripped.endswith(']'):
        before_bracket = stripped[:-1].rstrip()
        if before_bracket.endswith(','):
            last_comma_index = len(before_bracket) - 1

    # Removing the trailing comma
    if last_comma_index != -1:
        content = content[:last_comma_index] + content[last_comma_index + 1:]

    # Writing the corrected content back to the file
    with open(training_data_path, 'w', encoding='utf-8') as file:
        file.write(content)

    print("The file has been successfully corrected and saved.")
except (OSError, UnicodeError) as e:
    print(f"An error occurred: {e}")

Total 452 chart pairs removed, 0.23203285420944558 removed out of 1948
The file has been successfully corrected and saved.


In [11]:
# Inputs and output for this run.  write_dicts_to_json_training_file reads
# optimal_completion_dir, original_pairs_dir and training_data_path as
# module-level names (gpt_responses_dir is only mentioned in its commented-out code).
gpt_responses_dir = "./gpt_responses"
optimal_completion_dir = "./to_dict_gpt_responses/kim2018_responses/gpt-3.5-turbo-0125_0"
original_pairs_dir = "../rank/data/example_pairs_to_rank/kim2018"
training_data_path = "./training_data/kim2018/gpt-3.5-turbo-0125_0_training.json"
write_dicts_to_json_training_file("gpt-3.5-turbo-0125", 0)
try:
    # Read the content of the file
    with open(training_data_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # The writer leaves a comma after the final record.  Remove it only when
    # it sits directly before the closing bracket: taking "the last comma in
    # the file" would delete a comma inside the last record if this cell is
    # re-run on a file that has already been corrected.
    last_comma_index = -1
    stripped = content.rstrip()
    if stripped.endswith(']'):
        before_bracket = stripped[:-1].rstrip()
        if before_bracket.endswith(','):
            last_comma_index = len(before_bracket) - 1

    # Removing the trailing comma
    if last_comma_index != -1:
        content = content[:last_comma_index] + content[last_comma_index + 1:]

    # Writing the corrected content back to the file
    with open(training_data_path, 'w', encoding='utf-8') as file:
        file.write(content)

    print("The file has been successfully corrected and saved.")
except (OSError, UnicodeError) as e:
    print(f"An error occurred: {e}")

Total 47 chart pairs removed, 0.10681818181818181 removed out of 440
The file has been successfully corrected and saved.


In [12]:
# Sanity check: confirm that a training file parses as JSON.
# NOTE(review): this path names a gpt-4-1106-preview file, which none of the
# cells shown above writes -- confirm it is the file you mean to validate.
file_path = "./training_data/kim2018/gpt-4-1106-preview_0_training.json"

try:
    with open(file_path, 'r') as file:
        # json.load raises JSONDecodeError when the content is malformed
        data = json.load(file)
except json.JSONDecodeError as e:
    print("The file contains invalid JSON.")
    print("Error details:", e)
else:
    print("The file contains valid JSON.")


The file contains valid JSON.
