In [2]:
import os
import sys
cwd = os.getcwd()

# Assuming the target directory is one level up from the current working directory
parent_dir = os.path.dirname(cwd)

# Add the parent directory to sys.path
sys.path.append(parent_dir)

from draco import Draco
import json
import shutil
import pandas as pd
from typing import Dict, List, Any
import pickle


import numpy as np
from sklearn import svm

import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import matplotlib.colors as colors

from draco.data_utils import pairs_to_vec

In [3]:
default_draco = Draco()

In [4]:
from sklearn import svm
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

def train_model(data: pd.DataFrame, test_size: float = 0.3, C_values: list = [0.1, 0.5, 1, 5, 10], quiet=False):
    # Prepare the data
    X = data.negative - data.positive
    size = len(X)
    y = np.ones(size)

    # Flip a few examples at random
    idx = np.random.choice([False, True], size=size, p=[0.5, 0.5])
    X[idx] = -X[idx]
    y[idx] = -y[idx]

    # Split the data into train and dev sets
    X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=test_size)

    best_score = 0
    best_C = C_values[0]

    # Grid search on C
    for C in C_values:
        clf = svm.LinearSVC(C=C, fit_intercept=False)
        clf.fit(X_train, y_train)
        score = clf.score(X_dev, y_dev)

        if not quiet:
            print(f"C: {C}, Dev score: {score}")

        if score > best_score:
            best_score = score
            best_C = C

    # Retrain on the entire dataset with the best C
    clf_final = svm.LinearSVC(C=best_C, fit_intercept=False)
    clf_final.fit(X, y)

    if not quiet:
        print("Best C: ", best_C)
        print("Final Train Score: ", clf_final.score(X, y))

    return clf_final



In [5]:
def swap_json_fields(json_file_path, ranks):
    # Read the JSON file
   
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)

    if len(json_data) != len(ranks):
        raise ValueError("Length of ranks does not match the number of JSON objects in the file")
    
    print("swapping and writing:",len(json_data), len(ranks))
    
    updated_json_data = []
    i = 1
    for json_obj, chart in zip(json_data, ranks):
        if chart == "chart 2":
            # Swap 'positive' and 'negative' fields
            # copy file f'./DracoGPT_data/kim2018/{i}_neg_first.txt' to f'./DracoGPT_data/kim2018_for_reader/agree/{i}.txt'
            source_file = f'./DracoGPT_data/kim2018/{i}_neg_first.txt'
            destination_file = f'./DracoGPT_data/kim2018_for_reader/disagree/{i}.txt'
            shutil.copyfile(source_file, destination_file)
            
            json_obj['positive'], json_obj['negative'] = json_obj['negative'], json_obj['positive']
            updated_json_data.append(json_obj)
        elif chart == "chart 1":
            # Keep the json_obj as is
            source_file = f'./DracoGPT_data/kim2018/{i}_pos_first.txt'
            destination_file = f'./DracoGPT_data/kim2018_for_reader/agree/{i}.txt'
            shutil.copyfile(source_file, destination_file)
            
            
            updated_json_data.append(json_obj)
        elif chart == "conflicting":
            source_file = f'./DracoGPT_data/kim2018/{i}_pos_first.txt'
            destination_file = f'./DracoGPT_data/kim2018_for_reader/conflict/{i}.txt'
            shutil.copyfile(source_file, destination_file)
            
            f'./DracoGPT_data/kim2018/{i}_pos_first.txt'
            i += 1
            continue
        else:
            # Raise an error for any other value
            raise ValueError(f"Invalid value in ranks: {chart}")
        i += 1
    print("final i", i)
    return updated_json_data


# json_file_path = f'./DracoGPT_data/kim2018_GPT/kim2018_draco2.json'
# len(swap_json_fields(json_file_path, res_kim))

In [6]:
import pickle
res_kim = None
def load_data(model, temp):
    kim_file_path = f'./gpt_responses/kim2018_{model}_{temp}_ranking.pkl'

    # Load the data from kim_GPT.pkl
    with open(kim_file_path, 'rb') as file:
        res_kim = pickle.load(file)


    # Combine the lists
    combined_list = res_kim
    return combined_list, res_kim

combined_list, res_kim = load_data('gpt-4-1106-preview', 0)
print(combined_list)

# Count occurrences
count_chart_1 = combined_list.count('chart 1')
count_chart_2 = combined_list.count('chart 2')
count_conflicting = combined_list.count('conflicting')

# Calculate total number of items for percentage calculation
total_items = len(combined_list)

# Calculate percentages
percentage_chart_1 = (count_chart_1 / total_items) * 100
percentage_chart_2 = (count_chart_2 / total_items) * 100
percentage_conflicting = (count_conflicting / total_items) * 100

# Print results
print(f"Percentage of 'chart 1': {percentage_chart_1}%")
print(f"Percentage of 'chart 2': {percentage_chart_2}%")
print(f"Percentage of 'conflicting': {percentage_conflicting}%")

['chart 2', 'chart 2', 'chart 2', 'chart 2', 'chart 2', 'chart 2', 'chart 2', 'conflicting', 'chart 1', 'chart 2', 'conflicting', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'conflicting', 'conflicting', 'conflicting', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 2', 'conflicting', 'chart 2', 'chart 2', 'conflicting', 'conflicting', 'chart 1', 'chart 2', 'conflicting', 'conflicting', 'chart 1', 'conflicting', 'chart 1', 'conflicting', 'chart 1', 'conflicting', 'conflicting', 'chart 1', 'conflicting', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 2', 'conflicting', 'conflicting', 'conflicting', 'chart 1', 'chart 1', 'conflicting', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'conflicting', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'chart 1', 'conflicting', 'chart 2', 'chart 2', 'chart 2', 'chart 2', 'chart 2', 'chart 2', 'chart 2', 'c

In [7]:

def write_GPT_training_data(model, temp):
    json_file_path = './data/kim2018_draco2.json'
    updated_json = swap_json_fields(json_file_path, res_kim)
    directory = os.path.dirname(json_file_path)
    new_file_path = os.path.join(directory, f'modified_kim_{model}_{temp}.json')
    with open(new_file_path, 'w') as file:
        json.dump(updated_json, file, indent = 4)
    
#     json_file_path = f'./DracoGPT_data/saket2018_GPT/saket2018_draco2.json'
#     updated_json = swap_json_fields(json_file_path, res_saket)
#     directory = os.path.dirname(json_file_path)
#     new_file_path = os.path.join(directory, f'modified_saket_{model}_{temp}.json')
#     with open(new_file_path, 'w') as file:
#         json.dump(updated_json, file, indent = 4)

write_GPT_training_data('gpt-4-1106-preview', 0)

swapping and writing: 1152 1152


FileNotFoundError: [Errno 2] No such file or directory: './DracoGPT_data/kim2018/1_neg_first.txt'

In [7]:
filtered_combined_list = [element for element in combined_list if element in ['chart 1', 'chart 2']]


index_map = {}
filtered_index = 0

for original_index, value in enumerate(combined_list):
    if value in ["chart 1", "chart 2"]:
        index_map[filtered_index] = original_index
        filtered_index += 1

In [8]:
def train_data_prep(model, temp):
    kim = {}
#     saket = {}
#     with open(f'./DracoGPT_Data/saket2018_GPT/modified_saket_{model}_{temp}.json') as file:

#         i = 0
#         json_data = json.load(file)

#         for pair in json_data:
#             pair["source"] = "saket"
#             pair["pair_id"] = f'{pair["source"]}_{i}'
#             saket[pair["pair_id"]] = pair
#             i += 1


    with open(f'./DracoGPT_Data/kim2018_GPT/modified_kim_{model}_{temp}.json') as file:

        i = 0
        json_data = json.load(file)
        print(len(json_data))
        for pair in json_data:
            pair["source"] = "kim"
            pair["pair_id"] = f'{pair["source"]}_{index_map[i]}'
            kim[pair["pair_id"]] = pair
            i += 1

    combined = kim #| saket
    
    baseline_train_data = pairs_to_vec(combined)
    diff = baseline_train_data.positive - baseline_train_data.negative
    print(type(diff))
    for index, row in diff.iterrows():
        non_zero_columns = row[row != 0]
        number_index = index.rsplit('_', 1)[-1]
        if_agree = None
        outcome = ''
        if combined_list[int(number_index)] == 'chart 1':
            outcome = 'agree'
        elif combined_list[int(number_index)] == 'chart 2':
            outcome = 'disagree'
        else:
            raise ValueError("error")
        with open(f'./DracoGPT_data/kim2018_for_reader/{outcome}/{int(number_index)+1}.txt', 'a') as file:
            file.write('\n\n' + non_zero_columns.to_string())
        
    
    diff.to_csv("./DracoGPT_data/kim2018_for_reader/pos-neg.csv")
    
    
    assert set(baseline_train_data.negative.columns) == set(
        default_draco.soft_constraint_names
    ), "Feature names do not match."
    clf = train_model(baseline_train_data)

    features = baseline_train_data.negative.columns
    print(features)
    baseline_weights = {}

    with open(f'./DracoGPT_data/DracoGPT_weights/DracoGPT_{model}_{temp}_weights', 'a') as fout:
        for feature, weight in zip(features, clf.coef_[0]):
            baseline_weights[f"{feature}_weight"] = int(weight * 1000)
            fout.write(f"#const {feature}_weight = {int(weight * 1000)}.\n")
            
            
            
    '''
    Treat conflicting pairs
    '''
    combined = {}
    with open('./docs/applications/data/kim2018_draco2.json') as file:
        
        i = 1
        json_data = json.load(file)
        print(len(json_data),len(combined_list))
        for (pair, outcome) in zip(json_data, combined_list):
            if outcome == "conflicting":
                pair["source"] = "kim"
                pair["pair_id"] = f'{pair["source"]}_{i}'
                combined[pair["pair_id"]] = pair
            i += 1
    
    baseline_train_data = pairs_to_vec(combined)
    diff = baseline_train_data.positive - baseline_train_data.negative
#     print("conflicting diff dim:", diff)
    for index, row in diff.iterrows():
        non_zero_columns = row[row != 0]
        number_index = index.rsplit('_', 1)[-1]
        if_agree = None
        outcome = ''
        with open(f'./DracoGPT_data/kim2018_for_reader/conflict/{number_index}.txt', 'a') as file:
            file.write('\n\n' + non_zero_columns.to_string())



print("Training DracoGPT on Kim data")
train_data_prep('gpt-4-1106-preview', 0)

INFO:draco.data_utils:Running 85 partitions of 845 items in parallel on {processes} processes.


Training DracoGPT on Kim data
845


INFO:draco.data_utils:Hash of dataframe: -5562377976351535379


<class 'pandas.core.frame.DataFrame'>
C: 0.1, Dev score: 0.9881889763779528
C: 0.5, Dev score: 0.984251968503937
C: 1, Dev score: 0.984251968503937
C: 5, Dev score: 0.984251968503937
C: 10, Dev score: 0.984251968503937
Best C:  0.1


INFO:draco.data_utils:Running 31 partitions of 307 items in parallel on {processes} processes.


Final Train Score:  0.98698224852071
Index(['aggregate', 'aggregate_count', 'aggregate_group_by_raw',
       'aggregate_max', 'aggregate_mean', 'aggregate_median', 'aggregate_min',
       'aggregate_no_discrete', 'aggregate_stdev', 'aggregate_sum',
       ...
       'value_line', 'value_point', 'value_rect', 'value_text', 'value_tick',
       'x_col', 'x_row', 'x_y_raw', 'y_col', 'y_row'],
      dtype='object', length=147)
1152 1152


INFO:draco.data_utils:Hash of dataframe: -753024968425796386


In [9]:
kim = {}
# saket = {}
# with open(f'./DracoGPT_Data/saket2018_GPT/saket2018_draco2.json') as file:

#     i = 0
#     json_data = json.load(file)

#     for pair in json_data:
#         pair["source"] = "saket"
#         pair["pair_id"] = f'{pair["source"]}_{i}'
#         saket[pair["pair_id"]] = pair
#         i += 1


with open('./docs/applications/data/kim2018_draco2.json') as file:

    i = 0
    json_data = json.load(file)

    for pair in json_data:
        pair["source"] = "kim"
        pair["pair_id"] = f'{pair["source"]}_{i}'
        kim[pair["pair_id"]] = pair
        i += 1

combined = kim #| saket
    
baseline_train_data = pairs_to_vec(combined)
print(baseline_train_data.shape)

clf = train_model(baseline_train_data)

features = baseline_train_data.negative.columns
baseline_weights = {}

with open('./DracoGPT_data/DracoGPT_weights/original_weights', 'w') as fout:
    for feature, weight in zip(features, clf.coef_[0]):
        baseline_weights[f"{feature}_weight"] = int(weight * 1000)
        fout.write(f"#const {feature}_weight = {int(weight * 1000)}.\n")

INFO:draco.data_utils:Running 116 partitions of 1152 items in parallel on {processes} processes.
INFO:draco.data_utils:Hash of dataframe: -4999795338384740180


(1152, 296)
C: 0.1, Dev score: 0.9421965317919075
C: 0.5, Dev score: 0.9364161849710982
C: 1, Dev score: 0.9364161849710982
C: 5, Dev score: 0.9364161849710982
C: 10, Dev score: 0.9364161849710982
Best C:  0.1
Final Train Score:  0.9470486111111112




In [10]:
'''
Now we select the subset of data used to train dracogpt
'''

def filter_data(json_file_path, ranks):
    # Read the JSON file
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)

    if len(json_data) != len(ranks):
        raise ValueError("Length of ranks does not match the number of JSON objects in the file")

    updated_json_data = []
    for json_obj, chart in zip(json_data, ranks):
        print(chart)
        if chart == "chart 2" or chart == 'chart 1':
            updated_json_data.append(json_obj)
        else:
            continue
    print(len(updated_json_data))
    return updated_json_data


def write_draco_subset_training_data(model, temp):
    json_file_path = './docs/applications/data/kim2018_draco2.json'
    updated_json = filter_data(json_file_path, res_kim)
    print(len(updated_json))
    directory = os.path.dirname(json_file_path)
    new_file_path = os.path.join(directory, f'subset_kim.json')
    with open(new_file_path, 'w') as file:
        json.dump(updated_json, file, indent = 4)
    
#     json_file_path = f'./DracoGPT_data/saket2018_GPT/saket2018_draco2.json'
#     updated_json = filter_data(json_file_path, res_saket)
#     print(len(updated_json))
#     directory = os.path.dirname(json_file_path)
#     new_file_path = os.path.join(directory, f'subset_saket.json')
#     with open(new_file_path, 'w') as file:
#         json.dump(updated_json, file, indent = 4)

write_draco_subset_training_data('gpt-4-1106-preview', 0)


chart 2
chart 2
chart 2
chart 2
chart 2
chart 2
chart 2
conflicting
chart 1
chart 2
conflicting
chart 1
chart 1
chart 1
chart 1
chart 1
chart 1
chart 1
chart 1
conflicting
conflicting
conflicting
chart 1
chart 1
chart 1
chart 1
chart 1
chart 1
chart 1
chart 1
chart 1
chart 2
conflicting
chart 2
chart 2
conflicting
conflicting
chart 1
chart 2
conflicting
conflicting
chart 1
conflicting
chart 1
conflicting
chart 1
conflicting
conflicting
chart 1
conflicting
chart 1
chart 1
chart 1
chart 1
chart 1
chart 1
chart 2
conflicting
conflicting
conflicting
chart 1
chart 1
conflicting
chart 1
chart 1
chart 1
chart 1
chart 1
chart 1
conflicting
chart 1
chart 1
chart 1
chart 1
chart 1
conflicting
chart 2
chart 2
chart 2
chart 2
chart 2
chart 2
chart 2
conflicting
conflicting
conflicting
chart 2
conflicting
chart 2
chart 2
chart 2
chart 2
chart 2
chart 2
chart 2
chart 2
chart 2
chart 1
conflicting
conflicting
conflicting
conflicting
conflicting
chart 2
conflicting
chart 1
conflicting
chart 1
chart 1


In [11]:
kim = {}
# saket = {}
# with open(f'./DracoGPT_Data/saket2018_GPT/subset_saket.json') as file:

#     i = 0
#     json_data = json.load(file)

#     for pair in json_data:
#         pair["source"] = "saket"
#         pair["pair_id"] = f'{pair["source"]}_{i}'
#         saket[pair["pair_id"]] = pair
#         i += 1


with open(f'./DracoGPT_Data/kim2018_GPT/subset_kim.json') as file:

    i = 0
    json_data = json.load(file)

    for pair in json_data:
        pair["source"] = "kim"
        pair["pair_id"] = f'{pair["source"]}_{i}'
        kim[pair["pair_id"]] = pair
        i += 1

combined = kim #| saket
    
baseline_train_data = pairs_to_vec(combined)
print(len(baseline_train_data))
clf = train_model(baseline_train_data)

features = baseline_train_data.negative.columns
baseline_weights = {}

with open(f'./DracoGPT_data/DracoGPT_weights/original_subset_weights', 'w') as fout:
    for feature, weight in zip(features, clf.coef_[0]):
        baseline_weights[f"{feature}_weight"] = int(weight * 1000)
        fout.write(f"#const {feature}_weight = {int(weight * 1000)}.\n")

INFO:draco.data_utils:Running 85 partitions of 845 items in parallel on {processes} processes.
INFO:draco.data_utils:Hash of dataframe: 1496549123444742688


845
C: 0.1, Dev score: 0.9291338582677166
C: 0.5, Dev score: 0.9251968503937008
C: 1, Dev score: 0.9212598425196851
C: 5, Dev score: 0.9251968503937008
C: 10, Dev score: 0.9291338582677166
Best C:  0.1
Final Train Score:  0.9491124260355029




In [12]:
# import json

# # Paths to the original and modified files
# # original_file_path = './DracoGPT_Data/saket2018_GPT/saket2018_draco2.json'
# # modified_file_path = './DracoGPT_Data/saket2018_GPT/modified_saket2018_draco2.json'

# original_file_path = './DracoGPT_Data/kim2018_GPT/kim2018_draco2.json'
# modified_file_path = './DracoGPT_Data/kim2018_GPT/modified_kim2018_draco2.json'

# # Read the original file
# with open(original_file_path, 'r') as file:
#     original_data = json.load(file)

# # Read the modified file
# with open(modified_file_path, 'r') as file:
#     modified_data = json.load(file)

# # Ensure both files have the same number of elements
# if len(original_data) != len(modified_data):
#     print("Files have different number of elements.")
# else:
#     # Compare each element
#     for original, modified in zip(original_data, modified_data):
#         print(original == modified)
