In [None]:
import os
import pickle
import numpy as np
from statsmodels.stats.proportion import proportion_confint
import copy
import shutil
import json
from utils import *

In [None]:
# Map every model name (a sub-folder of RESULTS_DIR) to the folder holding its certificates.
RESULTS_DIR = 'results'
model2dir = {}
# os.listdir returns entries in arbitrary order; sorting keeps the model order stable across runs.
for folder in sorted(os.listdir(RESULTS_DIR)):
    folder_path = os.path.join(RESULTS_DIR, folder)
    # Plain files sitting next to the model folders are not models; listing them
    # as a directory below would raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        continue
    model2dir[folder] = folder_path
    print(f"{folder} has {len(os.listdir(folder_path))} certificates")
len(model2dir)

In [110]:
# Discard every model whose results folder contains fewer than MIN_CERTIFICATES entries.
MIN_CERTIFICATES = 10
remove_models = [
    name
    for name, folder_path in model2dir.items()
    if len(os.listdir(folder_path)) < MIN_CERTIFICATES
]

# Report the dropped models, then remove them from the mapping in place.
for name in remove_models:
    print(name)
for name in remove_models:
    model2dir.pop(name)

In [None]:
#Certifications
# For every model, load each pickled certificate, count the correct answers among
# the first NUM_SAMPLES results and compute a Clopper-Pearson ('beta') confidence
# interval at alpha=0.05. Each certificate is stored as
# (correct, total, lower, upper, accuracy).

model2certificates = {}
NUM_SAMPLES = 250
for model, cert_dir in model2dir.items():
    model2certificates[model] = {}
    for cert in os.listdir(cert_dir):
        if not cert.endswith('.pkl'):
            continue
        cert_path = os.path.join(cert_dir, cert)
        # NOTE(review): pickle.load can execute arbitrary code; only load result files you trust.
        with open(cert_path, 'rb') as cert_file:
            experiment_results = pickle.load(cert_file)
        # Some runs store (detailed_results, correct answers num, total queries num).
        # Individual results are dicts, so checking the first element avoids
        # unwrapping a plain list that happens to hold exactly three results.
        if len(experiment_results) == 3 and not isinstance(experiment_results[0], dict):
            experiment_results = experiment_results[0]
        correct = 0
        total = 0
        for i, result in enumerate(experiment_results):
            if i >= NUM_SAMPLES:
                break
            correct += result['result'][0]
            total += 1
        if total == 0:
            # An empty certificate has no accuracy or interval; skip it rather than divide by zero.
            print(f"Skipping empty certificate: {cert_path}")
            continue
        (lower, upper) = proportion_confint(correct, total, alpha=0.05, method='beta')
        model2certificates[model][cert] = (correct, total, lower, upper, correct/total)

#Print summary
# Per model: (mean lower, mean upper, mean accuracy, std lower, std upper, std accuracy)
# taken over that model's certificates.

model_result_summary = {}
for model, certificates in model2certificates.items():
    if not certificates:
        # No usable certificate: the mean/std would be NaN and pollute the saved summary.
        print(f"Model: {model} has no usable certificates, skipping")
        continue
    print(f"Model: {model}")
    certificates_lower = [cert[2] for cert in certificates.values()]
    certificates_upper = [cert[3] for cert in certificates.values()]
    certificates_acc = [cert[4] for cert in certificates.values()]
    mean_lower, mean_upper, mean_acc = np.mean(certificates_lower), np.mean(certificates_upper), np.mean(certificates_acc)
    std_lower, std_upper, std_acc = np.std(certificates_lower), np.std(certificates_upper), np.std(certificates_acc)
    print(f" Mean lower bound: {mean_lower}, Mean upper bound: {mean_upper}, Mean accuracy: {mean_acc}")
    print(f"Std lower bound: {std_lower}, Std upper bound: {std_upper}, Sd accuracy: {std_acc}")
    model_result_summary[model] = (mean_lower, mean_upper, mean_acc, std_lower, std_upper, std_acc)

#Save summary
SAVE_FILE = 'model_summary.json'
with open(SAVE_FILE, 'w') as summary_file:
    json.dump(model_result_summary, summary_file, indent=4)

In [None]:
import numpy as np
from collections import defaultdict

# Turn model_result_summary into a LaTeX table. Each summary entry is expected to be
# (mean lower, mean upper, mean acc, std lower, std upper, std acc), and the model
# name is split on '_' to recover family, size, precision and specification kind.

# Substring looked for in the model name -> family label shown in the table.
# The first matching entry wins, in this order.
MODEL_FAMILIES = {'phi': 'Phi-3', 'llama': 'Llama', 'mistral': 'Mistral'}


def _mean_pm_std(mean, std):
    """Format a mean/std pair as 'm.mm \\pm s.ss' (LaTeX source, without the $ delimiters)."""
    return rf"{np.round(mean, 2):.2f} \pm {np.round(std, 2):.2f}"


# Group results by base model
model_groups = defaultdict(lambda: defaultdict(list))

for model_name, stats in model_result_summary.items():
    # Parse model name components
    parts = model_name.split('_')

    # Extract model family; names matching no known family are left out of the table
    model_family = next(
        (label for token, label in MODEL_FAMILIES.items() if token in model_name),
        None,
    )
    if model_family is None:
        continue
    if len(parts) < 2:
        # Without a second '_'-separated component there is no size to show.
        print(f"Cannot parse model size from '{model_name}', skipping")
        continue

    # The closing parenthesis of the group key is injected here, right after the 'B'
    # of the size; the key below therefore only opens the parenthesis.
    size = parts[1].upper().replace("B", "B)")

    # Extract precision and specification
    precision = '4bit' if '4bit' in model_name else 'fp16'
    # The last name component is the specification kind, e.g. 'vanilla' -> 'Vanilla'.
    spec = parts[-1].title()
    print(spec)

    # Store results with model grouping key
    key = f"{model_family} ({size}"
    model_groups[key][precision].append({
        'spec': spec,
        'lower': _mean_pm_std(stats[0], stats[3]),
        'upper': _mean_pm_std(stats[1], stats[4]),
        'acc': _mean_pm_std(stats[2], stats[5])
    })

# Generate LaTeX table rows
latex_rows = []
spec_order = ['Vanilla', 'Distractor']


def _spec_rank(entry):
    """Sort key: position in spec_order; kinds not listed there go last, alphabetically."""
    name = entry['spec']
    if name in spec_order:
        return (spec_order.index(name), name)
    return (len(spec_order), name)


for model, precisions in model_groups.items():
    num_rows = sum(len(specs) for specs in precisions.values())
    latex_rows.append(rf"\multirow{{{num_rows}}}{{*}}{{\makecell{{{model}}}}}")
    last_precision = list(precisions.keys())[-1]

    for precision, specs in precisions.items():
        sorted_specs = sorted(specs, key=_spec_rank)

        for i, entry in enumerate(sorted_specs):
            baseline = ''
            if entry['spec'] == 'Shuffle':
                baseline = r'$todo \pm todo$'  # Update baseline values from your data

            row = [
                # The precision cell spans exactly the rows of this precision group.
                rf"& \multirow{{{len(sorted_specs)}}}{{*}}{{{precision}}}" if i == 0 else "&",
                f"& {baseline}" if baseline else "& -",
                f"& {entry['spec']}",
                f"& ${entry['lower']}$",
                f"& ${entry['upper']}$",
                f"& ${entry['acc']}$ \\\\"
            ]
            latex_rows.append(' '.join(row))

        # Separate precision groups of the same model, but not after the last one.
        if precision != last_precision:
            latex_rows.append(r"\cline{2-7}\noalign{\smallskip}")

    latex_rows.append(r"\midrule")

# Final table assembly
# NOTE(review): the tabular column spec declares 8 columns while the header and the
# generated rows use 7 -- confirm which column type should be dropped.
latex_table = r"""
\begin{table*}[!tb]
\caption{Certification Results for Different LLMs}
\vspace{0.1cm}
\centering
\scriptsize{
\begin{tabular}{@{} p{1cm} ccclrrr@{}}
\toprule
Model & Precision & Baseline & Specification Kind & \makecell{Avg. \\ Lower Bound} & \makecell{Avg. \\ Upper Bound} & \makecell{Avg. \\ Accuracy} \\
\midrule
""" + "\n".join(latex_rows) + r"""
\bottomrule
\end{tabular}}
\label{tab:certificates}
\end{table*}
"""

print(latex_table)

In [8]:
#We may need this for more detailed analysis
from utils import load_aliases
prime_folder = 'quacer_c_prime'


def _load_prime_json(filename):
    """Parse one JSON file from prime_folder, closing the file handle afterwards."""
    with open(os.path.join(prime_folder, filename)) as json_file:
        return json.load(json_file)


actual_rels = _load_prime_json('actual_rels.json')
edge2src = _load_prime_json('edge2src.json')
graph = _load_prime_json('graph.json')
id2name = _load_prime_json('id2name.json')
id2source = _load_prime_json('id2source.json')
rels = _load_prime_json('rels.json')
graph_text_edge = _load_prime_json('graph_text_edge.json')
entity_aliases = load_aliases(os.path.join(prime_folder, 'entity_aliases.txt'))
relations_aliases = load_aliases(os.path.join(prime_folder, 'relation_aliases.txt'))

In [None]:
# Detailed per-certificate analysis. For every model and certificate file this collects:
#   model_files[model]             certificate ids seen for the model
#   results[model]                 (lower, upper) confidence bounds per certificate
#   results_data[model]            (correct, total, #unique queries, #repeat queries, same-query accuracy)
#   results_ids[model][certi]      (correct, total, accuracy)
#   question_answers_models[certi][model]   one dict per evaluated query
#   model2ks[model]                set of len(path_id) - 1 values seen for the model
model_files = {k:[] for k in model2dir.keys()}
results = {k:[] for k in model2dir.keys()}
results_data = {k:[] for k in model2dir.keys()}
all_entities = []
results_ids = {k:{} for k in model2dir.keys()}
question_answers_models = {}
model2ks = {}
NUM_SAMPLES = 250
for model, model_dir in model2dir.items():
    ks = set()
    for file in os.listdir(model_dir):
        file_path = os.path.join(model_dir, file)
        try:
            # NOTE(review): pickle.load can execute arbitrary code; only load result files you trust.
            with open(file_path, 'rb') as result_file:
                experiment_results = pickle.load(result_file)
        except Exception as err:
            # Best effort: report unreadable entries and move on. Catching Exception
            # (not a bare except) lets KeyboardInterrupt still stop the cell.
            print(f"{file_path}: {err!r}")
            continue
        correct = 0
        total = 0
        idx = file.index('.')
        # The certificate id is the file name up to and including its first '.'.
        certi = file[:idx+1]
        all_entities.append(certi)
        model_files[model].append(certi)
        unique_queries = {}
        repeat_queries = 0
        unique_paths = set()
        # Fewer than 10 top-level items is taken to mean a wrapper whose first
        # element holds the detailed results.
        if len(experiment_results) < 10:
            experiment_results = experiment_results[0]
        for i, result in enumerate(experiment_results):
            if i >= NUM_SAMPLES:
                break
            if isinstance(result, str):
                print(result)
                print(file_path)
                raise ValueError('Error in file: ', file)
            evaluation = result['result'][0]
            if evaluation == 1:
                correct += 1
            path = tuple(result['path_id'])
            ks.add(len(path)-1)
            unique_paths.add(path)

            entry = {'query': result['question'], 'eval': evaluation,
                     'context': result['context'], 'model_answer': result['model_answer'],
                     'correct_answers': result['correct_answers'], 'correct_ids': result['correct_ids'],
                     'answer_options': result['options'], 'correct_ans_num': result['correct_ans_num']}
            # 'distractor' is only recorded for results that carry one.
            if 'distractor' in result:
                entry['distractor'] = result['distractor']
            entry['path_id'] = result['path_id']
            question_answers_models.setdefault(certi, {}).setdefault(model, []).append(entry)

            # Evaluations grouped by question text, to detect repeated queries.
            unique_queries.setdefault(result['question'], []).append(evaluation)
            total += 1
        if total != NUM_SAMPLES:
            raise ValueError('Error in file total: ', file, total)
        results_ids[model][certi] = (correct, total, correct/total)
        results[model].append(proportion_confint(correct, total, method='beta'))
        repeat_queries = total - len(unique_queries)
        # Average accuracy over questions asked more than once.
        # NOTE(review): repeated questions that were never answered correctly
        # (mean == 0) are excluded from this average -- confirm this is intended.
        repeated_means = [np.mean(evals) for evals in unique_queries.values() if len(evals) > 1]
        repeated_means = [mean for mean in repeated_means if mean > 0]
        if repeated_means:
            same_query_accuracy = sum(repeated_means)/len(repeated_means)
        else:
            same_query_accuracy = 1.0
        results_data[model].append((correct, total, len(unique_queries), repeat_queries, same_query_accuracy))
    model2ks[model] = ks


def _mean_std(values):
    """Return (np.mean(values), np.std(values))."""
    return (np.mean(values), np.std(values))


def _per_model(table, pick):
    """Map each model in table to _mean_std of pick(row) over that model's rows."""
    return {key: _mean_std([pick(row) for row in rows]) for key, rows in table.items()}


print("total number of subgraphs: ", {key:len(value) for key, value in results.items()},
      "\nmean lower bounds per subgraph: ", _per_model(results, lambda v: v[0]),
      "\nmean upper bounds per subgraph: ", _per_model(results, lambda v: v[1]),
      '\nmean total questions per subgraph: ', _per_model(results_data, lambda v: v[1]),
      # The std here is taken over the correct counts (v[0]), matching the mean.
      "\nmean correct answers per subgraph: ", _per_model(results_data, lambda v: v[0]),
      "\nmean accuracy per subgraph: ", _per_model(results_data, lambda v: v[0]/v[1]),
      "\nmean unique queries per subgraph: ", _per_model(results_data, lambda v: v[2]),
      "\nmean repeat queries per subgraph: ", _per_model(results_data, lambda v: v[3]),
      "\nmean same query accuracy per subgraph: ", _per_model(results_data, lambda v: v[4]))

In [None]:
import pandas as pd
import random

# Flatten question_answers_models into one row per (certificate, model, query),
# then draw a random subsample of correctly and of incorrectly answered queries.

# Sampling parameters
SAMPLING_SEED = 0            # fixed seed so the drawn samples are reproducible
CORRECT_STRIDE = 10          # consider every 10th correctly answered row ...
CORRECT_SKIP_PROB = 0.6      # ... and drop each candidate with this probability
INCORRECT_STRIDE = 2         # consider every 2nd incorrectly answered row ...
INCORRECT_SKIP_PROB = 0.1    # ... and drop each candidate with this probability
sampling_rng = random.Random(SAMPLING_SEED)  # private RNG: leaves the global random state alone

rows = []
# Columns holding the per-model part of a row; they are the same for every row.
model_columns = ['model_answer', 'model_eval', 'model_id', 'correct_ids',
                 'answer_options', 'correct_ans_num', 'distractor', 'context']
for certi, models in question_answers_models.items():
    # Iterate over each model and their entries
    for model_id, entries in models.items():
        for entry in entries:
            path_id = entry['path_id']
            distractor = entry.get('distractor')
            rows.append({
                'path_id': path_id,
                'path_names': [id2name[x] for x in path_id],
                'certi': certi,
                # Query, correct answers and the options shown to the model (ids mapped to names)
                'query': entry['query'],
                'correct_answers': entry['correct_answers'],
                'answer_options': [id2name[opt] for opt in entry['answer_options']],
                'correct_ans_num': entry['correct_ans_num'],
                'context': entry['context'],
                # Model's answer and its evaluation
                'model_id': model_id,
                'correct_ids': entry['correct_ids'],
                'model_answer': entry['model_answer'],
                'model_eval': entry['eval'],
                'distractor': id2name[distractor] if distractor is not None else None,
            })

# Creating the DataFrame
df = pd.DataFrame(rows)
static_columns = ['certi', 'path_id', 'query', 'correct_answers', 'path_names']
ordered_columns = static_columns + sorted(model_columns)  # Sort or maintain order of model columns as needed
df = df[ordered_columns]
print(df.shape)

correct_df = df[df['model_eval'] == 1]
incorrect_df = df[df['model_eval'] == 0]

# Largest multiple of CORRECT_STRIDE that does not exceed the number of rows in df
max_index = len(df) - (len(df) % CORRECT_STRIDE)

# Candidate row positions for the correct sample
# NOTE(review): the bound comes from len(df) while the rows are taken from
# correct_df -- confirm that len(correct_df) was not the intended bound.
start_indices = np.arange(0, max_index, CORRECT_STRIDE)
print(max_index)


def _subsample(frame, positions, skip_probability):
    """Keep each candidate position that exists in frame with probability 1 - skip_probability.

    Positions past the end of frame are ignored. The result is always a
    DataFrame (possibly empty), so no concatenation of an empty list can fail.
    """
    kept = [
        int(pos) for pos in positions
        if pos < len(frame) and sampling_rng.random() >= skip_probability
    ]
    return frame.iloc[kept]


# One row per kept candidate position, then shuffle the rows
sampled_df = _subsample(correct_df, start_indices, CORRECT_SKIP_PROB)
sampled_df = sampled_df.sample(frac=1, random_state=SAMPLING_SEED).reset_index(drop=True)
print(sampled_df.shape)

start_indices = np.arange(0, len(incorrect_df), INCORRECT_STRIDE)
incorrect_sampled_df = _subsample(incorrect_df, start_indices, INCORRECT_SKIP_PROB)
incorrect_sampled_df = incorrect_sampled_df.sample(frac=1, random_state=SAMPLING_SEED).reset_index(drop=True)
print(incorrect_sampled_df.shape)
# Show the first rows of the correct sample
sampled_df.head(15)
