In [None]:
import os
import pickle
import numpy as np
from statsmodels.stats.proportion import proportion_confint
import copy
import shutil
import json
from utils import *

In [None]:
# Map every model name (a sub-folder of RESULTS_DIR) to the directory that
# holds its certificate files.
RESULTS_DIR = 'final_results'
model2dir = {}
# sorted() makes the processing order reproducible; os.listdir returns entries
# in arbitrary, platform-dependent order.
for folder in sorted(os.listdir(RESULTS_DIR)):
    folder_path = os.path.join(RESULTS_DIR, folder)
    # Stray files directly under RESULTS_DIR are not model folders; calling
    # os.listdir on them would raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        continue
    model2dir[folder] = folder_path
    # Only .pkl files are loaded as certificates by the certification cell,
    # so only those are counted here.
    num_certificates = sum(1 for name in os.listdir(folder_path) if name.endswith('.pkl'))
    print(f"{folder} has {num_certificates} certificates")
len(model2dir)

In [None]:
# Drop every model that has fewer than MIN_CERTIFICATES certificate files.
MIN_CERTIFICATES = 50
remove_models = []
for model, model_dir in model2dir.items():
    # Count only .pkl files: the certification cell skips every other entry,
    # so other files must not help a model reach the threshold.
    num_certificates = sum(1 for name in os.listdir(model_dir) if name.endswith('.pkl'))
    if num_certificates < MIN_CERTIFICATES:
        print(model)
        remove_models.append(model)

# Delete in a second pass so model2dir is not mutated while it is iterated.
for model in remove_models:
    del model2dir[model]

In [None]:
#Certifications
# For every model, load each pickled certificate, count the correct answers
# among its first NUM_SAMPLES results and compute a 95% Clopper-Pearson
# (method='beta') confidence interval on the proportion of correct answers.
# model2certificates[model][cert_file] = (correct, total, lower, upper, accuracy)

model2certificates = {}
NUM_SAMPLES = 250
for model, model_dir in model2dir.items():
    model2certificates[model] = {}
    # sorted() keeps the certificate order reproducible across platforms.
    for cert in sorted(os.listdir(model_dir)):
        if not cert.endswith('.pkl'):
            continue
        cert_path = os.path.join(model_dir, cert)
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # certificate files that come from a trusted source.
        with open(cert_path, 'rb') as cert_file:
            experiment_results = pickle.load(cert_file)
        # Some files store (detailed_results, correct answers num, total queries num).
        # The extra isinstance check keeps a plain list of exactly three
        # result dicts from being mistaken for that triple.
        if len(experiment_results) == 3 and not isinstance(experiment_results[0], dict):
            experiment_results = experiment_results[0]
        correct = 0
        total = 0
        for result in experiment_results:
            if total >= NUM_SAMPLES:
                break
            correct += result['result'][0]
            total += 1
        if total == 0:
            # An empty certificate has no accuracy (correct/total would raise
            # ZeroDivisionError), so it is reported and left out.
            print(f"Skipping empty certificate: {cert_path}")
            continue
        (lower, upper) = proportion_confint(correct, total, alpha=0.05, method='beta')
        model2certificates[model][cert] = (correct, total, lower, upper, correct/total)

#Print summary
# For every model, aggregate the per-certificate lower bound, upper bound and
# accuracy into their mean and standard deviation.
# model_result_summary[model] =
#     (mean_lower, mean_upper, mean_acc, std_lower, std_upper, std_acc)

model_result_summary = {}
for model, certificates in model2certificates.items():
    print(f"Model: {model}")
    if not certificates:
        # np.mean of an empty list is NaN, which json.dump would write as the
        # non-standard token NaN; skip models without any certificate.
        print(" No certificates found, skipping")
        continue
    certificates_lower = [cert[2] for cert in certificates.values()]
    certificates_upper = [cert[3] for cert in certificates.values()]
    certificates_acc = [cert[4] for cert in certificates.values()]

    # Compute each statistic once and convert to plain floats so the summary
    # is JSON-serialisable regardless of the numpy scalar type.
    mean_lower = float(np.mean(certificates_lower))
    mean_upper = float(np.mean(certificates_upper))
    mean_acc = float(np.mean(certificates_acc))
    std_lower = float(np.std(certificates_lower))
    std_upper = float(np.std(certificates_upper))
    std_acc = float(np.std(certificates_acc))

    print(f" Mean lower bound: {mean_lower}, Mean upper bound: {mean_upper}, Mean accuracy: {mean_acc}")
    print(f"Std lower bound: {std_lower}, Std upper bound: {std_upper}, Sd accuracy: {std_acc}")

    model_result_summary[model] = (mean_lower, mean_upper, mean_acc, std_lower, std_upper, std_acc)

#Save summary
SAVE_FILE = 'model_summary.json'
# The context manager guarantees the file is flushed and closed.
with open(SAVE_FILE, 'w') as summary_file:
    json.dump(model_result_summary, summary_file, indent=4)

Uncomment the cells below to build the structures needed for the more detailed analysis that follows.

In [None]:
#We may need this for more detailed analysis
#qa_graph = json.load(open('wikidata_graphs/wikidata_util.json'))
# context_graph_edge = json.load(open('wikidata_graphs/wikidata_text_edge.json'))
# graph_text_sentencized = json.load(open('wikidata_graphs/wikidata_sentencized.json'))
#id2name = json.load(open('wikidata_graphs/wikidata_name_id.json'))
# entity_aliases = load_aliases('wikidata5m_entity.txt')
# relation_aliases = load_aliases('wikidata5m_relation.txt')

In [None]:
# model_files = {k:[] for k in model2dir.keys()}
# results = {k:[] for k in model2dir.keys()}
# results_data = {k:[] for k in model2dir.keys()}
# all_entities = []
# results_ids = {k:{} for k in model2dir.keys()}
# question_answers_models = {}
# model2ks = {}
# NUM_SAMPLES = 250
# for model, dir in model2dir.items():
#     ks = set()
#     for file in os.listdir(dir):
#         try:
#             experiment_results = pickle.load(open(os.path.join(dir, file), 'rb'))
#         except:
#             print(os.path.join(dir, file))
#             continue
#         correct = 0
#         total = 0
#         idx = file.index('Q')
#         all_entities.append(file[idx:-4])
#         model_files[model].append(file[idx:-4])
#         unique_queries = {}
#         repeat_queries = 0
#         unique_paths = set()
#         if len(experiment_results) < 10:
#             experiment_results = experiment_results[0]
#         for i, result in enumerate(experiment_results):
#             if i >= NUM_SAMPLES:
#                 break
#             if type(result) == str:
#                 print(result)
#                 print(os.path.join(dir, file))
#                 raise ValueError('Error in file: ', file)
#             if result['result'][0] == 1:
#                 correct += 1
#             if result['question'] not in unique_queries:
#                 unique_queries[result['question']] = []
#             path = tuple(result['path_id'])
#             ks.add(len(path)-1)
#             unique_paths.add(path)
#             if path not in question_answers_models:
#                 question_answers_models[path]= {}
            
#             if model not in question_answers_models[path]:
#                 question_answers_models[path][model] = []
#             if 'distractor' in result:
#                 question_answers_models[path][model].append({'query': result['question'], 'eval': result['result'][0], 
#                                                          'context': result['context'], 'model_answer':result['model_answer'], 
#                                                          'correct_answers':result['correct_answers'], 'correct_ids':result['correct_ids'],
#                                                          'answer_options':result['options'], 'correct_ans_num':result['correct_ans_num'], 
#                                                          'distractor':result['distractor']})
#             else:
#                 question_answers_models[path][model].append({'query': result['question'], 'eval': result['result'][0], 
#                                                          'context': result['context'], 'model_answer':result['model_answer'], 
#                                                          'correct_answers':result['correct_answers'], 'correct_ids':result['correct_ids'],
#                                                          'answer_options':result['options'], 'correct_ans_num':result['correct_ans_num']})
#             unique_queries[result['question']].append(result['result'][0])
#             total += 1
#         if total > NUM_SAMPLES:
#             raise ValueError('Error in file total: ', file, total)
#         results_ids[model][file[idx:-4]] = (correct, total, correct/total)
#         results[model].append(proportion_confint(correct, total, method='beta'))
#         repeat_queries = total - len(unique_queries)
#         same_query_accuracy = 0
#         count_same = 0
#         for query in unique_queries:
#             if len(unique_queries[query]) > 1:
#                 if np.mean(unique_queries[query]) > 0:
#                     same_query_accuracy += np.mean(unique_queries[query])
#                     count_same += 1
#         if count_same == 0:
#             same_query_accuracy = 1.0
#         else:
#             same_query_accuracy = same_query_accuracy/count_same
#         results_data[model].append((correct, total, len(unique_queries), repeat_queries, same_query_accuracy))
#     model2ks[model] = ks
# print("total number of subgraphs: ", {key:len(value)for key, value in results.items()}, 
#       "\nmean lower bounds per subgraph: ", {key:(np.mean([v[0] for v in value]), np.std([v[0] for v in value])) for key, value in results.items()}, 
#       "\nmean upper bounds per subgraph: ", {key:(np.mean([v[1] for v in value]), np.std([v[1] for v in value])) for key, value in results.items()},
#       '\nmean total questions per subgraph: ', {key:(np.mean([v[1] for v in value]), np.std([v[1] for v in value])) for key, value in results_data.items()}, 
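# # NOTE(review): the std below is taken over v[1] (total) while the mean uses v[0] (correct) — confirm whether v[0] was intended.
#       "\nmean correct answers per subgraph: ", {key:(np.mean([v[0] for v in results_data[key]]), np.std([v[1] for v in results_data[key]])) for key in results_data.keys()},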
#       "\nmean accuracy per subgraph: ", {key:(np.mean([v[0]/v[1] for v in value]), np.std([v[0]/v[1] for v in value])) for key, value in results_data.items()},
#       "\nmean unique queries per subgraph: ", {key:(np.mean([v[2] for v in value]), np.std([v[2] for v in value])) for key, value in results_data.items()},
#       "\nmean repeat queries per subgraph: ", {key:(np.mean([v[3] for v in value]), np.std([v[3] for v in value])) for key, value in results_data.items()},
#       "\nmean same query accuracy per subgraph: ", {key:(np.mean([v[4] for v in value]), np.std([v[4] for v in value])) for key, value in results_data.items()})

Get answer samples

In [None]:
# import pandas as pd
# import random

# # Assuming 'question_answers_models' is defined and contains your data
# rows = []
# model_columns = []
# for path_id, models in question_answers_models.items():
#     # Iterate over each model and their entries
#     for model_id, entries in models.items():
#         # Process each entry for the current model
#         entry = random.choice(entries)
#         path_names = [id2name[x] for x in path_id]
#         row = {'path_id': path_id, 'path_names':path_names}
#         # Include query, context, and correct answers
#         sampled_values = {'query': entry['query'], 'correct_answers': entry['correct_answers'], 'answer_options':entry['answer_options'], 
#                           'correct_ans_num':entry['correct_ans_num'], 'context': entry['context']}
#         row.update(sampled_values)

#         # Include model's answer and its evaluation
#         model_answer_col = 'model_answer'
#         model_eval_col = 'model_eval'
#         row['model_id'] = model_id
#         row['correct_ids'] = entry['correct_ids']
#         row[model_answer_col] = entry['model_answer']
#         row[model_eval_col] = entry['eval']
#         row['answer_options'] = [id2name[opt] for opt in entry['answer_options']]
#         row['correct_ans_num'] = entry['correct_ans_num']
#         row['context'] = entry['context']
#         if 'distractor' in entry and entry['distractor'] is not None:
#             row['distractor'] = id2name[entry['distractor']]
#         else:
#             row['distractor'] = None

#         if model_answer_col not in model_columns:
#             model_columns.append(model_answer_col)
#         if model_eval_col not in model_columns:
#             model_columns.append(model_eval_col)
#         if 'model_id' not in model_columns:
#             model_columns.append('model_id')
#         if 'correct_ids' not in model_columns:
#             model_columns.append('correct_ids')
#         if 'answer_options' not in model_columns:
#             model_columns.append('answer_options')
#         if 'correct_ans_num' not in model_columns:
#             model_columns.append('correct_ans_num')
#         if 'distractor' not in model_columns:
#             model_columns.append('distractor')
#         if 'context' not in model_columns:
#             model_columns.append('context')

#         # Add the filled row to the rows list
#         rows.append(row)

# # Creating the DataFrame
# df = pd.DataFrame(rows)
# static_columns = ['path_id', 'query', 'correct_answers', 'path_names']
# ordered_columns = static_columns + sorted(model_columns)  # Sort or maintain order of model columns as needed
# df = df[ordered_columns]
# print(df.shape)

# # Assuming 'df' is your DataFrame

# # Calculate the maximum index to ensure we don't go out of bounds
# max_index = len(df) - (len(df) % 121)

# # Generate a list of start indices
# start_indices = np.arange(0, max_index, 121)

# # List to hold the groups
# grouped_rows = []

# # Loop through each start index and take the 11 consecutive rows starting there
# for start in start_indices:
#     if random.random() < 0.6:
#         continue
#     group = df.iloc[start:start +11]  # Select the 11 rows from 'start' to 'start+10'
#     grouped_rows.append(group)

# # Concatenate all the groups into a new DataFrame
# sampled_df = pd.concat(grouped_rows)

# print(sampled_df.shape)
# # Print the resulting DataFrame
# sampled_df.head(15)
