# Evaluation

## Preparations

In [1]:
import sys, os
import reload_recursive

# Put the project sources and the bundled externals at the front of the import
# path so the encoding functions and other modules can be imported from here.
for _extra_dir in ("../src", "../externals"):
    sys.path.insert(0, os.path.abspath(_extra_dir))

In [2]:
import torch 

# Select the torch device: CUDA when available, otherwise Apple's MPS backend,
# otherwise stay on the CPU and say so.
gpu = torch.device("cpu")
if torch.cuda.is_available():
    gpu = torch.device("cuda")
elif torch.backends.mps.is_available():
    gpu = torch.device("mps")
else:
    print("Warning: no GPU detected, falling back to CPU")

In [3]:
# Clean old model from cache
if "model" in locals() or "model" in globals():
    del model
torch.cuda.empty_cache()

%reload model.clip_model
from model.clip_model import CLIPModel
model = CLIPModel(gpu, model="openai/clip-vit-base-patch32")

In [4]:
# Clean old model from cache
if "object_detector" in locals() or "object_detector" in globals():
    del object_detector
torch.cuda.empty_cache()

%reload object_detection.owl_vit_object_detector
from object_detection.owl_vit_object_detector import OWLViTObjectDetector
object_detector = OWLViTObjectDetector(gpu)

## Dataset

In [5]:
import json 

# Load the sampled validation questions as a list of (question id, question)
# pairs. The encoding is given explicitly because JSON is UTF-8 while the
# default encoding of open() depends on the platform locale.
with open("../data/questions/val_sampled_questions_10000.json", encoding="utf-8") as f:
    questions = list(json.load(f).items())

## Evaluation

In [6]:
from clingo.control import Control 
%reload pipeline.encoding
from pipeline.encoding import encode_question, encode_scene, sanitize_asp
import time

def answer_is_correct(answers, correct_answer):
    """Return True if at least one predicted answer matches the ground truth.

    A prediction matches when it equals ``sanitize_asp(correct_answer)``, or
    when it is one of the long relation names listed below and the ground
    truth is the corresponding short form.

    Args:
        answers: iterable of predicted answer strings.
        correct_answer: ground-truth answer string.

    Returns:
        bool: whether any prediction counts as correct.
    """
    # (predicted relation name, ground-truth spelling) pairs treated as equal.
    relation_aliases = (
        ("to_the_right_of", "right"),
        ("to_the_left_of", "left"),
        ("in_front_of", "front"),
    )
    return any(
        candidate == sanitize_asp(correct_answer)
        or (candidate, correct_answer) in relation_aliases
        for candidate in answers
    )

def count_operators(question):
    """Count how often each operation occurs in a question's semantic program.

    The operation name is the text before the first space of every step's
    ``"operation"`` field. An operation that is not in the list below raises
    ``KeyError``.

    Args:
        question: dict whose ``"semantic"`` entry is a list of steps, each
            with an ``"operation"`` string.

    Returns:
        dict mapping ``"op_<name>"`` to its count, with one entry for every
        listed operation (zero when it does not occur).
    """
    known_operations = (
        "select", "query", "filter", "relate", "verify", "choose",
        "exist", "or", "different", "and", "same", "common",
    )
    tally = dict.fromkeys((f"op_{name}" for name in known_operations), 0)
    for step in question["semantic"]:
        head, _, _ = step["operation"].partition(" ")
        tally[f"op_{head}"] += 1
    return tally

def is_scene_question(question):
    """Return True if the first semantic step is a ``select`` on ``"scene"``."""
    first_step = question["semantic"][0]
    if first_step["operation"] != "select":
        return False
    return first_step["argument"] == "scene"

def evaluate_question(question, asp_theory, timeout_sec=10.0):
    """Answer one question with the ASP pipeline and return a flat result record.

    Scene questions (see ``is_scene_question``) are not evaluated and are
    returned as skipped. For every other question the scene and the question
    are encoded, the encodings are written to
    ``../data/encoded_questions/<qid>.lp``, and the program is grounded and
    solved with clingo.

    Args:
        question: question dict; reads ``"qid"``, ``"semanticStr"``,
            ``"imageId"``, ``"answer"`` and ``"semantic"``.
        asp_theory: ASP program text added to the solver before the encodings.
        timeout_sec: seconds to wait for the solver before giving up.

    Returns:
        dict with the question metadata, the ``op_*`` operator counts and the
        keys ``skipped``, ``timeout``, ``runtime_sec``, ``model_response`` and
        ``correct``. ``model_response`` is the list of answers taken from the
        last model found, ``"UNSAT"`` if no answers were produced, or ``None``
        for skipped questions. A timeout without any answers is also reported
        as ``"UNSAT"``; use the ``timeout`` flag to tell the two apart.
    """
    op_counts = count_operators(question)
    result = {
        "question_id": question["qid"],
        "semantic_str": question["semanticStr"],
        "image_id": question["imageId"],
        "answer": question["answer"],
        **op_counts
    }

    if is_scene_question(question):
        return {**result, "skipped": True, "model_response": None, "correct": False, "timeout": False, "runtime_sec": 0.0}

    result["skipped"] = False
    start = time.time()
    ctl = Control()
    ctl.add(asp_theory)

    scene_encoding = encode_scene(question, model, object_detector)
    question_encoding = encode_question(question)

    # Name the dump after the question's own id rather than a notebook-global
    # `qid`, and make sure the target directory exists.
    encoding_dir = "../data/encoded_questions"
    os.makedirs(encoding_dir, exist_ok=True)
    with open(f"{encoding_dir}/{question['qid']}.lp", "w") as f:
        f.write("% ------ scene encoding ------\n")
        f.write(scene_encoding)
        f.write("\n% ------ question encoding ------\n")
        f.write(question_encoding)

    ctl.add(scene_encoding)
    ctl.add(question_encoding)

    # Filled in by the solver callback; only the most recent model is kept.
    latest_answers = []

    def on_model(answer_set):
        latest_answers[:] = [s.arguments[0].name for s in answer_set.symbols(shown=True)]

    ctl.ground()
    handle = ctl.solve(on_model=on_model, async_=True)
    has_finished = handle.wait(timeout=timeout_sec)
    end = time.time()
    if not has_finished:
        # Stop the background search; otherwise it keeps running (and may keep
        # calling on_model) after this function has returned.
        handle.cancel()
    result["timeout"] = not has_finished
    result["runtime_sec"] = end - start

    if len(latest_answers) > 0:
        return {**result, "model_response": latest_answers, "correct": answer_is_correct(latest_answers, question["answer"])}
    return {**result, "model_response": "UNSAT", "correct": False}

In [7]:
from itertools import islice
from tqdm.notebook import tqdm 
import pandas as pd
import os
import json

# Background theory that is added to every question's ASP program.
with open("../src/pipeline/encoding/theory.lp") as theory_file:
    theory = theory_file.read()

# Run configuration.
num_questions = 1000   # evaluate the first N questions of the dataset
results = []           # one record per evaluated question
history = []           # one summary entry per written snapshot
last_step = -1         # index of the last question covered by a snapshot
snapshot_steps = 200   # write a snapshot every N questions
snapshot_dir = "evaluations/val_1000"

# makedirs also creates the missing parent directory of the nested path and is
# a no-op when the directory already exists.
os.makedirs(snapshot_dir, exist_ok=True)

# Resume from an earlier run if it left a snapshot behind.
if os.path.isfile(f"{snapshot_dir}/history.json"):
    with open(f"{snapshot_dir}/history.json") as f:
        history = json.load(f)
    if history:
        # History entries store the 1-based step, the loop index is 0-based.
        last_step = history[-1]["step"] - 1
        # NOTE: unpickling executes code from the file; only load snapshots
        # that were written by this notebook.
        results_pd = pd.read_pickle(f"{snapshot_dir}/results_snapshot.pkl")
        results = results_pd.to_dict("records")

progress_bar = tqdm(total=num_questions, desc="Questions")

def report_results(i):
    """Write a snapshot of the results collected so far and log a summary.

    Pickles ``results`` to ``results_snapshot.pkl``, appends a summary entry
    to ``history``, rewrites ``history.json`` and prints the summary through
    the progress bar. Does nothing when no results have been collected yet.

    Args:
        i: 0-based index of the last processed question; it is stored and
            printed as the 1-based step ``i + 1``.
    """
    if not results:
        # Nothing to summarise; an empty frame has no result columns.
        return

    results_pd = pd.DataFrame(results)
    results_pd.to_pickle(f"{snapshot_dir}/results_snapshot.pkl")
    num_skipped = results_pd[results_pd["skipped"]].shape[0]
    num_correct = results_pd[results_pd["correct"]].shape[0]
    num_incorrect = results_pd[~results_pd["skipped"] & ~results_pd["correct"]].shape[0]
    num_unsat = results_pd[results_pd["model_response"] == "UNSAT"].shape[0]

    # Skipped questions are excluded from the accuracy; if every question so
    # far was skipped there is nothing to divide by, so report 0%.
    num_answered = num_incorrect + num_correct
    correct_percentage = num_correct / num_answered * 100 if num_answered else 0.0

    history.append({"step": i+1, "correct": num_correct, "incorrect": num_incorrect, "unsat": num_unsat, "skipped": num_skipped, "correct_percentage": correct_percentage})
    with open(f"{snapshot_dir}/history.json", 'w') as f:
        json.dump(history, f, indent=4)

    progress_bar.write(f"Step {i+1:7d}: Corr {num_correct:7d}, Incorr {num_incorrect:7d}, UNSAT {num_unsat:7d}, Skip {num_skipped:5d}, Corr %: {correct_percentage:.4f}%")

# Evaluate the selected questions, writing a snapshot every `snapshot_steps`
# questions. Questions already covered by a loaded snapshot are skipped.
unreported_results = False  # True while results newer than the last snapshot exist
for i, (qid, question) in enumerate(islice(questions, 0, num_questions)):
    if i <= last_step:
        progress_bar.update(1)
        continue

    question["qid"] = qid
    results.append(evaluate_question(question, theory))

    if i % snapshot_steps == (snapshot_steps - 1):
        report_results(i)
        unreported_results = False
    else:
        unreported_results = True
    progress_bar.update(1)

# Final snapshot only if the run did not end exactly on a snapshot boundary.
# Reporting unconditionally would duplicate the last history entry, append
# another duplicate on every re-run of a finished evaluation, and fail on an
# undefined `i` when there are no questions at all.
if unreported_results:
    report_results(i)
progress_bar.close()

Questions:   0%|          | 0/1000 [00:00<?, ?it/s]

Step     200: Corr      93, Incorr      98, UNSAT       1, Skip     9, Corr %: 48.6911%
Step     400: Corr     179, Incorr     208, UNSAT       1, Skip    13, Corr %: 46.2532%
Step     600: Corr     275, Incorr     304, UNSAT       4, Skip    21, Corr %: 47.4957%
Step     800: Corr     367, Incorr     402, UNSAT       4, Skip    31, Corr %: 47.7243%
Step    1000: Corr     458, Incorr     503, UNSAT       5, Skip    39, Corr %: 47.6587%
Step    1000: Corr     458, Incorr     503, UNSAT       5, Skip    39, Corr %: 47.6587%
