# Evaluation

## Preparations

In [1]:
import sys, os
import reload_recursive

# sys.path hack: make the modules under ../src and ../externals importable from
# this notebook. Every directory is pushed to the front of sys.path, so the one
# listed last ends up being searched first.
for module_dir in ("../src", "../externals"):
    sys.path.insert(0, os.path.abspath(module_dir))

In [2]:
import torch 

# Pick the compute device: CUDA first, then the MPS backend, otherwise warn and
# stay on the CPU. The result is kept in `gpu` even when it is the CPU.
gpu = torch.device("cpu")
if torch.cuda.is_available():
    gpu = torch.device("cuda")
elif torch.backends.mps.is_available():
    gpu = torch.device("mps")
else:
    print("Warning: no GPU detected, falling back to CPU")

In [3]:
# Clean old model from cache
if "model" in locals() or "model" in globals():
    del model
torch.cuda.empty_cache()

%reload model.clip_model
from model.clip_model import CLIPModel
model = CLIPModel(gpu)

In [4]:
# Clean old model from cache
if "object_detector" in locals() or "object_detector" in globals():
    del object_detector
torch.cuda.empty_cache()

%reload object_detection.owl_vit_object_detector
from object_detection.owl_vit_object_detector import OWLViTObjectDetector
object_detector = OWLViTObjectDetector(gpu)

## Dataset

In [20]:
import json 

# Read the sampled validation questions and keep them as a list of
# (question id, question) pairs in file order.
with open("../data/questions/val_sampled_questions_10000.json") as questions_file:
    questions = [
        (question_id, entry)
        for question_id, entry in json.load(questions_file).items()
    ]

## Evaluation

In [21]:
from clingo.control import Control 
%reload pipeline.encoding
from pipeline.encoding import encode_question, encode_scene, sanitize_asp

def answer_is_correct(answers, correct_answer):
    """Report whether any of the given answers matches the expected answer.

    An answer matches when it equals ``sanitize_asp(correct_answer)``, or when
    it is one of the relation atoms listed below and the expected answer is the
    corresponding short form.

    Args:
        answers: Iterable of answer strings to check.
        correct_answer: The expected answer of the question.

    Returns:
        True if at least one answer matches, otherwise False (also for an
        empty ``answers``).
    """
    # (relation atom, short expected answer) pairs that count as equivalent.
    relation_aliases = (
        ("to_the_right_of", "right"),
        ("to_the_left_of", "left"),
        ("in_front_of", "front"),
    )

    def matches(candidate):
        if candidate == sanitize_asp(correct_answer):
            return True
        return any(
            candidate == atom and correct_answer == short_form
            for atom, short_form in relation_aliases
        )

    return any(matches(candidate) for candidate in answers)

def count_operators(question):
    """Count how often each operator occurs in a question's semantic program.

    The operator of a step is the text of its ``"operation"`` entry up to the
    first space.

    Args:
        question: Mapping whose ``"semantic"`` entry is a sequence of steps,
            each a mapping with an ``"operation"`` string.

    Returns:
        A dict with one ``"op_<operator>"`` key per known operator (zero when
        the operator does not occur), in the order the operators are listed.

    Raises:
        KeyError: If a step uses an operator that is not in the known list.
    """
    known_operators = (
        "select", "query", "filter", "relate", "verify", "choose",
        "exist", "or", "different", "and", "same", "common",
    )
    tally = dict.fromkeys((f"op_{name}" for name in known_operators), 0)
    for step in question["semantic"]:
        head, _, _ = step["operation"].partition(" ")
        tally[f"op_{head}"] += 1
    return tally

def is_scene_question(question):
    """Tell whether the question's first semantic step is ``select`` on ``scene``.

    Args:
        question: Mapping whose ``"semantic"`` entry is a non-empty sequence of
            steps; the first step provides ``"operation"`` and ``"argument"``.

    Returns:
        True if the first step's operation is exactly ``"select"`` and its
        argument is exactly ``"scene"``, otherwise False.
    """
    first_step = question["semantic"][0]
    if first_step["operation"] != "select":
        return False
    return first_step["argument"] == "scene"

def evaluate_question(question, asp_theory): 
    """Answer one question with the ASP pipeline and record the outcome.

    Questions for which ``is_scene_question`` is true are skipped. For all
    others the scene and the question are encoded, the combined program is
    written to ``../data/encoded_questions/<qid>.lp``, and clingo gets up to
    10 seconds to solve it.

    Args:
        question: Question mapping providing ``"qid"``, ``"semanticStr"``,
            ``"imageId"``, ``"answer"`` and ``"semantic"``.
        asp_theory: ASP program text that is added to the solver before the
            scene and question encodings.

    Returns:
        A dict with the question metadata, the ``op_*`` operator counts and the
        keys ``"skipped"``, ``"timeout"``, ``"model_response"`` (the answers of
        the last model reported by the solver, ``"UNSAT"`` if no non-empty
        answer was reported, ``None`` if skipped) and ``"correct"``.
    """
    result = {
        "question_id": question["qid"], 
        "semantic_str": question["semanticStr"], 
        "image_id": question["imageId"],
        "answer": question["answer"],
        **count_operators(question),
    }

    if is_scene_question(question):
        return {**result, "skipped": True, "model_response": None, "correct": False, "timeout": False}

    result["skipped"] = False
    ctl = Control()
    ctl.add(asp_theory)

    scene_encoding = encode_scene(question, model, object_detector)
    question_encoding = encode_question(question)

    # Take the id from the question itself; the function must not depend on a
    # global `qid` that happens to exist in the notebook namespace.
    with open(f"../data/encoded_questions/{question['qid']}.lp", "w") as f:
        f.write("% ------ scene encoding ------\n")
        f.write(scene_encoding)
        f.write("\n% ------ question encoding ------\n")
        f.write(question_encoding)

    ctl.add(scene_encoding)
    ctl.add(question_encoding)

    # Holds the shown atoms of the most recent model; every new model replaces
    # the previous content.
    answers = []
    def on_model(asp_model):
        answers[:] = [s.arguments[0].name for s in asp_model.symbols(shown=True)]

    ctl.ground()
    # Solve asynchronously so the search can be bounded in time. The handle is
    # closed on leaving the block, and a search that is still running after the
    # timeout is cancelled so it neither keeps consuming CPU nor calls
    # on_model after this function has returned.
    with ctl.solve(on_model=on_model, async_=True) as handle:
        has_finished = handle.wait(timeout=10.0)
        if not has_finished:
            handle.cancel()
    result["timeout"] = not has_finished

    if len(answers) > 0:
        return {**result, "model_response": answers, "correct": answer_is_correct(answers, question["answer"])}
    return {**result, "model_response": "UNSAT", "correct": False}

In [22]:
from itertools import islice
from tqdm.notebook import tqdm 
import pandas as pd
import os
import json

# Load the ASP theory that is passed to evaluate_question for every question.
with open("../src/pipeline/encoding/theory.lp") as theory_file:
    theory = theory_file.read()

num_questions = 1000
results = []
history = []
snapshot_steps = 10
snapshot_dir = "evaluations/val_1000_2"

# makedirs also creates missing parent directories and does not fail when the
# directory already exists.
os.makedirs(snapshot_dir, exist_ok=True)

# Materialise the selection first so the progress bar total is correct even
# when fewer than num_questions questions are available.
selected_questions = list(islice(questions, 0, num_questions))

# The context manager closes the progress bar even if the run is interrupted.
with tqdm(total=len(selected_questions), desc="Questions") as progress_bar:
    for i, (qid, question) in enumerate(selected_questions):
        question["qid"] = qid
        results.append(evaluate_question(question, theory))

        # Snapshot every snapshot_steps questions, and once more after the last
        # question so a trailing partial chunk is not lost.
        is_last = (i + 1) == len(selected_questions)
        if (i + 1) % snapshot_steps == 0 or is_last:
            results_pd = pd.DataFrame(results)
            results_pd.to_pickle(f"{snapshot_dir}/results_snapshot.pkl")

            skipped = results_pd["skipped"]
            correct = results_pd["correct"]
            num_skipped = int(skipped.sum())
            num_correct = int(correct.sum())
            num_incorrect = int((~skipped & ~correct).sum())
            num_unsat = int((results_pd["model_response"] == "UNSAT").sum())

            # Accuracy over the answered (non-skipped) questions; reported as 0
            # while every question so far has been skipped instead of dividing
            # by zero.
            num_answered = num_incorrect + num_correct
            correct_percentage = num_correct / num_answered * 100 if num_answered else 0.0

            history.append({"step": i+1, "correct": num_correct, "incorrect": num_incorrect, "unsat": num_unsat, "skipped": num_skipped, "correct_percentage": correct_percentage})
            with open(f"{snapshot_dir}/history.json", 'w') as f:
                json.dump(history, f, indent=4)

            progress_bar.write(f"Step {i+1:7d}: Corr {num_correct:7d}, Incorr {num_incorrect:7d}, UNSAT {num_unsat:7d}, Skip {num_skipped:5d}, Corr %: {correct_percentage:.4f}%")

        progress_bar.update(1)

Questions:   0%|          | 0/1000 [00:00<?, ?it/s]

Step      10: Corr       7, Incorr       3, UNSAT       0, Skip     0, Corr %: 70.0000%
Step      20: Corr      11, Incorr       8, UNSAT       1, Skip     1, Corr %: 57.8947%
Step      30: Corr      14, Incorr      14, UNSAT       2, Skip     2, Corr %: 50.0000%
Step      40: Corr      18, Incorr      19, UNSAT       3, Skip     3, Corr %: 48.6486%
Step      50: Corr      20, Incorr      27, UNSAT       3, Skip     3, Corr %: 42.5532%
Step      60: Corr      24, Incorr      33, UNSAT       3, Skip     3, Corr %: 42.1053%
Step      70: Corr      30, Incorr      37, UNSAT       3, Skip     3, Corr %: 44.7761%
Step      80: Corr      35, Incorr      42, UNSAT       3, Skip     3, Corr %: 45.4545%
Step      90: Corr      41, Incorr      46, UNSAT       3, Skip     3, Corr %: 47.1264%
Step     100: Corr      43, Incorr      54, UNSAT       3, Skip     3, Corr %: 44.3299%
Step     110: Corr      45, Incorr      59, UNSAT       4, Skip     6, Corr %: 43.2692%
Step     120: Corr      49, Inco

KeyboardInterrupt: 