# Extract Information

In [25]:
import json
from pathlib import Path
import sys
import re
import ast
root_path = Path('.').resolve().parents[1]
dataset_path = root_path / 'xmode-ver-01.json'


In [19]:
with open(dataset_path, 'r') as f:
    data = json.load(f)
    
def parse_source(source: str):
    """input
    Image analysis of last study (image_id: fc252cec-cb2bd316-a6df094b-dfd88ffa-b9315b50) and previous study (image_id: 13ba579e-736450c7-f8722c82-7cb94950-f21fe768)
    Output:
    {
        "image_id": ["fc252cec-cb2bd316-a6df094b-dfd88ffa-b9315b50", "13ba579e-736450c7-f8722c82-7cb94950-f21fe768"]
    }
    """
    image_id_pattern = re.compile(r"([a-f0-9-]{44})")
    image_id_res = image_id_pattern.findall(source)
    # pattern for getting  8-digits number
    study_id_pattern = re.compile(r"([0-9]{8})")
    
    study_id_res = study_id_pattern.findall(source)
    res = {
        "image_id": image_id_res,
        "study_id": study_id_res,
        "text": source
    }
    return res

def extract_prediction(data: list):
    # only get the first prediction
    # Check if it has the following keys
    if len(data) == 0:
        return {
            "pred_summary": None,
            "pred_details": None,
            "pred_source": None,
            "predict_answer": None,
            "pred_explanation": None
        }
    data = data[0]
    keys = {"Summary":"pred_summary", "details": "pred_details", "source": "pred_source", "final answer": "predict_answer", "extra explanation": "pred_explanation"}
    res = {}
    for key in keys:
        if key == 'source':
            temp = parse_source(data.get(key, None))
        temp = data.get(key, None)
        res[keys[key]] = temp
    return res
        
res = []
for i, item in enumerate(data):
    if item.get('prediction', None):
        temp = {
            "db_id": item['db_id'],
            "question": item['question'],
            "answer": item['answer'],
        }
    temp.update(extract_prediction(item['prediction']))
    res.append(temp)

# eval

In [21]:
import pandas as pd
# extract the prediction 
pd.DataFrame(res).to_csv('eval_xmode-ver-01.csv', index=False)

In [26]:
# if the results contains only No => 0, Yes => 1, 
# if the reults contains neither No nor Yes, but detected => 1
# if the reulsts contains both No and Yes => -1
# otherwise => -1
def get_binary_label(x, y):
    y = y[0]
    res = -1
    not_list = ['no', 'not']
    yes_list = ['yes', 'detected']
    if x == None:
        return res == y
    for no in not_list:
        if no in x.lower():
            res = 0
            break
    for yes in yes_list:
        if yes in x.lower():
            if res == 0:
                res = -1
                return res == y
            else:
                res = 1
                break
    return res == y

def get_list_results(x, y):
    if len(x) != len(y):
        return False
    # check if all elements in x is in y
    for i in x:
        if i not in y:
            return False
    # check if all elements in y is in x
    for i in y:
        if i not in x:
            return False
    return True

def parse_str(x):
    try:
        return ast.literal_eval(x)
    except:
        return x

def get_results(x, y):
    x = parse_str(x)
    y = parse_str(y)
    if len(y) == 1 and isinstance(y[0], int) and isinstance(x, str):
        return get_binary_label(x, y)
    if isinstance(x, list) and isinstance(y, list):
        return get_list_results(x, y)
    else:
        return False


In [27]:
data = pd.read_csv('eval_xmode-ver-01.csv', index_col=False)
labels = data.apply(lambda x: get_results(x['predict_answer'], x['answer']), axis=1)

In [29]:
data['labels'] = labels

In [31]:
# calculate the accuracy
accuracy = labels.sum() / len(labels)

In [32]:
accuracy

0.36666666666666664

In [30]:
data.to_csv('eval_xmode-ver-01.csv', index=False)