In [19]:
import os
import json
import pandas as pd

def extract_verification_value(file_path):
    """Read one verification JSON file and return its verdict and evidence counts.

    The verdict is found by a case-insensitive substring search of the
    ``"response"`` field. Keywords are tried in a fixed order -- ``TRUE``,
    then ``FALSE``, then ``OUT-OF-CONTEXT`` -- and the first one found wins,
    so a response that mentions several keywords resolves to the earliest
    keyword in that order.

    Args:
        file_path: Path to a JSON file whose top-level value is an object.

    Returns:
        A ``(verdict, num_text_evidence, num_image_evidence)`` tuple.
        ``verdict`` is ``"TRUE"``, ``"FALSE"``, ``"OUT-OF-CONTEXT"``, or
        ``None`` when no keyword is present (this includes a missing, null
        or non-string ``"response"``). The two counts are read from the
        ``"num text evidence"`` and ``"num image evidence"`` keys and
        default to 0 when those keys are absent.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)

    # A key that is present but null (or otherwise not text) used to crash on
    # `.upper()`; treat it the same way as a missing response instead.
    raw_response = data.get("response", "")
    response = raw_response.upper() if isinstance(raw_response, str) else ""

    num_text_evidence = data.get("num text evidence", 0)
    num_image_evidence = data.get("num image evidence", 0)

    # Order matters: it reproduces the precedence of the original if/elif chain.
    for keyword in ("TRUE", "FALSE", "OUT-OF-CONTEXT"):
        if keyword in response:
            return keyword, num_text_evidence, num_image_evidence

    return None, num_text_evidence, num_image_evidence

# Each all-digit sub-directory of the working directory is one sample; the
# directory name becomes the integer `folder` id of its row.
current_dir = os.getcwd()

verification_results = []

for folder in os.listdir(current_dir):
    folder_path = os.path.join(current_dir, folder)
    # Ignore plain files and directories whose name is not purely numeric.
    if not (os.path.isdir(folder_path) and folder.isdigit()):
        continue

    result_row = {"folder": int(folder)}
    for file_name in ("verification.json", "verification_allevi.json", "verification_noevi.json"):
        file_path = os.path.join(folder_path, file_name)

        if not os.path.exists(file_path):
            # No result file for this variant: recorded as "TRUE" with no evidence.
            result_row.update({
                file_name: "TRUE",
                f"{file_name}_num_text_evidence": 0,
                f"{file_name}_num_image_evidence": 0,
            })
            continue

        value, num_text_evidence, num_image_evidence = extract_verification_value(file_path)

        # "FALSE" verdicts are reported as "MISCAPTIONED"; a file without any
        # recognised verdict falls back to "TRUE".
        if value == "FALSE":
            label = "MISCAPTIONED"
        elif value:
            label = value
        else:
            label = "TRUE"

        result_row.update({
            file_name: label,
            f"{file_name}_num_text_evidence": num_text_evidence,
            f"{file_name}_num_image_evidence": num_image_evidence,
        })

    verification_results.append(result_row)

# One row per sample folder, ordered by folder id with a fresh 0..n-1 index.
verification_df = (
    pd.DataFrame(verification_results)
    .sort_values(by="folder")
    .reset_index(drop=True)
)

In [20]:
# Preview the first rows of the per-folder verification table.
verification_df.head()

Unnamed: 0,folder,verification.json,verification.json_num_text_evidence,verification.json_num_image_evidence,verification_allevi.json,verification_allevi.json_num_text_evidence,verification_allevi.json_num_image_evidence,verification_noevi.json,verification_noevi.json_num_text_evidence,verification_noevi.json_num_image_evidence
0,0,TRUE,1,0,TRUE,11,0,OUT-OF-CONTEXT,0,0
1,1,OUT-OF-CONTEXT,0,0,OUT-OF-CONTEXT,14,4,OUT-OF-CONTEXT,0,0
2,2,TRUE,0,0,OUT-OF-CONTEXT,9,1,OUT-OF-CONTEXT,0,0
3,3,OUT-OF-CONTEXT,2,6,OUT-OF-CONTEXT,2,6,OUT-OF-CONTEXT,0,0
4,4,TRUE,2,2,MISCAPTIONED,7,7,OUT-OF-CONTEXT,0,0


In [21]:
# Count how often each label occurs in the "verification.json" column.
verification_df["verification.json"].value_counts()

verification.json
OUT-OF-CONTEXT    529
TRUE              272
MISCAPTIONED       91
Name: count, dtype: int64

In [22]:
# Load the reference annotations and add an upper-cased copy of `label`
# as the "Ground Truth" column.
orig_df = pd.read_csv("VERITE.csv").assign(
    **{"Ground Truth": lambda frame: frame["label"].str.upper()}
)
orig_df.head()

Unnamed: 0.1,Unnamed: 0,index,caption,image_path,label,Ground Truth
0,0,0,Photograph shows a family that was taken away ...,images/true_0.jpg,TRUE,TRUE
1,1,1,Photograph shows Chinese officials in white pr...,images/true_0.jpg,miscaptioned,MISCAPTIONED
2,2,2,Photograph shows a family that was taken away ...,images/false_0.jpg,out-of-context,OUT-OF-CONTEXT
3,3,3,Image shows electric scooters abandoned due to...,images/true_1.jpg,TRUE,TRUE
4,4,4,Image shows electric green scooters that have ...,images/true_1.jpg,miscaptioned,MISCAPTIONED


In [23]:
# Count how often each upper-cased label occurs in orig_df.
orig_df["Ground Truth"].value_counts()

Ground Truth
TRUE              302
MISCAPTIONED      302
OUT-OF-CONTEXT    288
Name: count, dtype: int64

In [24]:
# Attach the ground truth by sample id instead of by row position: each row's
# `folder` number is looked up among orig_df's row labels. When every folder
# 0..n-1 is present this gives exactly the same column as plain index
# alignment, but a missing folder no longer shifts the labels of all later
# rows by one. Folder ids with no matching row label become NaN.
# NOTE(review): assumes folder N corresponds to row label N of VERITE.csv -- confirm.
verification_df["Ground Truth"] = verification_df["folder"].map(orig_df["Ground Truth"])
verification_df.head()

Unnamed: 0,folder,verification.json,verification.json_num_text_evidence,verification.json_num_image_evidence,verification_allevi.json,verification_allevi.json_num_text_evidence,verification_allevi.json_num_image_evidence,verification_noevi.json,verification_noevi.json_num_text_evidence,verification_noevi.json_num_image_evidence,Ground Truth
0,0,TRUE,1,0,TRUE,11,0,OUT-OF-CONTEXT,0,0,TRUE
1,1,OUT-OF-CONTEXT,0,0,OUT-OF-CONTEXT,14,4,OUT-OF-CONTEXT,0,0,MISCAPTIONED
2,2,TRUE,0,0,OUT-OF-CONTEXT,9,1,OUT-OF-CONTEXT,0,0,OUT-OF-CONTEXT
3,3,OUT-OF-CONTEXT,2,6,OUT-OF-CONTEXT,2,6,OUT-OF-CONTEXT,0,0,TRUE
4,4,TRUE,2,2,MISCAPTIONED,7,7,OUT-OF-CONTEXT,0,0,MISCAPTIONED


In [25]:
# Label counts of the "Ground Truth" column now stored in verification_df.
verification_df["Ground Truth"].value_counts()

Ground Truth
TRUE              302
MISCAPTIONED      302
OUT-OF-CONTEXT    288
Name: count, dtype: int64

In [26]:
# Label columns: the three verification-file columns plus "Ground Truth".
columns_to_transform = ["verification.json", "verification_allevi.json", "verification_noevi.json", "Ground Truth"]

# Optional binary evaluation: the loop below, when uncommented, rewrites
# "MISCAPTIONED" and "OUT-OF-CONTEXT" to "FALSE" in every column listed above
# and leaves all other values unchanged.
# UNCOMMENT THIS IF YOU WANT THE BINARY VERSION RESULTS
#for col in columns_to_transform:
#    verification_df[col] = verification_df[col].apply(lambda x: "FALSE" if x in ["MISCAPTIONED", "OUT-OF-CONTEXT"] else x)

In [27]:
# Inspect the table again; it only differs from the earlier preview if the
# optional binary relabelling loop was uncommented and run.
verification_df.head()

Unnamed: 0,folder,verification.json,verification.json_num_text_evidence,verification.json_num_image_evidence,verification_allevi.json,verification_allevi.json_num_text_evidence,verification_allevi.json_num_image_evidence,verification_noevi.json,verification_noevi.json_num_text_evidence,verification_noevi.json_num_image_evidence,Ground Truth
0,0,TRUE,1,0,TRUE,11,0,OUT-OF-CONTEXT,0,0,TRUE
1,1,OUT-OF-CONTEXT,0,0,OUT-OF-CONTEXT,14,4,OUT-OF-CONTEXT,0,0,MISCAPTIONED
2,2,TRUE,0,0,OUT-OF-CONTEXT,9,1,OUT-OF-CONTEXT,0,0,OUT-OF-CONTEXT
3,3,OUT-OF-CONTEXT,2,6,OUT-OF-CONTEXT,2,6,OUT-OF-CONTEXT,0,0,TRUE
4,4,TRUE,2,2,MISCAPTIONED,7,7,OUT-OF-CONTEXT,0,0,MISCAPTIONED


In [28]:
# Ground-truth label counts as they will be used for scoring.
verification_df["Ground Truth"].value_counts()

Ground Truth
TRUE              302
MISCAPTIONED      302
OUT-OF-CONTEXT    288
Name: count, dtype: int64

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(predictions, ground_truth):
    """Score one set of predicted labels against the reference labels.

    Args:
        predictions: Predicted label for each sample.
        ground_truth: Reference label for each sample, in the same order.

    Returns:
        An ``(accuracy, precision, recall, f1)`` tuple. Precision, recall
        and F1 are each computed with ``average='weighted'`` and
        ``zero_division=0``.
    """
    # Options shared by the three averaged scorers.
    shared_options = {"average": "weighted", "zero_division": 0}

    accuracy = accuracy_score(ground_truth, predictions)
    precision, recall, f1 = (
        scorer(ground_truth, predictions, **shared_options)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

In [30]:
# Reference labels that the prediction columns are scored against below.
ground_truth = verification_df["Ground Truth"]

In [31]:
# Score each verification variant against the shared ground truth; the
# resulting table has one row per variant and one column per metric.
prediction_columns = ("verification.json", "verification_allevi.json", "verification_noevi.json")
metrics = {
    column_name: calculate_metrics(verification_df[column_name], ground_truth)
    for column_name in prediction_columns
}

metric_names = ["Accuracy", "Precision", "Recall", "F1"]
metrics_df = pd.DataFrame.from_dict(metrics, orient='index', columns=metric_names)

In [32]:
# Show the metrics table (one row per verification variant).
metrics_df.head()

Unnamed: 0,Accuracy,Precision,Recall,F1
verification.json,0.486547,0.535915,0.486547,0.461519
verification_allevi.json,0.477578,0.51666,0.477578,0.459881
verification_noevi.json,0.42713,0.479891,0.42713,0.408103


In [33]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 breakdown for the "verification.json" column.
y_true, y_pred = ground_truth, verification_df["verification.json"]
report_text = classification_report(y_true, y_pred)
print(report_text)

                precision    recall  f1-score   support

  MISCAPTIONED       0.54      0.16      0.25       302
OUT-OF-CONTEXT       0.37      0.69      0.48       288
          TRUE       0.69      0.62      0.65       302

      accuracy                           0.49       892
     macro avg       0.53      0.49      0.46       892
  weighted avg       0.54      0.49      0.46       892

