In [None]:

## Evaluation functions for geoparsers. They take the contents of a JSON file containing
## results from the different models, already formatted so that it only contains predicted locations that HAVE coordinates.

from math import radians, sin, cos, sqrt, atan2

def calculate_distance(coord1, coord2, earth_radius_km=6371):
    """Return the great-circle distance between two points (haversine formula).

    Args:
        coord1: First point; index 0 is read as latitude and index 1 as
            longitude, both in decimal degrees.
        coord2: Second point, same layout as ``coord1``.
        earth_radius_km: Radius of the sphere that scales the result. Defaults
            to 6371 (Earth radius in kilometres); pass a radius in another
            unit to get the distance in that unit.

    Returns:
        The distance as a float, in the unit of ``earth_radius_km``.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1 = radians(coord1[0]), radians(coord1[1])
    lat2, lon2 = radians(coord2[0]), radians(coord2[1])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise a
    # "math domain error"; clamp it back into the valid range.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return earth_radius_km * c

def compute_precision_recall_f1(instances, _truth, _pred):
    """Score predicted toponyms against the ground truth over all instances.

    A prediction is a true positive when its name and a ground-truth name
    contain one another (case-insensitive substring test, either direction)
    and that ground-truth name has not already been claimed by an earlier
    prediction of the same instance. Counts are summed over all instances
    before precision, recall and F1 are derived from them.

    Args:
        instances: Iterable of records; each is indexed with ``_truth`` and
            ``_pred``.
        _truth: Key of the ground-truth mapping (name -> coordinate) in a record.
        _pred: Key of the predicted mapping (name -> coordinate) in a record.

    Returns:
        ``(precision, recall, f1_score, matched_coordinates)`` where
        ``matched_coordinates`` holds one ``(predicted_coord, truth_coord)``
        pair per true positive. Each score is 0 when its denominator is 0.
    """
    tp_total = 0
    fp_total = 0
    fn_total = 0
    matched_coordinates = []

    for record in instances:
        gold = record[_truth]
        guesses = record[_pred]
        claimed = set()  # ground-truth names already paired in this record

        for guess_name, guess_coord in guesses.items():
            # First ground-truth entry that overlaps by name and is still free.
            hit = next(
                (
                    (gold_name, gold_coord)
                    for gold_name, gold_coord in gold.items()
                    if (
                        guess_name.lower() in gold_name.lower()
                        or gold_name.lower() in guess_name.lower()
                    )
                    and gold_name not in claimed
                ),
                None,
            )
            if hit is None:
                # Prediction without any free ground-truth partner.
                fp_total += 1
                continue
            claimed.add(hit[0])
            matched_coordinates.append((guess_coord, hit[1]))

        # Every claimed name is exactly one true positive; the rest were missed.
        tp_total += len(claimed)
        fn_total += len(gold) - len(claimed)

    n_predicted = tp_total + fp_total
    n_actual = tp_total + fn_total
    precision = tp_total / n_predicted if n_predicted > 0 else 0
    recall = tp_total / n_actual if n_actual > 0 else 0

    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0
    return precision, recall, f1_score, matched_coordinates

def calculate_A_at_k(matched_coordinates, k):
    """Return accuracy@k: the percentage of matched pairs at most ``k`` apart.

    Args:
        matched_coordinates: Sequence of ``(predicted_coord, truth_coord)``
            pairs, each passed to ``calculate_distance``.
        k: Distance threshold, compared against the value returned by
            ``calculate_distance``.

    Returns:
        The share of pairs whose distance is ``<= k``, scaled to 0-100, or 0
        when there are no pairs.
    """
    if not matched_coordinates:
        return 0

    within_k = sum(
        1
        for pred_coord, truth_coord in matched_coordinates
        if calculate_distance(pred_coord, truth_coord) <= k
    )
    return (within_k / len(matched_coordinates)) * 100


In [None]:
## this is how the above code is called
import json
import pprint
# Read the file explicitly as UTF-8: it carries German (`_de`) and Romanian
# (`_ro`) columns, so non-ASCII names are expected, and the platform default
# encoding is not guaranteed to decode them.
with open("web_corpora_geoparsed_llm.json", 'r', encoding="utf-8") as json_file:
    instances = json.load(json_file)

# Columns to score per language. The first entry of each list is the
# ground-truth column itself, so it is scored against itself below.
en = ['locations','camcoder_en', 'clavin_en', 'spacy_dict_en','spacy_gn_en','mordecai_en', 'llm_en', 'edinburgh_en']
de = ['locations_de', 'camcoder_de', 'clavin_de', 'spacy_dict_de', 'spacy_gn_de', 'mordecai_de', 'llm_de' ,'edinburgh_de']
ro = ['locations_ro', 'camcoder_ro', 'clavin_ro', 'spacy_dict_ro', 'spacy_gn_ro', 'mordecai_ro', 'llm_ro', 'edinburgh_ro']

# (ground-truth column, columns scored against it), in reporting order.
evaluation_runs = [("locations", en), ("locations_de", de), ("locations_ro", ro)]

# One result row per scored column: precision/recall/F1 plus accuracy at
# 161 and at 10 (the thresholds passed to calculate_A_at_k).
res = []
for truth_key, model_keys in evaluation_runs:
    for item in model_keys:
        precision, recall, f1_score, matched_coordinates = compute_precision_recall_f1(instances, truth_key, item)
        res.append({
            "model": item,
            "Precision": round(precision, 2),
            "Recall": round(recall, 2),
            "F1 Score": round(f1_score, 2),
            "A@161": round(calculate_A_at_k(matched_coordinates, 161), 2),
            "A@10": round(calculate_A_at_k(matched_coordinates, 10), 2),
        })
# with open("xx.json", 'w') as json_file:
#         json.dump(res, json_file, indent=4)

        # print("Precision:", precision)
        # print("Recall:", recall)
        # print("F1 Score:", f1_score)

        # # Calculate A@161 and A@10
        # accuracy_at_161 = calculate_A_at_k(matched_coordinates, 161)
        # accuracy_at_10 = calculate_A_at_k(matched_coordinates, 10)
        # print("A@161:", accuracy_at_161)
        # print("A@10:", accuracy_at_10)

In [None]:
## NER evaluation - same strategy as above. I ran the NER experiments separately;
# no coordinates are retrieved there, hence every predicted location is taken into account.

def compute_precision_recall_f1(ground_truths, predicted_lists):
    """Score predicted location names against the ground truth, per key.

    NOTE: this function has the same name as the geoparser scorer defined
    earlier in the notebook (the three-argument version); running this cell
    replaces that definition in the kernel.

    A prediction is a true positive when it and a ground-truth name contain
    one another (case-insensitive substring test, either direction) and that
    ground-truth name has not already been claimed by an earlier prediction
    under the same key. Counts are summed over all keys before the scores are
    derived.

    Args:
        ground_truths: Mapping from key to the collection of true names.
        predicted_lists: Mapping from the same keys to the predicted names.

    Returns:
        ``(precision, recall, f1_score)``; each is 0 when its denominator is 0.
    """
    tp_total = 0
    fp_total = 0
    fn_total = 0

    for key in ground_truths:
        gold = ground_truths[key]
        guesses = predicted_lists[key]
        claimed = set()  # ground-truth names already paired under this key

        for guess in guesses:
            # First ground-truth name that overlaps and is still free.
            hit = next(
                (
                    gt
                    for gt in gold
                    if (guess.lower() in gt.lower() or gt.lower() in guess.lower())
                    and gt not in claimed
                ),
                None,
            )
            if hit is None:
                # Prediction without any free ground-truth partner.
                fp_total += 1
            else:
                claimed.add(hit)

        # Every claimed name is exactly one true positive; the rest were missed.
        tp_total += len(claimed)
        fn_total += len(gold) - len(claimed)

    n_predicted = tp_total + fp_total
    n_actual = tp_total + fn_total
    precision = tp_total / n_predicted if n_predicted > 0 else 0
    recall = tp_total / n_actual if n_actual > 0 else 0

    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0
    return precision, recall, f1_score

In [None]:
import pprint
import json
def get_eval_res_new(data_file, key, x):
    """Score each prediction column of a results file against a ground-truth column.

    Args:
        data_file: Path to a JSON file holding a list of records; each record
            is read via ``record["id"]``, ``record[key]`` and ``record[model]``.
        key: Name of the field holding the ground-truth names.
        x: Iterable of field names to evaluate, one per model.

    Returns:
        A list with one dict per entry of ``x``, with the keys ``"model"``
        (the field name with any ``"pred_"`` substring removed),
        ``"precision"``, ``"recall"`` and ``"f1_score"`` (scores rounded to
        two decimals).
    """
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(data_file, 'r', encoding="utf-8") as json_file:
        data = json.load(json_file)

    # The ground truth is the same for every model, so build it only once.
    ground_truths = {item["id"]: item[key] for item in data}

    result = []
    for model_ in x:
        print(model_)
        predicted_lists = {item["id"]: item[model_] for item in data}

        precision, recall, f1_score = compute_precision_recall_f1(ground_truths, predicted_lists)

        result.append({"model": model_.replace("pred_", ""),
                       "precision": round(precision, 2),
                       "recall": round(recall, 2),
                       "f1_score": round(f1_score, 2),
                      })

    # with open("./NER_final/"+x[0]+"_web_corpora_NER.json", 'w') as json_file:
    #     json.dump(result, json_file, indent=4)

    # Hand the scores back to the caller; with the dump above disabled they
    # would otherwise be computed and then thrown away.
    return result

# Column-name prefixes of the NER tools; each language list below starts with
# the language code followed by "<tool>_<language>" for every tool.
_ner_tools = ('spacy_xx', 'spacy_en', 'gliner', 'stanNER', 'flair', 'gazpne')

en = ["en"] + [tool + "_en" for tool in _ner_tools]
de = ["de"] + [tool + "_de" for tool in _ner_tools]
ro = ["ro"] + [tool + "_ro" for tool in _ner_tools]

# Only the English columns are evaluated here ("en" is passed as the
# ground-truth key); `de` and `ro` are defined but not used in this cell.
get_eval_res_new("./NER_final/web_corpora_all_NER.json", "en", en)