In [1]:
import pandas as pd

In [2]:
dev_data = pd.read_csv("OpenForge_ICPSR_Benchmark/openforge_icpsr_hyper_training.csv")

In [4]:
test_data = pd.read_csv("OpenForge_ICPSR_Benchmark/openforge_icpsr_hyper_test.csv")

In [5]:
test_data

Unnamed: 0,concept_1,concept_2,name_qgram_similarity,name_jaccard_similarity,name_edit_distance,name_fasttext_similarity,name_word_count_ratio,name_char_count_ratio,relation_variable_label,relation_variable_name
0,human settlements,administrative divisions,0.000000,0.0,18,0.287421,1.0,0.708333,1,R_1-2
1,human settlements,provinces,0.000000,0.0,15,0.187749,2.0,1.888889,1,R_1-3
2,human settlements,fatalities,0.000000,0.0,13,0.242180,2.0,1.700000,0,R_1-4
3,human settlements,homicide,0.000000,0.0,14,0.214332,2.0,2.125000,0,R_1-5
4,human settlements,murder,0.000000,0.0,15,0.194847,2.0,2.833333,0,R_1-6
...,...,...,...,...,...,...,...,...,...,...
1426,Cuban Revolution,imprisonment,0.000000,0.0,15,0.153960,2.0,1.333333,0,R_51-53
1427,Cuban Revolution,corporate sentencing,0.000000,0.0,16,0.106177,1.0,0.800000,0,R_51-54
1428,punishment,imprisonment,0.125000,0.0,6,0.518840,1.0,0.833333,1,R_52-53
1429,punishment,corporate sentencing,0.045455,0.0,16,0.424819,0.5,0.500000,1,R_52-54


In [6]:
def few_shots(n=10, balanced=True):
    if not balanced:
        return dev_data.sample(n)
    rst = pd.concat([dev_data[dev_data['relation_variable_label'] == 1].sample(n//2), dev_data[dev_data['relation_variable_label'] == 0].sample(n - n//2)])
    rst = rst.sample(frac=1)
    return rst[['concept_1', 'concept_2', 'relation_variable_label']]

In [11]:
prompts = []
labels = []

for _, row in test_data.iterrows():
    prompt = """Your task is to determine whether two concepts are hypernymy, which means that first concept has broader meaning than the second one. Return your final result in the following JSON format : {"answer": <yes or no>}.
"""
    fewshots_prompt = "\n\n".join(["Input:\nConcept 1: {}\nConcept 2: {}\nOutput:\n{}".format(row[0], row[1], '{"answer": "yes"}' if row[2] else '{"answer": "no"}') for row in few_shots().values.tolist()])
    # prompt += fewshots_prompt
    prompt += """
Input:
Concept 1: {}
Concept 2: {}

Output:
    """.format(row['concept_1'], row['concept_2'])
    prompts.append(prompt)
    labels.append(row['relation_variable_label'])

df = pd.DataFrame({"prompt": prompts, "label": labels})
df.to_json("icpsr_test_0shots.jsonl", orient="records", lines=True)

In [12]:
print(prompts[0])

Your task is to determine whether two concepts are hypernymy, which means that first concept has broader meaning than the second one. Return your final result in the following JSON format : {"answer": <yes or no>}.

Input:
Concept 1: human settlements
Concept 2: administrative divisions

Output:
    


## Evaluate

In [21]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
import re
import json
def evaluate(file):
    df = pd.read_json(file, lines=True)
    y_pred, y_true = [], []
    for _, row in df.iterrows():
        label = row['label']
        res = row['response']
        pattern = r'{[^}]*}'
        matches = re.findall(pattern, res)
        try:
            res = json.loads(matches[0].strip())
        except:
            print(res)
            continue
        y_true.append(label)
        pred = 1 if res['answer'] == "yes" else 0
        y_pred.append(pred)
    f1 = f1_score(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    rc = recall_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    print("f1: {:.5f}\tacc: {:.5f}\trecall: {:.5f}\tprecision: {:.5f}".format(f1, acc, rc, prec))

In [22]:
evaluate("icpsr_test_0shots.jsonl.gpt-3.5-turbo-1106.result.jsonl")

{"answer": no}
f1: 0.50000	acc: 0.94266	recall: 0.71930	precision: 0.38318


In [23]:
evaluate("icpsr_test_10shots.jsonl.gpt-3.5-turbo-1106.result.jsonl")

f1: 0.58065	acc: 0.96366	recall: 0.63158	precision: 0.53731


In [24]:
evaluate("icpsr_test_0shots.jsonl.gpt-4.result.jsonl")

f1: 0.49057	acc: 0.92453	recall: 0.91228	precision: 0.33548


In [25]:
evaluate("icpsr_test_10shots.jsonl.gpt-4.result.jsonl")

f1: 0.79389	acc: 0.98113	recall: 0.91228	precision: 0.70270


## evaluate ARTS Multilabel

In [28]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
import re
import json
def evaluate(file):
    df = pd.read_json(file, lines=True)
    y_pred, y_true = [], []
    for _, row in df.iterrows():
        label = row['label']
        res = row['response']
        pattern = r'{[^}]*}'
        matches = re.findall(pattern, res)
        try:
            res = json.loads(matches[0].strip())
        except:
            print(res)
            continue
        if res['answer'] == "NULL":
            pred = 0
        elif res['answer'] == "equivalent":
            pred = 1
        elif res['answer'] == "hypernymy":
            pred = 2
        else:
            continue
        y_true.append(label)
        y_pred.append(pred)
    f1 = f1_score(y_true, y_pred, average="macro")
    # acc = accuracy_score(y_true, y_pred, average="macro")
    rc = recall_score(y_true, y_pred, average="macro")
    prec = precision_score(y_true, y_pred, average="macro")
    print("macro f1: {:.5f}\tmacro recall: {:.5f}\tmacro precision: {:.5f}".format(f1, rc, prec))

In [29]:
evaluate("arts_multilabel_test_w_sample_values_0shots.jsonl.gpt-3.5-turbo-1106.result.jsonl")

macro f1: 0.68022	macro recall: 0.64246	macro precision: 0.73165


In [30]:
evaluate("arts_multilabel_test_w_sample_values_10shots.jsonl.gpt-3.5-turbo-1106.result.jsonl")

macro f1: 0.55565	macro recall: 0.50958	macro precision: 0.64749


In [31]:
evaluate("arts_multilabel_test_w_sample_values_0shots.jsonl.gpt-4.result.jsonl")

macro f1: 0.82329	macro recall: 0.88910	macro precision: 0.78123


In [32]:
evaluate("arts_multilabel_test_w_sample_values_10shots.jsonl.gpt-4.result.jsonl")

macro f1: 0.87390	macro recall: 0.94211	macro precision: 0.83159


In [33]:
evaluate("arts_multilabel_test_0shots.jsonl.gpt-3.5-turbo-1106.result.jsonl")

macro f1: 0.87962	macro recall: 0.90712	macro precision: 0.85722


In [34]:
evaluate("arts_multilabel_test_0shots.jsonl.gpt-4.result.jsonl")

macro f1: 0.85677	macro recall: 0.97075	macro precision: 0.80564


In [35]:
evaluate("arts_multilabel_test_10shots.jsonl.gpt-3.5-turbo-1106.result.jsonl")

macro f1: 0.83740	macro recall: 0.86702	macro precision: 0.81889


In [37]:
evaluate("arts_multilabel_test_10shots.jsonl.gpt-4.result.jsonl")

macro f1: 0.90880	macro recall: 0.98434	macro precision: 0.86498
