In [None]:
import os
from dotenv import dotenv_values
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from utils import map_cpa_to_labels, calculate_f1_scores, save_pickle_file, load_cpa_dataset_column, load_pickle_file, load_cpa_dataset, decimal, map_answers_column
import tqdm
import random
from langchain.embeddings.openai import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import multiprocessing

In [None]:
# Read the OpenAI API key from a dotenv file (given by its full path) and expose
# it both as a notebook variable and as an environment variable.
config = dotenv_values("/full/path/to/file/key.env")
OPENAI_API_KEY = config["OPENAI_API_KEY"]
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [None]:
# Datasets to run the experiments on (passed to load_cpa_dataset and used in output paths).
datasets = [
    "sotabv2",
    "t2dv2-webtables",
]

# OpenAI chat models queried for every dataset.
models = [
    "gpt-3.5-turbo-0125",
    "gpt-4-0125-preview",
]

## Select most similar demonstrations for each test example

In [None]:
# Embedding model used to compare test examples with training examples.
# Note: `model_name` is reassigned later by the loops over the chat models.
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

def top_10_indices(index):
    """Return the positions of the 10 training examples most similar to one test example.

    Reads the module-level `test_embeddings` and `train_embeddings`.

    Args:
        index: Position of the test example in `test_embeddings`.

    Returns:
        A list of at most 10 positions into `train_embeddings`, ordered by
        ascending cosine similarity, so the most similar one is the last element.
    """
    similarities = cosine_similarity([test_embeddings[index]], train_embeddings)[0]

    # Rank all training positions from least to most similar; the sort is
    # stable, so equally similar positions keep their original order.
    ranked = sorted(range(len(similarities)), key=lambda position: similarities[position])

    # The tail of the ranking holds the most similar training examples
    return ranked[-10:]

In [None]:
# For every dataset: embed the test and training examples, find the 10 most
# similar training examples for each test example, and store the result.
for dataset in datasets:
    print(dataset)
    # Load dataset
    examples, labels, test_table_type_labels, train_examples, train_example_labels, train_table_type_labels, labels_to_text, text_to_label, labels_joined, train, test = load_cpa_dataset(dataset,"-markdown-20",False)

    test_embeddings = embed.embed_documents(examples)
    train_embeddings = embed.embed_documents(train_examples)

    # top_10_indices indexes `test_embeddings`, so the index range is taken
    # from it rather than from `test`.
    num_test_embeddings = len(test_embeddings)

    # top_10_indices reads `test_embeddings` / `train_embeddings` as module
    # globals, so the pool is created only after both have been assigned.
    # NOTE(review): presumably relies on the workers inheriting these globals
    # (fork start method) - confirm on platforms that spawn worker processes.
    # The `with` block shuts the workers down even if a task raises.
    with multiprocessing.Pool(processes=4) as pool:
        examples_demonstrations = list(
            tqdm.tqdm(
                pool.imap(top_10_indices, range(num_test_embeddings)),
                total=num_test_embeddings,
            )
        )

    # Save most similar training examples to test examples
    save_pickle_file(f"embeddings/cpa-examples_demonstrations_{dataset}-20.pkl", examples_demonstrations)

## Table-prompts experiments

In [None]:
# Task descriptions for the first system message, keyed by the suffix that
# identifies the variant in the experiment name.
tasks = {}
tasks[""] = "Your task is to classify the relationship between two columns of a given table with one of the following relationships that are separated with comma:"
tasks["-cpa"] = "Your task is to perform column property annotation (CPA), meaning that your task is to annotate the relationship between the leftmost column (name column) and a second column of a given table with only one of the following relationships that are separated with comma:"

In [None]:
# Step-by-step instructions for the second system message, keyed by the suffix
# that identifies the variant in the experiment name.
instructions = {}
instructions[""] = "Your instructions are: 1. Look at the input given to you and make a table out of it. 2. Look at the cell values in detail. 3. For each column, select a relationship that best represents the relationship between that column and the first column of the table. 4. Answer with only one selected relationship for each column with the format Column 2: relationship. Don't return any relationship for the first column! 5. Answer only with labels from the provided label set!"
instructions["-less-instructions"] = "Your instructions are: 1. For each column, select a relationship from the list that best represents the relationship between that column and the first column of the table. 2. Answer with only one selected relationship for each column with the format Column 2: relationship. Don't return any relationship for the first column! 3. Answer only with labels from the provided label set!"

In [None]:
# Question placed in front of every table in a human message, keyed by the
# suffix that identifies the variant in the experiment name.
last_message = {}
last_message[""] = "Classify these table columns:"
last_message["-annotate"] = "Please annotate the columns of the following table:"
last_message["-determine"] = "Please determine the relationships for columns of this table:"
last_message["-relationships"] = "Please classify the relationships between the first column and the other columns of this table:"

In [None]:
# Zero-shot: query every model with every combination of task description,
# instructions and question, and store the raw answers per combination.
for dataset in datasets:
    print(dataset)
    # Load dataset
    examples, labels, test_table_type_labels, train_examples, train_example_labels, train_table_type_labels, labels_to_text, text_to_label, labels_joined, train, test = load_cpa_dataset(dataset,"-markdown-20",False)

    for model_name in models[:2]:
        print(model_name)
        chat = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0, model=model_name)

        for nr in [0]:
            print(nr)
            for task, task_text in tasks.items():
                for instruction, instruction_text in instructions.items():
                    for mess, question in last_message.items():
                        print(f"cpa-chat-table-{nr}-shot{task}{instruction}{mess}")
                        preds = []

                        # One request per test example for the current combination
                        for example in tqdm.tqdm(examples, total=len(examples)):
                            messages = [
                                SystemMessage(content=f"{task_text} {labels_joined}."),
                                SystemMessage(content=instruction_text),
                            ]

                            # `nr` randomly drawn demonstrations (none when nr is 0)
                            for _ in range(nr):
                                index = random.randint(0, len(train_examples)-1)
                                messages.extend([
                                    HumanMessage(content=f"{question}\n{train_examples[index]}"),
                                    AIMessage(content=f"{train_example_labels[index]}"),
                                ])

                            # The test example itself goes last
                            messages.append(HumanMessage(content=f"{question}\n{example}"))

                            preds.append(chat(messages).content)
                        save_pickle_file(f"predictions/{dataset}/{model_name}/cpa-chat-table-{nr}-shot-markdown{task}{instruction}{mess}.pkl", preds)

In [None]:
# Few-shot
# Fixed seed for the randomly drawn demonstrations, so that re-running the
# notebook sends the same prompts. A dedicated generator is used so the global
# `random` state is left untouched.
FEW_SHOT_SEED = 42
few_shot_rng = random.Random(FEW_SHOT_SEED)


def _few_shot_messages(demonstration_indices, example, train_examples, train_example_labels, labels_joined):
    """Build the chat prompt for one test example.

    Uses the default task description (`tasks[""]`), the "-less-instructions"
    instructions and the "-relationships" question, so the few-shot prompts
    share their texts with the zero-shot prompt variants.

    Args:
        demonstration_indices: Positions in `train_examples` to show as
            demonstrations, in the order they should appear in the prompt.
        example: The test example to annotate; placed in the final message.
        train_examples: Training examples to take demonstrations from.
        train_example_labels: Expected answers, aligned with `train_examples`.
        labels_joined: Label set text appended to the task description.

    Returns:
        The list of messages: two system messages, one human/AI message pair
        per demonstration, and the human message containing `example`.
    """
    task_text = tasks[""]
    question = last_message["-relationships"]

    messages = [
        SystemMessage(content=f"{task_text} {labels_joined}."),
        SystemMessage(content=instructions["-less-instructions"]),
    ]
    for index in demonstration_indices:
        messages.append(HumanMessage(content=f"{question}\n{train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    messages.append(HumanMessage(content=f"{question}\n{example}"))
    return messages


for dataset in datasets:
    print(dataset)
    # Load dataset
    examples, labels, test_table_type_labels, train_examples, train_example_labels, train_table_type_labels, labels_to_text, text_to_label, labels_joined, train, test = load_cpa_dataset(dataset,"-markdown-20",False)
    examples_demonstrations = load_pickle_file(f"embeddings/cpa-examples_demonstrations_{dataset}-20.pkl")

    for model_name in models[:2]:
        print(model_name)
        chat = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0, model=model_name)

        # Few-shot: random
        for nr in [1, 3, 5]:
            print(nr)
            preds = []

            for example in tqdm.tqdm(examples, total=len(examples)):
                # NOTE(review): demonstrations are drawn independently, so the
                # same training example can appear more than once - confirm
                # that this is intended.
                demonstration_indices = [
                    few_shot_rng.randint(0, len(train_examples)-1) for _ in range(nr)
                ]
                messages = _few_shot_messages(demonstration_indices, example, train_examples, train_example_labels, labels_joined)

                res = chat(messages)
                preds.append(res.content)
            save_pickle_file(f"predictions/{dataset}/{model_name}/cpa-chat-table-{nr}-shot-markdown-20-less-instructions-relationships.pkl", preds)

        # Few-shot: similar
        for nr in [1, 3, 5]:
            print(nr)
            preds = []

            for i, example in tqdm.tqdm(enumerate(examples), total=len(examples)):
                # The stored indices are ordered by ascending similarity, so the
                # last `nr` entries are the most similar training examples and
                # the closest one ends up right before the test example.
                demonstration_indices = examples_demonstrations[i][-nr:]
                messages = _few_shot_messages(demonstration_indices, example, train_examples, train_example_labels, labels_joined)

                res = chat(messages)
                preds.append(res.content)
            save_pickle_file(f"predictions/{dataset}/{model_name}/cpa-chat-table-{nr}-similar-shot-markdown-20-less-instructions-relationships.pkl", preds)


## Evaluation

In [None]:
# Table prompt: score the stored predictions of every few-shot setting and print
# precision, recall, macro-F1, micro-F1 and `num` as one tab-separated row.
# NOTE(review): `dataset`, `model_name`, `labels`, `test` and `text_to_label` are
# not assigned in this cell; they hold whatever the previously executed cell
# left behind - confirm that this is the intended dataset/model.
# NOTE(review): the prediction cells save files with longer names
# ("...-shot-markdown..."); verify that the path below points to existing files.
for nr in [0, 1, 3, 5, "3-similar","5-similar"]:
    preds = load_pickle_file(f"predictions/{dataset}/{model_name}/cpa-chat-table-{nr}-shot.pkl")
    predictions, num = map_cpa_to_labels(preds, test, text_to_label)

    # Distinct gold labels; "-" is added only when it occurs among the predictions
    types = list(set(labels))
    if '-' in predictions:
        types = types + ["-"]

    evaluation, per_class_eval = calculate_f1_scores(labels, predictions, len(types), types)
    print(f"{decimal(evaluation['Precision'])}\t{decimal(evaluation['Recall'])}\t{decimal(evaluation['Macro-F1'])}\t{decimal(evaluation['Micro-F1'])}\t{num}")

## Error Analysis

In [None]:
# Print every wrong prediction together with its table and count the errors,
# in total and per gold label.
# NOTE(review): `table_indices` is not defined in the visible cells - presumably
# it maps each position in `labels` to an index into `test`; confirm where it
# comes from before running this cell on a fresh kernel.
errors = 0
errors_per_class = {}
for i in range(len(predictions)):
    predicted, expected = predictions[i], labels[i]
    if predicted == expected:
        # Correct prediction: nothing to report
        continue

    errors += 1
    table_index = table_indices[i]
    print(table_index)
    print(test[table_index][1])
    print(test[table_index][2])
    print(f"Predicted as {predicted} when it was {expected}")
    errors_per_class[expected] = errors_per_class.get(expected, 0) + 1
errors