In [None]:
import os
from dotenv import dotenv_values
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from utils import map_cpa_to_labels, calculate_f1_scores, save_pickle_file, load_cpa_dataset_column, load_pickle_file, load_cpa_dataset, decimal, map_answers_column, map_cta_labels,load_cta_dataset
import tqdm
import random

In [None]:
# Read the OpenAI API key from the .env file (referenced by its full path).
config = dotenv_values("/full/path/to/file/key.env")
OPENAI_API_KEY = config["OPENAI_API_KEY"]
# Also export the key through the process environment.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [None]:
# Start fine-tuning job
# Example: fine-tune on cpa task only.
# The fine-tuning endpoint takes the IDs of files that were already uploaded
# to OpenAI (purpose "fine-tune"), not local paths, so both JSONL files are
# uploaded first and their returned IDs are passed to the job.
from openai import OpenAI

client = OpenAI()

# Binary mode + context manager: the handle is closed as soon as the upload returns.
with open("data/ft-data/cpa-ft/cpa-train-20-ft.jsonl", "rb") as train_fh:
    training_file = client.files.create(file=train_fh, purpose="fine-tune")

with open("data/ft-data/cpa-ft/cpa-val-ft.jsonl", "rb") as val_fh:
    validation_file = client.files.create(file=val_fh, purpose="fine-tune")

fine_tuning_job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    validation_file=validation_file.id,
    model="gpt-3.5-turbo-0613",
)

# Last expression of the cell: display the created job (as the original call did).
fine_tuning_job

In [None]:
# Fine-tuned model IDs as keys, mapped to the short name used for that model's
# predictions sub-directory. The key is what gets passed as `model=` to the
# chat client; the value is what gets interpolated into the output path.
#
# The keys below are PLACEHOLDERS: replace each one with the real ID of your
# fine-tuned model. They must be distinct -- repeating the same key (e.g. an
# empty string) makes the dict silently keep only the last entry.
fine_tuning_models = {
    "ft:gpt-3.5-turbo-0613:<org>::<cta-job-id>": "cta-ft-gpt-3.5-turbo-0613",
    "ft:gpt-3.5-turbo-0613:<org>::<cpa-job-id>": "cpa-ft-gpt-3.5-turbo-0613",
    "ft:gpt-3.5-turbo-0613:<org>::<ctacpa-job-id>": "ctacpa-ft-gpt-3.5-turbo-0613",
    "ft:gpt-3.5-turbo-0613:<org>::<ctacpa-small-job-id>": "ctacpa-small-ft-gpt-3.5-turbo-0613",
}

# The prediction and evaluation cells look this mapping up as `ft_models`.
ft_models = fine_tuning_models

### Test CPA task with fine-tuned models

In [None]:
# Zero-shot CPA predictions: for every dataset and every fine-tuned model,
# query the model once per test table and pickle the raw answers.
# NOTE(review): `task`, `instruction` and `mess` only appear in the printed run
# name; neither the prompt nor the output path depends on them, so every
# combination sends the same requests and writes to the same file.

# Shared prefix of every user turn; the table itself is appended to it.
cpa_question = "Please classify the relationships between the first column and the other columns of this table:\n"

for dataset in datasets:
    print(dataset)
    # Load dataset
    (
        examples, labels, test_table_type_labels,
        train_examples, train_example_labels, train_table_type_labels,
        labels_to_text, text_to_label, labels_joined,
        train, test,
    ) = load_cpa_dataset(dataset, "-markdown-20", False)

    for model_name in ft_models:
        print(model_name)
        chat = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0, model=model_name)

        for nr in [0]:
            print(nr)
            for task in tasks:
                for instruction in instructions:
                    for mess in last_message:
                        print(f"cpa-chat-table-{nr}-shot{task}{instruction}{mess}")
                        preds = []

                        # One request per test table.
                        for example in tqdm.tqdm(examples, total=len(examples)):
                            # Two system turns: the task with its label set, then the answer rules.
                            messages = [
                                SystemMessage(content=f"Your task is to classify the relationship between two columns of a given table with one of the following relationships that are separated with comma: {labels_joined}."),
                                SystemMessage(content="Your instructions are: 1. For each column, select a relationship from the list that best represents the relationship between that column and the first column of the table. 2. Answer with only one selected relationship for each column with the format Column 2: relationship. Don't return any relationship for the first column! 3. Answer only with labels from the provided label set!"),
                            ]

                            # `nr` randomly drawn demonstrations (none when nr == 0).
                            for shot in range(nr):
                                index = random.randint(0, len(train_examples)-1)
                                messages += [
                                    HumanMessage(content=f"{cpa_question}{train_examples[index]}"),
                                    AIMessage(content=f"{train_example_labels[index]}"),
                                ]

                            # The table to be classified goes last.
                            messages.append(HumanMessage(content=f"{cpa_question}{example}"))

                            res = chat(messages)
                            preds.append(res.content)

                        cpa_preds_path = f"predictions/{dataset}/{ft_models[model_name]}/cpa-chat-table-{nr}-shot-markdown.pkl"
                        save_pickle_file(cpa_preds_path, preds)

In [None]:
# Evaluation of the saved CPA predictions.
# NOTE(review): `dataset`, `model_name`, `labels`, `test` and `text_to_label`
# are not set in this cell; it uses whatever values the CPA prediction loop
# finished on, so only that last dataset/model combination is scored.
for nr in [0]:
    preds = load_pickle_file(f"predictions/{dataset}/{ft_models[model_name]}/cpa-chat-table-{nr}-shot-markdown.pkl")
    predictions, num = map_cpa_to_labels(preds, test, text_to_label)

    # Classes to score: every gold label, plus "-" when it occurs among the
    # predictions (presumably the marker for unmappable answers -- confirm in utils).
    types = list(set(labels))
    if '-' in predictions:
        types.append("-")

    evaluation, per_class_eval = calculate_f1_scores(labels, predictions, len(types), types)

    # One tab-separated row: precision, recall, macro-F1, micro-F1, num.
    precision = decimal(evaluation['Precision'])
    recall = decimal(evaluation['Recall'])
    macro_f1 = decimal(evaluation['Macro-F1'])
    micro_f1 = decimal(evaluation['Micro-F1'])
    print(f"{precision}\t{recall}\t{macro_f1}\t{micro_f1}\t{num}")

### Test CTA task with fine-tuned models

In [None]:
# Zero-shot CTA predictions: for every dataset and every fine-tuned model,
# query the model once per test table and pickle the raw answers.

# Shared prefix of every user turn; the table itself is appended to it.
cta_question = "Classify these table columns:\n"

for dataset in datasets:
    print(dataset)
    # Load dataset
    (
        examples, labels, test_table_type_labels,
        train_examples, train_example_labels, train_table_type_labels,
        labels_to_text, text_to_label, labels_joined,
        train, test,
    ) = load_cta_dataset(dataset, "-markdown-20")

    for model_name in ft_models:
        print(ft_models[model_name])
        chat = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0, model=model_name)

        #Zero-shot
        for nr in [0]:
            preds = []

            # One request per test table (`j` is the table's position in `examples`).
            for j, example in tqdm.tqdm(enumerate(examples), total=len(examples)):
                # Two system turns: the task with its label set, then the answer rules.
                messages = [
                    SystemMessage(content=f"You are a world-class data engineer and your task is to annotate the columns of a given table with only one of the following labels that are separated with comma: {labels_joined}."), #labels_in_prompts[j]
                    SystemMessage(content="Your instructions are: 1. Look at the input given to you and make a table out of it. 2. Look at the cell values in detail. 3. For the required columns, select a label that best represents the meaning of all cells in the column. 4. Answer with the selected label for each column using the format Column 1: label. 5. Answer only with labels from the provided label set!"),
                ]

                # `nr` randomly drawn demonstrations (none when nr == 0).
                for shot in range(nr):
                    index = random.randint(0, len(train_examples)-1)
                    messages += [
                        HumanMessage(content=f"{cta_question}{train_examples[index]}"),
                        AIMessage(content=f"{train_example_labels[index]}"),
                    ]

                # The table to be annotated goes last.
                messages.append(HumanMessage(content=f"{cta_question}{example}"))
                res = chat(messages)
                preds.append(res.content)

            cta_preds_path = f"predictions/{dataset}/{ft_models[model_name]}/chat-table-{nr}-shot-markdown.pkl"
            save_pickle_file(cta_preds_path, preds)

In [None]:
# Evaluation of the saved CTA predictions.
# NOTE(review): `dataset`, `model_name`, `labels`, `test` and `text_to_label`
# are not set in this cell; it uses whatever values the CTA prediction loop
# finished on, so only that last dataset/model combination is scored.
for nr in [0]:
    preds = load_pickle_file(f"predictions/{dataset}/{ft_models[model_name]}/chat-table-{nr}-shot-markdown.pkl")
    # Only the first two returned values are used below; the out-of-vocabulary
    # bookkeeping is kept in named variables and the last two values are discarded.
    (
        predictions, num,
        oov_indices, oov, oov_table_indices, oov_tablecolumn_indices,
        _, _,
    ) = map_cta_labels(preds, test, text_to_label)

    # Drop empty gold labels before scoring.
    labels = [l for l in labels if l!=""]

    # Classes to score: every gold label, plus "-" when it occurs among the
    # predictions (presumably the marker for unmappable answers -- confirm in utils).
    types = list(set(labels))
    if '-' in predictions:
        types.append("-")

    evaluation, per_class_eval = calculate_f1_scores(labels, predictions, len(types), types)

    # One tab-separated row: precision, recall, macro-F1, micro-F1, num.
    precision = decimal(evaluation['Precision'])
    recall = decimal(evaluation['Recall'])
    macro_f1 = decimal(evaluation['Macro-F1'])
    micro_f1 = decimal(evaluation['Micro-F1'])
    print(f"{precision}\t{recall}\t{macro_f1}\t{micro_f1}\t{num}")