In [1]:
import json
import tqdm

from langchain import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

from utils import load_pickle_file, save_pickle_file, map_answers_column, calculate_f1_scores, decimal, load_cta_dataset_column
from sklearn.metrics.pairwise import cosine_similarity

import os
# Expose only GPU index 4 to this kernel (indices noted by the author: 4,5,6,7).
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [None]:
# Datasets iterated over by the experiment loops below.
datasets = ["sotabv2", "t2dv2-webtables", "sportstables"]

# StableBeluga7B
# `mod` is the short tag used in the predictions/knowledge/embeddings paths.
model_name = "stabilityai/StableBeluga-7B"
mod = "stablebeluga7b"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir="hf_cache/",
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
    cache_dir="hf_cache/",
)

# SOLAR (alternative configuration, kept for reference; swap with the block above to use it)
# model_name = "upstage/SOLAR-0-70b-16bit"
# mod = "solar"
# tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="hf_cache/")
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, load_in_8bit=True, device_map="auto", cache_dir="hf_cache/", temperature=0, do_sample=True)

In [None]:
# Zero-shot prompt template. Placeholders filled in by the experiment loops:
#   {definitions}   - newline-separated "label: definition" lines
#   {labels_joined} - the candidate classes
#   {input_string}  - the column to classify
# Note the single trailing space after "Definitions:"; it is part of the prompt text.
zero_template = (
    "Answer the question based on the task, instructions and definitions below.\n"
    "Definitions: \n"
    "{definitions}\n"
    "Task: Classify the column given to you into only one of these classes that are separated with comma: {labels_joined}.\n"
    "Instructions: 1. Look at the column and the classes given to you. 2. Examine the values of the column. 3. Select a class that best represents the meaning of the column. 4. Answer with the selected class.\n"
    "Column: {input_string}\n"
    "Class:"
)

## Manual definitions

In [None]:
# Zero-shot column classification with manually written label definitions.
# For each dataset: rank the definitions by cosine similarity to every test
# column, put the closest ones into the prompt, generate an answer, and save
# the raw answers together with the exact prompts that produced them.
for dataset in datasets:
    examples, labels, train_examples, train_labels, labels_to_text, text_to_label, labels_joined, train, test = load_cta_dataset_column(dataset,"")

    # The definitions file is JSON keyed by label; the context manager closes
    # the handle instead of leaking one per dataset.
    with open(f'../data/{dataset}-definitions.txt') as f:
        definitions = json.load(f)
    # Parallel lists: all_labels[k] is the display text for definitions[k].
    all_labels = [labels_to_text[defn] for defn in definitions]
    definitions = [definitions[defn] for defn in definitions]
    test_embeddings = load_pickle_file(f'embeddings/test_embeddings_{dataset}-column.pkl')

    prompt = PromptTemplate(template=zero_template, input_variables=['input_string', 'labels_joined', 'definitions'])

    # Pick the necessary definitions for each example.
    # NOTE(review): assumes the rows of this embedding file follow the same
    # order as the entries of the definitions file - confirm.
    knowledge_embeddings = load_pickle_file(f"embeddings/{dataset}-definitions-embeddings.pkl")
    examples_demonstrations = []
    for i, example in tqdm.tqdm(enumerate(examples), total=len(examples)):
        similarities = cosine_similarity([test_embeddings[i]], knowledge_embeddings)[0]
        # Ascending (stable) sort by similarity, so the best match is last.
        ranked = sorted(range(len(similarities)), key=lambda idx: similarities[idx])
        examples_demonstrations.append(ranked[-10:])

    prompts = []
    model_answers = []

    for j, example in tqdm.tqdm(enumerate(examples), total=len(examples)):
        # First attempt uses the 5 closest definitions. If anything in the
        # attempt fails, retry once with only the 3 closest (a shorter prompt).
        # A failure of the retry propagates.
        for num_definitions in (5, 3):
            try:
                definitions_string = "\n".join(
                    f"{all_labels[i]}: {definitions[i]}"
                    for i in examples_demonstrations[j][-num_definitions:]
                ).strip()

                text_prompt = prompt.format(input_string=example.strip(), labels_joined=labels_joined, definitions=definitions_string)

                inputs = tokenizer(text_prompt, return_tensors="pt").to("cuda")
                # Sampling is enabled and no seed is set in the visible cells,
                # so answers can differ between runs.
                output = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=0, max_new_tokens=100)
                answer = tokenizer.decode(output[0], skip_special_tokens=True)
            except Exception as err:
                if num_definitions == 3:
                    raise
                tqdm.tqdm.write(f"Example {j}: attempt with 5 definitions failed ({err!r}); retrying with 3.")
            else:
                # Record the prompt only once generation succeeded so that
                # prompts[k] always pairs with model_answers[k].
                prompts.append(text_prompt)
                model_answers.append(answer)
                break

    save_pickle_file(f"predictions/{dataset}/{mod}/manual-definitions-prompt-column-0-shot.pkl", model_answers)
    save_pickle_file(f"predictions/{dataset}/{mod}/manual-definitions-prompt-column-0-shot-prompts.pkl", prompts)

## LLM Definitions experiments

In [13]:
# Identifiers of the prompt variants; the loops below combine them into file
# names of the form "{syst}_{a}" and "{syst}_{inst}_{tb}".
a_prompts = [f"A{n}" for n in range(1, 7)]
tb_prompts = [f"TB{n}" for n in range(1, 8)]
inst_prompts = [f"I{n}" for n in range(1, 3)]
syst_prompts = [f"S{n}" for n in range(1, 6)]

In [None]:
def _rank_definitions(test_embeddings, knowledge_embeddings, num_examples, keep=10):
    """Rank knowledge entries by cosine similarity for each test example.

    Returns one list per example with the indices of the `keep` most similar
    knowledge embeddings, in ascending similarity (best match last).
    """
    ranked = []
    for i in tqdm.tqdm(range(num_examples), total=num_examples):
        similarities = cosine_similarity([test_embeddings[i]], knowledge_embeddings)[0]
        # Stable ascending sort: ties keep their original index order.
        order = sorted(range(len(similarities)), key=lambda idx: similarities[idx])
        ranked.append(order[-keep:])
    return ranked


def _run_definition_prompts(run_name, log_name, dataset, examples, all_labels, labels_joined, test_embeddings):
    """Generate and save zero-shot predictions for one generated-knowledge configuration.

    `run_name` is the file-name stem (e.g. "S1_A1" or "S1_I1_TB1"); `log_name`
    is only used in the progress message. Does nothing when predictions for
    `run_name` already exist or when no knowledge file for it is present.
    Reads the notebook-level `mod`, `model`, `tokenizer` and `zero_template`.
    """
    predictions_dir = f"predictions/{dataset}/{mod}/"
    knowledge_dir = f"knowledge/{mod}/{dataset}/"

    # Directory listings are re-read on every call so freshly written files are seen.
    if f"{run_name}-column-0-shot.pkl" in os.listdir(predictions_dir):
        return
    if f"{run_name}_prompt_knowledge.pkl" not in os.listdir(knowledge_dir):
        return

    print(f"Loading knowledge {log_name}")
    definitions = load_pickle_file(f"{knowledge_dir}{run_name}_prompt_knowledge.pkl")
    knowledge_prompts = load_pickle_file(f"{knowledge_dir}{run_name}_prompt_knowledge-prompts.pkl")
    # The stored generations still contain the prompt that produced them; strip it.
    definitions = [d.replace(knowledge_prompts[i], "").strip() for i, d in enumerate(definitions)]
    knowledge_embeddings = load_pickle_file(f"embeddings/{mod}/{run_name}_knowledge_embeddings_{dataset}.pkl")

    examples_demonstrations = _rank_definitions(test_embeddings, knowledge_embeddings, len(examples))

    prompt = PromptTemplate(template=zero_template, input_variables=['input_string', 'labels_joined', 'definitions'])
    prompts = []
    model_answers = []

    for j, example in tqdm.tqdm(enumerate(examples), total=len(examples)):
        # First attempt uses the 5 closest definitions. If anything in the
        # attempt fails, retry once with only the 3 closest (a shorter prompt).
        # A failure of the retry propagates.
        for num_definitions in (5, 3):
            try:
                definitions_string = "\n".join(
                    f"{all_labels[i]}: {definitions[i]}"
                    for i in examples_demonstrations[j][-num_definitions:]
                ).strip()

                text_prompt = prompt.format(input_string=example.strip(), labels_joined=labels_joined, definitions=definitions_string)

                inputs = tokenizer(text_prompt, return_tensors="pt").to("cuda")
                # Sampling is enabled and no seed is set in the visible cells,
                # so answers can differ between runs.
                output = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=0, max_new_tokens=256)
                answer = tokenizer.decode(output[0], skip_special_tokens=True)
            except Exception as err:
                if num_definitions == 3:
                    raise
                tqdm.tqdm.write(f"{run_name} example {j}: attempt with 5 definitions failed ({err!r}); retrying with 3.")
            else:
                prompts.append(text_prompt)
                model_answers.append(answer)
                break

    save_pickle_file(f"{predictions_dir}{run_name}-column-0-shot.pkl", model_answers)
    save_pickle_file(f"{predictions_dir}{run_name}-column-0-shot-prompts.pkl", prompts)


# Zero-shot column classification using LLM-generated label definitions, for
# every prompt configuration that has generated knowledge and no saved predictions yet.
for dataset in datasets:
    examples, labels, train_examples, train_labels, labels_to_text, text_to_label, labels_joined, train, test = load_cta_dataset_column(dataset,"")
    # NOTE(review): indices from the knowledge embeddings are used to index this
    # list, which presumes the knowledge was generated in labels_to_text order - confirm.
    all_labels = [labels_to_text[label] for label in labels_to_text]

    test_embeddings = load_pickle_file(f"embeddings/test_embeddings_{dataset}-column.pkl")

    # Run A prompts
    for syst in syst_prompts:
        for g in a_prompts:
            _run_definition_prompts(f"{syst}_{g}", f"{syst}_{g}", dataset, examples, all_labels, labels_joined, test_embeddings)

    # Run B prompts
    for tb in tb_prompts:
        for syst in syst_prompts[:-1]: # skip last system for prompt b
            for inst in inst_prompts:
                _run_definition_prompts(f"{syst}_{inst}_{tb}", f"{tb}_{syst}_{inst}", dataset, examples, all_labels, labels_joined, test_embeddings)

## Evaluation

In [None]:
# Score every saved prediction file. Paths use the short model tag `mod`
# (the same tag the generation cells save under), not the loaded `model` object.
for dataset in datasets:
    # Reload the gold labels for this dataset instead of relying on whatever
    # the last generation loop happened to leave in `labels`.
    _, labels, *_ = load_cta_dataset_column(dataset, "")

    predictions_dir = f"predictions/{dataset}/{mod}/"
    available_files = set(os.listdir(predictions_dir))

    # A-prompt runs first, then B-prompt runs (B skips the last system prompt).
    run_names = [f"{syst}_{g}" for syst in syst_prompts for g in a_prompts]
    run_names += [
        f"{syst}_{inst}_{tb}"
        for tb in tb_prompts
        for syst in syst_prompts[:-1]
        for inst in inst_prompts
    ]

    print(dataset)
    for run_name in run_names:
        if f"{run_name}-column-0-shot.pkl" not in available_files:
            continue

        preds = load_pickle_file(f"{predictions_dir}{run_name}-column-0-shot.pkl")
        prompts = load_pickle_file(f"{predictions_dir}{run_name}-column-0-shot-prompts.pkl")

        predictions, num = map_answers_column(preds,prompts)

        # "-" only appears among the predictions, so add it as an extra class
        # when present.
        types = list(set(labels))
        if "-" in predictions:
            types = types + ["-"]
        evaluation, per_class_eval = calculate_f1_scores(labels, predictions, len(types), types)

        print(f"{run_name}\t{decimal(evaluation['Precision'])}\t{decimal(evaluation['Recall'])}\t{decimal(evaluation['Macro-F1'])}\t{decimal(evaluation['Micro-F1'])}\t{num}")