In [1]:
# ---- Data locations -------------------------------------------------------
few_shot_path = "/scratch/dzhang5/LLM/TWEET-FID/expert.train.short.csv"  # few-shot example pool
data_path = "/scratch/dzhang5/LLM/TWEET-FID/expert.smalltest.csv"  # rows to label
output_dir = "/scratch/dzhang5/LLM/TWEET-FID/test-results-autolabel-all-ae"
token_path = "/home/dzhang5/.cache/huggingface/token"  # file holding the Hugging Face token

# ---- Model ----------------------------------------------------------------
model_name = "gpt-3.5-turbo"
temperature = 0.1

# ---- Dataset columns ------------------------------------------------------
text_column = "context"
explanation_column = "entity_explanation"
example_selection_label_column = "has_all"

# ---- Prompt / label versions ----------------------------------------------
label_symbol = "^^^^"
label_version = "v1"
task_version = "v1"

# ---- Few-shot example selection -------------------------------------------
few_shot_num = 8
few_shot_selection = "label_diversity_similarity"
random_shuffle_examples = True
random_shuffle_examples_seed = 1

# ---- Explanation handling -------------------------------------------------
use_current_explanation = False
use_ground_explanation = False

# ---- LabelingAgent options ------------------------------------------------
cache = False
console_output = True

# Generation Stage

In [2]:
from autolabel.schema import ModelProvider, TaskType
from autolabel.models import register_model, MODEL_REGISTRY
from hf_pipeline_new import HFPipelineLLMNew
from few_shot_new import NewAutoLabelConfig, NewExampleSelectorFactory
from autolabel.few_shot import ExampleSelectorFactory
from template_inst import update_inst_mode
from named_entity_recognition_new import NewNamedEntityRecognitionTask
from classification_new import NewClassificationTask
from question_answering_new import NewQuestionAnsweringTask
from attribute_extraction_new import NewAttributeExtractionTask
from autolabel.tasks import TASK_TYPE_TO_IMPLEMENTATION 
from prompt_template import load_all_in_one_prompt
from dataset_new import process_labels
import sys

In [3]:
# Project-local hook called with the selected model name.
# NOTE(review): presumably switches the instruction-template mode per model — confirm in template_inst.
update_inst_mode(model_name)
# Replace autolabel's task implementations with the project-local subclasses,
# one entry per task type in the TASK_TYPE_TO_IMPLEMENTATION registry.
TASK_TYPE_TO_IMPLEMENTATION[TaskType.NAMED_ENTITY_RECOGNITION] = NewNamedEntityRecognitionTask
TASK_TYPE_TO_IMPLEMENTATION[TaskType.CLASSIFICATION] = NewClassificationTask
TASK_TYPE_TO_IMPLEMENTATION[TaskType.QUESTION_ANSWERING] = NewQuestionAnsweringTask
TASK_TYPE_TO_IMPLEMENTATION[TaskType.ATTRIBUTE_EXTRACTION] = NewAttributeExtractionTask
# Monkey-patch already-loaded autolabel modules: rebind the ExampleSelectorFactory
# name inside autolabel.labeler and override AutolabelDataset.process_labels.
# NOTE(review): the sys.modules[...] lookups raise KeyError unless these submodules
# were already imported — presumably the autolabel imports above do that; confirm.
sys.modules['autolabel.labeler'].ExampleSelectorFactory = NewExampleSelectorFactory
sys.modules['autolabel.dataset'].AutolabelDataset.process_labels = process_labels
# Register the project-local pipeline class for the HUGGINGFACE_PIPELINE provider.
register_model(ModelProvider.HUGGINGFACE_PIPELINE, HFPipelineLLMNew)

In [4]:
from autolabel import LabelingAgent, AutolabelDataset
import json
import os
import pandas as pd

In [5]:
from huggingface_hub import login

# Read the Hugging Face access token from disk. The value is reused later as
# model_params["token"] for the huggingface_pipeline provider.
with open(token_path) as tfile:
    # Strip surrounding whitespace: token files commonly end with a newline,
    # and a token carrying "\n" is not a valid credential / header value.
    token_str = tfile.read().strip()

# Authenticate this session against the Hugging Face Hub.
login(token=token_str)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/dzhang5/.cache/huggingface/token
Login successful


In [6]:
# Model-name fragments for which no automatic device map is requested.
no_auto = ["microsoft/prophetnet", "microsoft/phi-2", "google/pegasus-x"]
# None when model_name contains any listed fragment, "auto" otherwise.
device_map = None if any(fragment in model_name for fragment in no_auto) else "auto"

In [7]:
# Make sure the results directory exists. exist_ok=True replaces the separate
# existence check, so there is no window between "check" and "create" in which
# another process could create the directory and make makedirs fail.
os.makedirs(output_dir, exist_ok=True)

# The output file name encodes the run settings: model (last path component),
# few-shot selection strategy, explanation column, the two explanation flags,
# and the name of the input data file.
output_name = (
    f"{os.path.basename(model_name)}_{few_shot_selection}_COT_AE_{explanation_column}"
    f"_cur_{use_current_explanation}_ground_{use_ground_explanation}"
    f"_{os.path.basename(data_path)}"
)
output_path = os.path.join(output_dir, output_name)

In [8]:
# Choose the LLM provider, the embedding provider, the generation parameters
# and the task name from the configured model name.
refuel_models = ["refuel-llm", "llama-13b-chat"]
if model_name in refuel_models:
    # Models served through the "refuel" provider.
    provider = "refuel"
    em_provider = "huggingface_pipeline"
    model_params = {"max_length":4096, "temperature": temperature}
    task_name = f"FoodborneIllnessIncidentTweetNERAE_{few_shot_selection}_{model_name}"
elif model_name.startswith('gpt'):
    # OpenAI chat models; OpenAI is also used for the embeddings.
    provider = "openai"
    em_provider = "openai"
    model_params = {"max_tokens":4096, "temperature": temperature}
    task_name = f"FoodborneIllnessIncidentTweetNERAE_{few_shot_selection}_{model_name}"
else:
    # Anything else is loaded through the Hugging Face pipeline provider.
    provider = "huggingface_pipeline"
    em_provider = "huggingface_pipeline"
    model_params = {"max_length":4096, "temperature": temperature,
                    "quantize": 16, "device_map": device_map,
                    "token": token_str}
    # Use the LAST path component of the model id. The previous `[1]` raised
    # IndexError for ids without a "/" and picked the wrong segment for
    # multi-segment paths; `[-1]` gives the same result for "org/name" ids and
    # matches how the output file name derives the model component.
    task_name = f"FoodborneIllnessIncidentTweetNERAE_{few_shot_selection}_{model_name.split('/')[-1]}"

In [10]:
# Length of the label marker string (not referenced again in the cells shown here).
sym_len = len(label_symbol)
# Load the prompt pieces for the configured label/task versions; the call
# returns five values, unpacked in order below.
# NOTE(review): the meaning of the trailing positional `True` is not visible here — confirm in prompt_template.
label_prefix, label_suffix, label_description, task_guideline, output_guideline = load_all_in_one_prompt(label_symbol, label_version, task_version, True)

In [11]:
# Load the rows to be labeled from the CSV at data_path.
test_data = pd.read_csv(data_path)

In [12]:
# When current (rather than ground) explanations are used, read them from the
# "<explanation_column>_prediction" column instead.
# The endswith guard keeps this cell idempotent: re-running it must not append
# the suffix a second time to the already-updated name.
if use_current_explanation and not use_ground_explanation:
    if not explanation_column.endswith("_prediction"):
        explanation_column = explanation_column + "_prediction"

In [13]:
# One attribute spec per entry of label_description: the attribute is named
# "<label>_answer" and described as "<label> entities <description>".
attribute_description = [
    {
        'name': entity_label + '_answer',
        'description': f'{entity_label} entities {entity_text}',
    }
    for entity_label, entity_text in label_description.items()
]

In [15]:
# Extra sentence-level attribute with a fixed Yes/No option set.
sentence_class_attribute = {'name':'sentence_class', 'description': 'If the text describes a potential foodborne illness event','options':['Yes', 'No']}
# Append only if it is not already present, so re-running this cell does not
# add a duplicate 'sentence_class' attribute to the list.
if all(attribute['name'] != 'sentence_class' for attribute in attribute_description):
    attribute_description.append(sentence_class_attribute)

In [16]:
# Dataset section: column names used by the task plus the label descriptions.
dataset_section = {
    "text_column": text_column,
    "explanation_column": explanation_column,
    "example_selection_label_column": example_selection_label_column,
    "delimiter": ",",
    "label_description": label_description,
}

# LLM and embedding back-ends selected earlier from model_name.
model_section = {"provider": provider, "name": model_name, "params": model_params}
embedding_section = {"provider": em_provider}

# Few-shot example layout. The doubled braces leave literal "{<column>}"
# placeholders in the template; "{output_dict}" is kept verbatim as well.
example_template = (
    f"Input: {{{text_column}}}\n"
    "Output: Let's think step by step.\n"
    f"{{{explanation_column}}}\n"
    "{output_dict}"
)

# Prompt section: guidelines, attributes to extract and few-shot settings.
prompt_section = {
    "task_guidelines": task_guideline,
    "output_guidelines": output_guideline,
    "attributes": attribute_description,
    "example_selection_labels": ["yes", "no"],
    "few_shot_examples": few_shot_path,
    "few_shot_selection": few_shot_selection,
    "few_shot_num": few_shot_num,
    "use_current_explanation": use_current_explanation,
    "random_shuffle_examples": random_shuffle_examples,
    "random_shuffle_examples_seed": random_shuffle_examples_seed,
    "example_template": example_template,
    "chain_of_thought": True,
}

# Assemble the full configuration and wrap it in the project-local config class.
config = NewAutoLabelConfig({
    "task_name": task_name,
    "task_type": "attribute_extraction",
    "dataset": dataset_section,
    "model": model_section,
    "embedding": embedding_section,
    "prompt": prompt_section,
})

In [17]:
# Build the labeling agent from the config, with the console-output and cache
# switches taken from the parameters cell.
agent = LabelingAgent(config=config, console_output=console_output, cache=cache)

  warn_deprecated(


In [18]:
# Names of all attributes to extract; these are also the label columns
# selected from test_data.
label_columns = [attribute['name'] for attribute in attribute_description]

In [19]:
# Columns handed to autolabel: the text, optionally the explanation column
# (only when current explanations are used), the selection label, then the labels.
input_columns = [text_column]
if use_current_explanation:
    input_columns.append(explanation_column)
input_columns = input_columns + [example_selection_label_column] + label_columns

ds = AutolabelDataset(test_data[input_columns], config=config)

In [20]:
# Planning pass over the dataset before the real run.
# NOTE(review): presumably a dry run that previews prompts/cost without calling the
# chat model — the captured log below shows only embedding requests; confirm in autolabel docs.
agent.plan(ds)

  warn_deprecated(
2024-04-04 22:59:53 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:00 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:05 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Output()

2024-04-04 23:00:08 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:09 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:09 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:09 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:10 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:10 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:10 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:11 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:11 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:00:11 httpx INFO: HTTP Request: 

In [21]:
# now, do the actual labeling
# `ds` is rebound to the value returned by agent.run, so re-running this cell
# passes the already-labeled dataset back in; rebuild `ds` first for a clean run.
ds = agent.run(ds)

Output()

2024-04-04 23:01:04 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:01:09 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-04 23:01:09 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:01:12 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-04 23:01:13 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:01:18 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-04 23:01:18 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:01:20 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-04 23:01:20 httpx INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-04-04 23:01:24 ht

In [22]:
# Evaluate the labeled dataset; the result is stored in `metrics` and is not displayed by this cell.
metrics = ds.eval()

In [23]:
# Persist the labeled frame twice: as CSV at output_path and as a pickle next to it.
ds.df.to_csv(output_path, index=False)
# Derive the pickle path by swapping only the final extension. The previous
# str.replace('.csv', '.pkl') returned the same path when the name had no
# ".csv" (so the pickle overwrote the CSV just written) and would also rewrite
# any ".csv" occurring earlier in the path, e.g. in a directory name.
pickle_path = os.path.splitext(output_path)[0] + '.pkl'
ds.df.to_pickle(pickle_path)