In [19]:
import os
import json
from datasets import load_dataset
from together import Together
from dotenv import load_dotenv

In [20]:
# Chat model name sent with every client.chat.completions.create call.
MODEL = "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
# Prompt template files read from disk by generate_splits and filter_split.
BASE_PROMPT_GENERATE_FILEPATH = "prompts/generate_split_sentences.txt"
# NOTE(review): the doubled ".txt.txt" extension looks like a typo — confirm the file name on disk.
BASE_PROMPT_FILTER_FILEPATH = "prompts/filter_split_sentences.txt.txt"

# Dataset identifier passed to load_dataset, and the JSONL file that build_dataset writes.
INPUT_DATASET = "vohuutridung/3190-data"
OUTPUT_FILE = "output/atoss_sft_dataset.jsonl"

# Load variables from a .env file before the API client is created.
load_dotenv()
# Explicit-key alternative, kept for reference:
# client = Together(api_key=os.getenv("TOGETHER_API_KEY"))
# No key is passed here; presumably the client falls back to the TOGETHER_API_KEY
# environment variable — verify against the Together SDK.
client = Together()

In [21]:
def generate_splits(sentence, aspects):
    """Ask the chat model for candidate splits of ``sentence``.

    The generation prompt template is read from BASE_PROMPT_GENERATE_FILEPATH,
    the sentence and its aspect terms are appended, and the result is sent as
    a single user message at temperature 1.0.

    Args:
        sentence: Original sentence, interpolated into the prompt.
        aspects: Aspect terms for the sentence, interpolated into the prompt.

    Returns:
        The value decoded from the model's JSON reply, or an empty list when
        the reply cannot be parsed as JSON (the raw reply is printed).
    """
    with open(BASE_PROMPT_GENERATE_FILEPATH, "r", encoding="utf-8") as prompt_file:
        instructions = prompt_file.read()

    # Assembled piecewise; the resulting text is identical to a single
    # triple-quoted template with a leading and trailing newline.
    user_message = {
        "role": "user",
        "content": (
            f"\n{instructions}\n"
            "\nORIGINAL SENTENCE:\n"
            f"{sentence}\n"
            "\nASPECT TERMS:\n"
            f"{aspects}\n"
        ),
    }

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[user_message],
        temperature=1.0,
    )
    raw_reply = completion.choices[0].message["content"]

    try:
        return json.loads(raw_reply)
    except Exception:
        # Best effort: report the unparsable reply and let the caller skip this sentence.
        print("Output is not json")
        print(raw_reply)
        return []

In [22]:
def filter_split(sentence, aspects, candidates, K=2):
    """Send candidate splits back to the model and keep the first K it returns.

    The filter prompt template is read from BASE_PROMPT_FILTER_FILEPATH, then
    the sentence, aspect terms and JSON-encoded candidates are appended. The
    request is sent as a single user message at temperature 0.0.

    Args:
        sentence: Original sentence, interpolated into the prompt.
        aspects: Aspect terms for the sentence, interpolated into the prompt.
        candidates: JSON-serialisable candidate splits to be filtered.
        K: Maximum number of entries to return from the model's reply.

    Returns:
        The first K items of the JSON list returned by the model, or an empty
        list when the reply is not valid JSON or is not a JSON list (the raw
        reply is printed in both cases).
    """
    with open(BASE_PROMPT_FILTER_FILEPATH, "r", encoding="utf-8") as f:
        prompt_base = f.read()

    prompt = f"""
{prompt_base}

ORIGINAL SENTENCE:
{sentence}

ASPECT TERMS:
{aspects}

CANDIDATES SPLITS:
{json.dumps(candidates, indent=2)}
"""

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0.0
    )

    output = response.choices[0].message["content"]

    try:
        filtered = json.loads(output)
    except Exception:
        print("Output is not json")
        print(output)
        return []

    # Slicing is only meaningful for a list: a JSON object would raise on
    # ``[:K]`` and a JSON string would be cut down to its first K characters.
    if not isinstance(filtered, list):
        print("Output is not a json list")
        print(output)
        return []

    return filtered[:K]

In [23]:
def build_dataset():
    """Build the SFT dataset and write it to OUTPUT_FILE as JSON Lines.

    For every row of the "train" split of INPUT_DATASET, the row's "text" and
    "labels" fields are passed to generate_splits, and the resulting candidates
    to filter_split. Each selected split is written as one JSON object with the
    keys "instruction" (the original sentence) and "output" (the split).

    Rows for which either step returns nothing are reported and skipped.
    OUTPUT_FILE is opened in write mode, so any existing content is replaced.
    """
    ds = load_dataset(INPUT_DATASET, split="train")
    len_ds = len(ds)

    # Create the output directory up front so open() does not fail on a fresh checkout.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The context manager closes (and flushes) the file even if an API call raises mid-run.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as fout:
        for count_process, row in enumerate(ds, start=1):
            sentence = row["text"]
            aspects = row["labels"]

            # Step 1: generate candidate splits
            candidates = generate_splits(sentence, aspects)
            if not candidates:
                print(f"This {sentence} cant be processed in generate_splits function")
                best = []
            else:
                # Step 2: select the K best splits
                best = filter_split(sentence, aspects, candidates)
                if not best:
                    print(f"This {sentence} cant be processed in filter_split function")

            # Step 3: export (writes nothing for a skipped row)
            for s_output in best:
                record = {
                    "instruction": sentence,
                    "output": s_output,
                }
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")

            # Reported for every row, including skipped ones, so no milestone is missed.
            if count_process % 100 == 0:
                print(f"Processed {count_process}/{len_ds} sentences")

    print("Process successfully")


In [24]:
# Run the whole pipeline: this calls the Together API for each dataset row and rewrites OUTPUT_FILE.
# It needs a valid Together API key; the recorded run below stopped with a 401 invalid-key error.
build_dataset()

AuthenticationError: Error code: 401 - {"message": "Invalid API key provided. You can find your API key at https://api.together.ai/settings/api-keys.", "type_": "invalid_request_error", "code": "invalid_api_key"}