In [1]:
import os
# Point Hugging Face Hub requests at a mirror endpoint. This is set before
# `datasets` is imported further down -- presumably so the library picks the
# value up when it is first loaded (confirm against the huggingface_hub docs).
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# TFNS: Twitter Financial News Sentiment dataset

In [None]:
import warnings
warnings.filterwarnings("ignore")
import json

from datasets import load_dataset

# Integer class id -> sentiment word used as the answer text.
# NOTE(review): presumably mirrors the dataset's label ids
# (0 = bearish, 1 = bullish, 2 = neutral) -- confirm against the dataset card.
dic = {0: "negative", 1: "positive", 2: "neutral"}

def format_example(example: dict) -> dict:
    """Turn one instruction record into a prompt/answer pair.

    Args:
        example: Mapping with an ``instruction`` and an ``output`` entry and
            an optional ``input`` entry (anything supporting ``[]`` and
            ``.get`` works).

    Returns:
        ``{"context": prompt, "target": answer}`` where the prompt is the
        instruction line, an ``Input:`` line when the input is truthy, and a
        trailing ``"Answer: "`` cue; the answer is the record's ``output``.
    """
    segments = [f"Instruction: {example['instruction']}\n"]

    # An absent or empty input simply drops the "Input:" line.
    user_input = example.get("input")
    if user_input:
        segments.append(f"Input: {user_input}\n")

    segments.append("Answer: ")
    return {"context": "".join(segments), "target": example["output"]}

def change_target(x):
    """Collapse a free-form answer into one of the three sentiment labels.

    The answer is searched for ``positive``/``Positive`` first, then
    ``negative``/``Negative``; the first label found is returned, and
    ``'neutral'`` is returned when neither occurs.
    """
    # Order matters: 'positive' wins when both words are present.
    for label in ('positive', 'negative'):
        if label in x or label.capitalize() in x:
            return label
    return 'neutral'

def get_data_tfns(batch_size = 8, prompt_fun = None, target='train'):
    """Load one split of the TFNS dataset and format it as prompts.

    Args:
        batch_size: Used only to report how many batches the split would
            need; the data itself is not batched here.
        prompt_fun: Optional callable applied to every row (``axis=1``) to
            build a per-row instruction. It sees the row before the columns
            are renamed, with ``label`` already mapped to its sentiment word.
            When ``None`` a fixed instruction is used for all rows.
        target: Name of the dataset split to load.

    Returns:
        A pandas DataFrame with the columns ``input``, ``output``,
        ``instruction``, ``context`` and ``target``.
    """
    dataset = load_dataset('zeroshot/twitter-financial-news-sentiment')
    dataset = dataset[target].to_pandas()
    # Replace the integer class ids with their sentiment words.
    dataset['label'] = dataset['label'].apply(lambda x: dic[x])

    if prompt_fun is None:
        dataset["instruction"] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'
    else:
        dataset["instruction"] = dataset.apply(prompt_fun, axis = 1)

    # Positional rename: assumes the split holds exactly two columns (the
    # tweet text, then the label) ahead of the instruction column added
    # above -- TODO confirm against the dataset schema.
    dataset.columns = ['input', 'output', 'instruction']
    dataset[["context", "target"]] = dataset.apply(format_example, axis = 1, result_type="expand")

    # Show the first prompt so the format can be checked by eye.
    print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

    context = dataset['context'].tolist()

    # Ceiling division: `n // batch_size + 1` reported one step too many
    # whenever the row count was an exact multiple of the batch size.
    total_steps = (len(context) + batch_size - 1) // batch_size
    print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}")

    return dataset

def dataset_to_json(dataset, output_file):
    """Save a pandas DataFrame as a JSON list of instruction records.

    Every row becomes ``{"instruction": ..., "input": ..., "output": ...}``;
    any other columns are left out of the file.

    Args:
        dataset: DataFrame containing at least the ``instruction``,
            ``input`` and ``output`` columns.
        output_file: Destination path. Missing parent directories are
            created.

    Raises:
        KeyError: If one of the three required columns is missing.
    """
    # Build the records. Selecting the columns and using to_dict keeps the
    # key order and also yields [] for an empty frame, where a row-wise
    # apply(...).tolist() fails because apply returns a DataFrame.
    json_data = dataset[["instruction", "input", "output"]].to_dict(orient="records")

    # open() does not create directories, so make sure the parent exists.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write the records to disk, keeping non-ASCII text readable.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=4)

    print(f"JSON 文件已保存至 {output_file}")

# Export each TFNS split as an instruction-tuning JSON file, plus a small CSV
# preview of the first rows.
targets = ['train', 'validation']
data_class = 'tfns'

for target in targets:
    output_file = f'data/finance_sentiment/{data_class}_{target}.json'
    demo_file = f'./lab/sentiment/tfns_demo_{target}.csv'

    # Make sure both output directories exist before anything is written.
    for path in (output_file, demo_file):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    # Pass the split explicitly: calling get_data_tfns() with no arguments
    # always loaded 'train', so the validation file held training data.
    tfns_dataset = get_data_tfns(target=target)
    tfns_dataset.head().to_csv(demo_file)
    dataset_to_json(tfns_dataset, output_file)