In [1]:
import os
import textwrap
from IPython.display import display
from IPython.display import Markdown
from datetime import datetime
import numpy_financial as npf
import math
import pandas as pd
import numpy as np


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

HF_TOKEN = "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"
base_model_id = "meta-llama/Meta-Llama-3-8B"
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
)


# Re-init the tokenizer so it doesn't add padding or eos token
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)

model = AutoModelForCausalLM.from_pretrained(base_model_id)#, quantization_config=bnb_config)




  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [01:35<00:00, 23.84s/it]


In [3]:
from transformers import set_seed

set_seed(627)
def lm_gen(eval_prompt):
    model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")
    with torch.inference_mode():
        
        
        output = model.generate(**model_input, pad_token_id=eval_tokenizer.eos_token_id, max_new_tokens=256, do_sample=True)
        content = eval_tokenizer.decode(output[0], skip_special_tokens=True)
        # print(eval_tokenizer.decode(output[0], skip_special_tokens=True))
    result_string  = content.replace(eval_prompt, "")
    return result_string

In [4]:
# eval_prompt = "請依據深入的公司創新分析及其對預期市場影響的評估，表明股價持續上漲，反映了投資者對公司令人振奮的長期增長前景的樂觀情緒。給這篇文章打一個分數，分數介於-100～100之間，-100是負向，100是最好，0則是無法判斷'第12款符合條款第四條第XX款：12事實發生日：112/11/231.召開法人說明會之日期：112/11/232.召開法人說明會之時間：14 時 00 分3.召開法人說明會之地點：統一證券集團大樓15F會議室(台北市松山區東興路8號15F)4.法人說明會擇要訊息：本公司受邀參加統一綜合證券舉辦之法人說明會，會中說明本公司之財務及營運概況。5.其他應敘明事項：無完整財務業務資訊請至公開資訊觀測站之法人說明會一覽表或法說會項目下查閱。"
eval_prompt = """請對於以下資訊回答 #非常好 #好 #無關 #不好 #非常不好'
### Target sentence:
第12款符合條款第四條第XX款：12事實發生日：112/11/231.召開法人說明會之日期：112/11/232.召開法人說明會之時間：14 時 00 分3.召開法人說明會之地點：統一證券集團大樓15F會議室(台北市松山區東興路8號15F)4.法人說明會擇要訊息：本公司受邀參加統一綜合證券舉辦之法人說明會，會中說明本公司之財務及營運概況。5.其他應敘明事項：無完整財務業務資訊請至公開資訊觀測站之法人說明會一覽表或法說會項目下查閱。"

### Response:
"""

In [5]:
system = "Please answer the following information from TSMC #very good #good #irrelevant #bad #very bad"
content = "TSMC has been advancing in chip technology and recently announced a series of new technologies, including the 1.6nm 'A16' process, which has the opportunity to be used in future mobile phone and computer chips."

messages = [
    {
        "role": "system",
        "content": system,
    },
    {"role": "user", "content": content},
 ]
tokenized_chat  = eval_tokenizer.apply_chat_template(messages, tokenize=False)

print(tokenized_chat)



No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



<|im_start|>system
Please answer the following information from TSMC #very good #good #irrelevant #bad #very bad<|im_end|>
<|im_start|>user
TSMC has been advancing in chip technology and recently announced a series of new technologies, including the 1.6nm 'A16' process, which has the opportunity to be used in future mobile phone and computer chips.<|im_end|>



In [6]:

print(lm_gen(tokenized_chat))



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True))

# ft_model = PeftModel.from_pretrained(base_model, "mistral-stockLM-finetune/checkpoint-1000")

In [None]:
def generate_prompt(t):
    full_prompt =f"""請給這篇文章打一個分數，分數介於-100～100之間，-100是負向，100是最好，0則是無法判斷

    ### Target news:
    {t}

    ### Evaluate:
    """
    return full_prompt

In [None]:

start_time = datetime(2023, 1, 1)
end_time = datetime(2023, 12, 31)

# stock_ids = ["1101", "2211", "2385", "2542", "2880", "2912", "3023", "3264", "5269", "8027"]
# stock_ids = ["2912", "3023", "3264", "5269", "8027"]
stock_ids = ["2912"]

# 變異
irrs = []
bnhs = []
raw_data = []
for stock_id in stock_ids:
    print(f'執行{stock_id}')
    folder_path = "stock_news/" + stock_id + "/" + stock_id + "_news/"
    files = os.listdir(folder_path)
    sorted_files = sorted(files)

    # 載入買進價格
    df = pd.read_csv("price_history/" + stock_id + ".csv", dtype=str)
    price_np = df.to_numpy()
    prices = price_np.tolist()  # [時間,價格]

    signal_2 = []

    for price in prices:
        # 檢查日期是否在參考日期之後
        try:
            date_obj = datetime.strptime(price[0], '%Y%m%d')
        except:
            continue

        if not (start_time <= date_obj <= end_time):
            continue
        
        sig = 0
        for index, file_name in enumerate(sorted_files):
            if file_name == price[0]:
                with open(folder_path + file_name, encoding='utf-8') as f:
                    news = f.read()
                    # lm的回應，回傳帶有買入訊號
                    sig = lm_gen(generate_prompt(news))
                break

        signal_2.append([price[0], sig, price[1]])
        print(sig, "sd")


#     signal_2_np = np.array(signal_2)
#     # 回傳 IRR 以及總額投入
#     irr, bnh = calculate_irr(signal_2_np, 0)
#     irrs.append(irr)
#     bnhs.append(bnh)
#     raw_data.append(signal_2_np.tolist())
#     # print("每月內部報酬率", irr)
#     # print("總額投入報酬率", bnh)

# # 輸出結果
# write_folder = "evaluate_output/output"
# count = 1
# while True:
#     if os.path.exists(write_folder + str(count)):
#         count += 1
#         continue
#     else:
#         with open(write_folder + str(count), "a", encoding="UTF-8") as f:
#             content = (f'提示：{p_mutate_2}\n mutate from {p_mutate}\n'
#                         f'開始日期 {start_time.strftime("%Y%m%d")},'
#                         f'結束日期 {end_time.strftime("%Y%m%d")}\n')
#             for j in range(len(stock_ids)):
#                 content += (f'股票代號：{stock_ids[j]}\t'
#                             f'每月內部報酬率:{irrs[j]}\t'
#                             f'總額投入報酬率:{bnhs[j]}\n')
#             content += f'變異{i + 1}次，平均內部報酬率:{sum(irrs) / len(irrs)}，平均總額投入報酬率:{sum(bnhs) / len(bnhs)}\n)'
#             content += f'原始數據{raw_data}'
#             f.write(content)
#         print("寫入" + write_folder + str(count))
#         break
