In [2]:
import os
import transformers
import torch
from datasets import Dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer
from dotenv import load_dotenv
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


## **Prepare dataset**

In [5]:
def jsonstr2dict(json_str):
    json_str = json_str.replace("```", "")
    json_str = json_str.replace("\n", "")
    json_str = json_str.replace("json", "")

    return eval(json_str)

def dict2ans(rank_dict):
    item_list = list(rank_dict.items())
    ans_str = ""
    for risk, rank in item_list:
        ans_str += f"{risk}: {rank}\n"

    return ans_str

def jsonstr2ans(json_str):
    ans_dict = jsonstr2dict(json_str)
    return dict2ans(ans_dict)

In [29]:
csv_folder = './data/DSRONlabeledV2/'
df_train = pd.DataFrame()
df_test = pd.DataFrame()
for file in os.listdir(csv_folder):
    df = pd.read_csv(os.path.join(csv_folder, file), index_col=0, encoding='unicode_escape')
    answers = df['responses'].apply(jsonstr2ans)

    ticker = file.split('.')[0].split('_news_')[-1]
    df['ticker'] = ticker

    X_train, X_test, y_train, y_test = train_test_split(df[['ticker', 'headline', 'description']], answers, test_size=0.2, random_state=42)

    df_tmp_train = pd.DataFrame(X_train)
    df_tmp_train['answer'] = y_train

    df_tmp_test = pd.DataFrame(X_test)
    df_tmp_test['answer'] = y_test

    df_train = pd.concat([df_train, df_tmp_train])
    df_test = pd.concat([df_test, df_tmp_test])

In [30]:
df_train.to_csv('./rank_train.csv', index=False)
df_test.to_csv('./rank_test.csv', index=False)

In [31]:
df_train.head(3)

Unnamed: 0,ticker,headline,description,answer
137,AAPL,Alibabas (BABA) New Qwen3 AI Models Now Compa...,Alibaba Group Holding Limited (NYSE:BABA) is o...,Strategic Risk: 3\nOperational Risk: 0\nFinanc...
5,AAPL,Apple updates the rules for its EU App Store b...,Apple on Thursday announced a series of update...,Strategic Risk: -3\nOperational Risk: -2\nFina...
66,AAPL,HSBC Just Flipped Bullish on Broadcom (AVGO)H...,Broadcom Inc. (NASDAQ:AVGO) is one of the 10 T...,Strategic Risk: 7\nOperational Risk: 0\nFinanc...


In [32]:
df_test.head(3)

Unnamed: 0,ticker,headline,description,answer
65,AAPL,Apples (AAPL) Next Big AI Move? Analyst Sees ...,Apple Inc. (NASDAQ:AAPL) is one of the 10 Tren...,Strategic Risk: 10\nOperational Risk: 0\nFinan...
114,AAPL,Nvidia: How the chipmaker evolved from a gamin...,Heres a look at Nvidias path to where it is ...,Strategic Risk: -7\nOperational Risk: 0\nFinan...
16,AAPL,Apple (AAPL) Staring at 500 Fine in EU over A...,Apple Inc. (NASDAQ:AAPL) is one of the 12 best...,Strategic Risk: -3\nOperational Risk: -1\nFina...


## **Model Loading and Influencing**

In [3]:
df_train = pd.read_csv('./rank_train.csv')
df_train.head(3)

Unnamed: 0,ticker,headline,description,answer
0,AAPL,Alibabas (BABA) New Qwen3 AI Models Now Compa...,Alibaba Group Holding Limited (NYSE:BABA) is o...,Strategic Risk: 3\nOperational Risk: 0\nFinanc...
1,AAPL,Apple updates the rules for its EU App Store b...,Apple on Thursday announced a series of update...,Strategic Risk: -3\nOperational Risk: -2\nFina...
2,AAPL,HSBC Just Flipped Bullish on Broadcom (AVGO)H...,Broadcom Inc. (NASDAQ:AVGO) is one of the 10 T...,Strategic Risk: 7\nOperational Risk: 0\nFinanc...


In [4]:
df_test = pd.read_csv('./rank_train.csv')
df_test.head(3)

Unnamed: 0,ticker,headline,description,answer
0,AAPL,Alibabas (BABA) New Qwen3 AI Models Now Compa...,Alibaba Group Holding Limited (NYSE:BABA) is o...,Strategic Risk: 3\nOperational Risk: 0\nFinanc...
1,AAPL,Apple updates the rules for its EU App Store b...,Apple on Thursday announced a series of update...,Strategic Risk: -3\nOperational Risk: -2\nFina...
2,AAPL,HSBC Just Flipped Bullish on Broadcom (AVGO)H...,Broadcom Inc. (NASDAQ:AVGO) is one of the 10 T...,Strategic Risk: 7\nOperational Risk: 0\nFinanc...


In [5]:
train_dataset = Dataset.from_pandas(df_train, preserve_index=False)
test_dataset = Dataset.from_pandas(df_test, preserve_index=False)
train_dataset, test_dataset

(Dataset({
     features: ['ticker', 'headline', 'description', 'answer'],
     num_rows: 2630
 }),
 Dataset({
     features: ['ticker', 'headline', 'description', 'answer'],
     num_rows: 2630
 }))

In [6]:
load_dotenv()
HF_TOKEN = os.environ["HF_TOKEN"]

In [7]:
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'], trust_remote_code=True, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map="auto",
                                             token=os.environ['HF_TOKEN'])
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", token=os.environ['HF_TOKEN'])

<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>
<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>
<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>
<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>
<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>
<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>
<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>
<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>
<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>
<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>
<transformers.integrations.tensor_parallel.ParallelInterface object at 0x0000019CC343BF10>

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.94s/it]


In [13]:
train_prompt_style = """
You are a professional analyst working in financial risk management. 
You will be given a stock ticker per a news headline and the news description. 
Analyze the news headline and news description solely based on the stock ticker given
and take given stock ticker shareholders perspective to analyse. 
Use the below ten risk categories to rate each risk from -10 to +10.

-10 means Severe negative impact
0 means No material impact
10 means Significant positive impact

Return an integer rating on each risk in json string format.
Focus on material relevance toward the stock ticker shareholders.
Do not add any descriptions or reasoning wordings in your answer.
Write the answer in between <answer></answer>

Risk Categories
1)	Strategic Risk includes Industry, Competition, M&A, Executive Decisions, Business Development, Market Entry.
2)	Operational Risk includes Process Failure or Improvement, Human Error, Accident, Delay, Vendor or Supply Chain, System Breakdown.
3)	Financial Risk includes Equity Valuation, Leverage, Cash Flow, Capital Structure, Audit or Tax, Liquidity Stress.
4)	Market Risk includes Sales, P&L, Interest Rate, Currency, Commodity, Volatility.
5)	Technology Risk includes Innovation, R&D, Cybersecurity, Outage, Data Breach, Legacy Systems.
6)	Credit Risk includes Credit Downgrade, Concentration, Bankruptcy, Counterparty, Default Probability, Collateral Risk.
7)	Legal Risk includes Litigation, Regulatory Breach, Compliance, AML, KYC, Contract Risk.
8)	Political Risk includes Geopolitical, Trade Sanctions, Instability, Country Risk, Expropriation, Policy Change.
9)	Reputational Risk includes Marketing, Branding, Media, Public Relationship Crisis, Key Personnel Changes, Social Media Risk.
10)	ESG Risk includes Climate Risk, Pollution Control, Labor Rights, Product Safety, Ethical Conduct, Board Oversight.
### Ticker:
{}

### Headline:
{}

### Description:
{}

### Response:
{}
"""

In [14]:
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    tickers = examples["ticker"]
    headlines = examples["headline"]
    descriptions = examples["description"]
    responses = examples["answer"]
    texts = []
    for ticker, headline, description, response in zip(tickers, headlines, descriptions, responses):
        
        # Append the EOS token to the response if it's not already there
        response = "<answer>\n\n" + response + "\n</answer>\n"
        response += tokenizer.eos_token
            
        text = train_prompt_style.format(ticker, headline, description, response)
        texts.append(text)
    return {"text": texts}

In [15]:
train_dataset = train_dataset.map(
    formatting_prompts_func,
    batched=True,
)
print(train_dataset["text"][0])

Map: 100%|███████████████████████████████████████████████████████████████| 2630/2630 [00:00<00:00, 31360.04 examples/s]


You are a professional analyst working in financial risk management. 
You will be given a stock ticker per a news headline and the news description. 
Analyze the news headline and news description solely based on the stock ticker given
and take given stock ticker shareholders perspective to analyse. 
Use the below ten risk categories to rate each risk from -10 to +10.

-10 means Severe negative impact
0 means No material impact
10 means Significant positive impact

Return an integer rating on each risk in json string format.
Focus on material relevance toward the stock ticker shareholders.
Do not add any descriptions or reasoning wordings in your answer.
Write the answer in between <answer></answer>

Risk Categories
1)	Strategic Risk includes Industry, Competition, M&A, Executive Decisions, Business Development, Market Entry.
2)	Operational Risk includes Process Failure or Improvement, Human Error, Accident, Delay, Vendor or Supply Chain, System Breakdown.
3)	Financial Risk includes E




In [32]:
test_dataset = test_dataset.map(
    formatting_prompts_func,
    batched=True,
)
print(test_dataset["text"][0])

Map: 100%|███████████████████████████████████████████████████████████████| 2630/2630 [00:00<00:00, 51789.80 examples/s]


You are a professional analyst working in financial risk management. 
You will be given a stock ticker per a news headline and the news description. 
Analyze the news headline and news description solely based on the stock ticker given
and take given stock ticker shareholders perspective to analyse. 
Use the below ten risk categories to rate each risk from -10 to +10.

-10 means Severe negative impact
0 means No material impact
10 means Significant positive impact

Return an integer rating on each risk in json string format.
Focus on material relevance toward the stock ticker shareholders.
Do not add any descriptions or reasoning wordings in your answer.
Write the answer in between <answer></answer>

Risk Categories
1)	Strategic Risk includes Industry, Competition, M&A, Executive Decisions, Business Development, Market Entry.
2)	Operational Risk includes Process Failure or Improvement, Human Error, Accident, Delay, Vendor or Supply Chain, System Breakdown.
3)	Financial Risk includes E




In [16]:
inference_prompt_style = """
You are a professional analyst working in financial risk management. 
You will be given a stock ticker per a news headline and the news description. 
Analyze the news headline and news description solely based on the stock ticker given
and take given stock ticker shareholders perspective to analyse. 
Use the below ten risk categories to rate each risk from -10 to +10.

-10 means Severe negative impact
0 means No material impact
10 means Significant positive impact

Return an integer rating on each risk in json string format.
Focus on material relevance toward the stock ticker shareholders.
Do not add any descriptions or reasoning wordings in your answer.
Write the answer in between <answer></answer>

Risk Categories
1)	Strategic Risk includes Industry, Competition, M&A, Executive Decisions, Business Development, Market Entry.
2)	Operational Risk includes Process Failure or Improvement, Human Error, Accident, Delay, Vendor or Supply Chain, System Breakdown.
3)	Financial Risk includes Equity Valuation, Leverage, Cash Flow, Capital Structure, Audit or Tax, Liquidity Stress.
4)	Market Risk includes Sales, P&L, Interest Rate, Currency, Commodity, Volatility.
5)	Technology Risk includes Innovation, R&D, Cybersecurity, Outage, Data Breach, Legacy Systems.
6)	Credit Risk includes Credit Downgrade, Concentration, Bankruptcy, Counterparty, Default Probability, Collateral Risk.
7)	Legal Risk includes Litigation, Regulatory Breach, Compliance, AML, KYC, Contract Risk.
8)	Political Risk includes Geopolitical, Trade Sanctions, Instability, Country Risk, Expropriation, Policy Change.
9)	Reputational Risk includes Marketing, Branding, Media, Public Relationship Crisis, Key Personnel Changes, Social Media Risk.
10)	ESG Risk includes Climate Risk, Pollution Control, Labor Rights, Product Safety, Ethical Conduct, Board Oversight.
### Ticker:
{}

### Headline:
{}

### Description:
{}

### Response:
<answer>
"""

In [18]:
inf_id = 120

ticker = df_train['ticker'].iloc[inf_id]
headline = df_train['headline'].iloc[inf_id]
description = df_train['description'].iloc[inf_id]

inference_prompt_style.format(ticker, headline, description)

'\nYou are a professional analyst working in financial risk management. \nYou will be given a stock ticker per a news headline and the news description. \nAnalyze the news headline and news description solely based on the stock ticker given\nand take given stock ticker shareholders perspective to analyse. \nUse the below ten risk categories to rate each risk from -10 to +10.\n\n-10 means Severe negative impact\n0 means No material impact\n10 means Significant positive impact\n\nReturn an integer rating on each risk in json string format.\nFocus on material relevance toward the stock ticker shareholders.\nDo not add any descriptions or reasoning wordings in your answer.\nWrite the answer in between <answer></answer>\n\nRisk Categories\n1)\tStrategic Risk includes Industry, Competition, M&A, Executive Decisions, Business Development, Market Entry.\n2)\tOperational Risk includes Process Failure or Improvement, Human Error, Accident, Delay, Vendor or Supply Chain, System Breakdown.\n3)\tFi

In [19]:
inputs = tokenizer(
    inference_prompt_style.format(ticker, headline, description),
    return_tensors="pt"
).to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=120,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id  # if tokenizer has no pad token
)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
result

'\nYou are a professional analyst working in financial risk management. \nYou will be given a stock ticker per a news headline and the news description. \nAnalyze the news headline and news description solely based on the stock ticker given\nand take given stock ticker shareholders perspective to analyse. \nUse the below ten risk categories to rate each risk from -10 to +10.\n\n-10 means Severe negative impact\n0 means No material impact\n10 means Significant positive impact\n\nReturn an integer rating on each risk in json string format.\nFocus on material relevance toward the stock ticker shareholders.\nDo not add any descriptions or reasoning wordings in your answer.\nWrite the answer in between <answer></answer>\n\nRisk Categories\n1)\tStrategic Risk includes Industry, Competition, M&A, Executive Decisions, Business Development, Market Entry.\n2)\tOperational Risk includes Process Failure or Improvement, Human Error, Accident, Delay, Vendor or Supply Chain, System Breakdown.\n3)\tFi

## **Training**

In [20]:
from trl import DataCollatorForCompletionOnlyLM

response_template = "<answer>"
data_collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template
)

In [21]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Training Arguments
training_arguments = TrainingArguments(
    output_dir="NewsRiskRanking",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=20,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=5e-5,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="none",
    label_names=["labels"],  # add this
)

In [22]:
from peft import LoraConfig, get_peft_model

# LoRA config
peft_config = LoraConfig(
    lora_alpha=16,                           # Scaling factor for LoRA
    lora_dropout=0.05,                       # Add slight dropout for regularization
    r=64,                                    # Rank of the LoRA update matrices
    bias="none",                             # No bias reparameterization
    task_type="CAUSAL_LM",                   # Task type: Causal Language Modeling
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Target modules for LoRA
)

model = get_peft_model(model, peft_config)

In [33]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    data_collator=data_collator,
)

Converting train dataset to ChatML: 100%|████████████████████████████████| 2630/2630 [00:00<00:00, 12244.19 examples/s]
Adding EOS to train dataset: 100%|████████████████████████████████████████| 2630/2630 [00:00<00:00, 7469.15 examples/s]
Tokenizing train dataset: 100%|████████████████████████████████████████████| 2630/2630 [00:03<00:00, 752.70 examples/s]
Truncating train dataset: 100%|█████████████████████████████████████████| 2630/2630 [00:00<00:00, 175989.46 examples/s]
Converting eval dataset to ChatML: 100%|█████████████████████████████████| 2630/2630 [00:00<00:00, 10121.54 examples/s]
Adding EOS to eval dataset: 100%|█████████████████████████████████████████| 2630/2630 [00:00<00:00, 9059.53 examples/s]
Tokenizing eval dataset: 100%|█████████████████████████████████████████████| 2630/2630 [00:03<00:00, 724.47 examples/s]
Truncating eval dataset: 100%|██████████████████████████████████████████| 2630/2630 [00:00<00:00, 189683.08 examples/s]


In [None]:
import gc, torch
gc.collect()
torch.cuda.empty_cache()
model.config.use_cache = False
trainer.train()

Step,Training Loss
1,0.7409
2,0.7251
3,0.5293
4,0.4966
5,0.5823
6,0.485
7,0.5849
8,0.4641
9,0.4743
10,0.4924
