In [None]:
!pip install -q langchain_community langchain_groq langchain langchain-deepseek 

In [14]:
import ast
import json
import os, sys
from datetime import datetime

import pandas as pd 
from dotenv import load_dotenv
# from google.colab import files
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_deepseek import ChatDeepSeek
from langchain_groq import ChatGroq

In [3]:
# Load environment variables via python-dotenv before reading the API key,
# so the key never has to be written into the notebook itself.
load_dotenv()
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY')

# Shared chat client used by every labeling call in this notebook.
llm = ChatDeepSeek(
    api_key=DEEPSEEK_API_KEY,
    model="deepseek-reasoner",  # alternative model: "deepseek-chat"
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [10]:
def llm_labling(headline, description, ticker):
    """Ask the module-level `llm` to rate one news item on ten risk categories.

    The prompt instructs the model to take the perspective of the ticker's
    shareholders and to answer with an integer from -10 to +10 per category
    as a JSON string.

    Args:
        headline: News headline text.
        description: News description text.
        ticker: Stock ticker the news should be assessed against.

    Returns:
        The raw model reply as a string (the output of `StrOutputParser`).
        It is not parsed or validated here.
    """

    def _as_text(value):
        # pandas reads empty CSV cells as NaN; without this guard they would
        # be rendered into the prompt as the literal text "nan".
        return "" if pd.isna(value) else str(value)

    prompt = PromptTemplate.from_template(
        """
        You are a professional analyst working in financial risk management. 
        You will be given a stock ticker per a news headline and the news description. 
        Analyze the news headline and news description solely based on the stock ticker given
        and take given stock ticker shareholders perspective to analyse. 
        Use the below ten risk categories to rate each risk from -10 to +10.
        
        -10 means Severe negative impact
        0 means No material impact
        10 means Significant positive impact
        
        Return an integer rating on each risk in json string format.
        Focus on material relevance toward the stock ticker shareholders.
        Do not add any descriptions or reasoning wordings in your answer.
        
        Risk Categories
        1)	Strategic Risk includes Industry, Competition, M&A, Executive Decisions, Business Development, Market Entry.
        2)	Operational Risk includes Process Failure or Improvement, Human Error, Accident, Delay, Vendor or Supply Chain, System Breakdown.
        3)	Financial Risk includes Equity Valuation, Leverage, Cash Flow, Capital Structure, Audit or Tax, Liquidity Stress.
        4)	Market Risk includes Sales, P&L, Interest Rate, Currency, Commodity, Volatility.
        5)	Technology Risk includes Innovation, R&D, Cybersecurity, Outage, Data Breach, Legacy Systems.
        6)	Credit Risk includes Credit Downgrade, Concentration, Bankruptcy, Counterparty, Default Probability, Collateral Risk.
        7)	Legal Risk includes Litigation, Regulatory Breach, Compliance, AML, KYC, Contract Risk.
        8)	Political Risk includes Geopolitical, Trade Sanctions, Instability, Country Risk, Expropriation, Policy Change.
        9)	Reputational Risk includes Marketing, Branding, Media, Public Relationship Crisis, Key Personnel Changes, Social Media Risk.
        10)	ESG Risk includes Climate Risk, Pollution Control, Labor Rights, Product Safety, Ethical Conduct, Board Oversight.
        
        \n\n{ticker}
        \n\n{headline}
        \n\n{description} 

        """
    )
    # prompt -> chat model -> plain string
    labeling_chain = (
        prompt
        |llm
        |StrOutputParser()
    )

    result = labeling_chain.invoke({
        'headline': _as_text(headline),
        'description': _as_text(description),
        'ticker': _as_text(ticker),
    })

    return result

In [5]:
# Directory layout: source CSVs are read from `sc_root`,
# labeled copies are written to `out_root`.
sc_root = './data/'
out_root = './data/DSRONlabeledV2'

# Create the output directory up front; exist_ok keeps the cell safe to re-run.
os.makedirs(out_root, exist_ok=True)

In [6]:
# Collect the names of the CSV files found directly under `sc_root`.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep positional lookups into CSV_list stable between runs and machines.
CSV_list = []
for file in sorted(os.listdir(sc_root)):
    if not file.endswith('.csv'):
        continue

    CSV_list.append(file)

In [7]:
# Preview one source CSV. The file is taken from CSV_list instead of the `file`
# variable left over from the listing loop: that variable holds the last
# directory entry, which is not guaranteed to be a CSV.
df = pd.read_csv(os.path.join(sc_root, CSV_list[-1]))
df.head()

Unnamed: 0,Datetime,headline,description,article,link
0,2025-06-26 23:44:00,"If Taiwan Semiconductor Is Near A Ceiling, Her...",Is Taiwan Semiconductor ready to take a breath...,Is Taiwan Semiconductor ready to take a breath...,https://finance.yahoo.com/m/0199bd52-694d-3088...
1,2025-06-26 23:05:00,"Want $1 Million in Retirement? Invest $100,000...","These companies are the best at what they do, ...",Nvidia figures to be a primary beneficiary of ...,https://finance.yahoo.com/news/want-1-million-...
2,2025-06-26 19:31:00,Taiwan Semiconductor Manufacturing Company,Taiwan Semiconductor Manufacturing Company (TS...,Credit - Courtesy Taiwan Semiconductor Manufac...,https://finance.yahoo.com/news/taiwan-semicond...
3,2025-06-26 17:00:00,"Trending tickers: Nvidia, Shell, Micron, Bumbl...",The latest investor updates on stocks that are...,Shares in chipmaker Nvidia (NVDA) rose more th...,https://finance.yahoo.com/news/nvidia-shell-mi...
4,2025-06-26 17:45:00,TSMC Just Fired a $10 Billion Warning Shot at ...,The chip giant's boldest forex move yet signal...,TSMC (NYSE:TSM) just announced a bold $10 bill...,https://finance.yahoo.com/news/tsmc-just-fired...


In [8]:
# Pick one article (row `news_id` of the `ticker_id`-th CSV) as a smoke-test input.
news_id = 4
ticker_id = 5

sample_file = CSV_list[ticker_id]
# The ticker is whatever follows "_news_" in the file name. splitext strips only
# the final extension, so a dotted ticker is kept whole, whereas split('.')[0]
# would cut it at the first dot.
ticker = os.path.splitext(sample_file)[0].split('_news_')[-1]
df = pd.read_csv(os.path.join(sc_root, sample_file))
headline = df['headline'].iloc[news_id]
description = df['description'].iloc[news_id]
ticker, headline, description

('COIN',
 'Equity Markets Close Higher as Trump Weighs Early Fed Chair Change',
 'US benchmark equity indexes closed higher on Thursday, following media reports that President Donald')

In [11]:
# Smoke test: label the single sample article selected above and display the raw model reply.
response = llm_labling(headline, description, ticker)
response

'```json\n{\n  "Strategic Risk": 0,\n  "Operational Risk": 0,\n  "Financial Risk": 0,\n  "Market Risk": 1,\n  "Technology Risk": 0,\n  "Credit Risk": 0,\n  "Legal Risk": 0,\n  "Political Risk": 0,\n  "Reputational Risk": 0,\n  "ESG Risk": 0\n}\n```'

In [41]:
# Same call as the previous cell, on the same inputs.
# NOTE(review): the saved output below uses category names that are not in the
# current prompt (e.g. "Regulatory & Compliance Risk"), so it presumably comes
# from an earlier prompt version — re-run to confirm.
response = llm_labling(headline, description, ticker)
response

'```json\n{\n  "Market Risk": 0,\n  "Financial Risk": 0,\n  "Credit Risk": 0,\n  "Regulatory & Compliance Risk": 0,\n  "Political Risk": -2,\n  "Technology Risk": 0,\n  "Environmental Risk": 0,\n  "Operational Risk": -5,\n  "Reputational Risk": -8\n}\n```'

In [43]:
# Same call again, on the same inputs.
# NOTE(review): the saved output below is nested per sub-category instead of one
# integer per risk, so it presumably also predates the current prompt — re-run to confirm.
response = llm_labling(headline, description, ticker)
response

'```json\n{\n  "Market Risk": {\n    "Interest Rate": -2,\n    "Currency": 0,\n    "Commodity": 0,\n    "Volatility": -1,\n    "Liquidity": 0\n  },\n  "Financial Risk": {\n    "Equity Valuation": 0,\n    "Leverage": 0,\n    "Cash Flow": 0,\n    "Capital Structure": 0\n  },\n  "Credit Risk": {\n    "Credit Downgrade": 0,\n    "Concentration": 0,\n    "Bankruptcy": 0,\n    "Counterparty": 0\n  },\n  "Regulatory & Compliance Risk": {\n    "Legal": 0,\n    "AML": 0,\n    "Regulatory Breach": 0,\n    "KYC": 0\n  },\n  "Political Risk": {\n    "Geopolitical": 0,\n    "Trade Sanctions": 0,\n    "Instability": 0\n  },\n  "Technology Risk": {\n    "Cybersecurity": 0,\n    "Outage": 0,\n    "Data Breach": 0\n  },\n  "Environmental Risk": {\n    "Climate": 0,\n    "Carbon Regulation": 0,\n    "Natural Disasters": 0\n  },\n  "Operational Risk": {\n    "Process Failure": 0,\n    "Human Error": 0,\n    "Vendor": 0\n  },\n  "Strategic Risk": {\n    "Industry": 0,\n    "Competition": 0,\n    "M&A": 0,

In [16]:
# NOTE(review): `tot_time` is not assigned in any cell visible in this notebook, so this
# cell depends on leftover kernel state and will raise NameError after Restart & Run All.
# Define it in a visible cell or delete this cell.
tot_time.total_seconds()

-10.00163

In [None]:
# Label every CSV in `sc_root` and write a copy with an added `responses`
# column to `out_root`. Files are processed in sorted order so runs are
# reproducible (os.listdir order is arbitrary).
# NOTE(review): a file's responses are only written once the whole file has been
# labeled, so an interruption mid-file discards the work done on that file.
for file in sorted(os.listdir(sc_root)):
    if not file.endswith('.csv'):
        continue

    out_path = os.path.join(out_root, file)
    # An existing output file marks this input as done, which lets a re-run
    # resume with the first file that has not been written yet.
    if os.path.isfile(out_path):
        print(f"{file} - skipped")
        print("\n----------------------------------------------------------\n")
        continue

    df = pd.read_csv(os.path.join(sc_root, file))

    analysis = []
    print_str = ' '
    print(f"{file} - total news: {len(df)}")
    # splitext strips only the final extension, so a dotted ticker is kept
    # whole, whereas split('.')[0] would cut it at the first dot.
    ticker = os.path.splitext(file)[0].split('news_')[-1]
    st_time = datetime.now()
    for i, (headline, description) in enumerate(zip(df['headline'], df['description'])):
        # Blank out the previous progress line, then draw the new one in place.
        sys.stdout.write('\r' + ' ' * len(print_str))
        print_str = f'\r{i} processing: {headline}'
        sys.stdout.write(print_str)
        sys.stdout.flush()

        response = llm_labling(headline, description, ticker)

        analysis.append(response)
    end_time = datetime.now()
    # Kept on a single line: a string literal split across lines does not compile.
    print(f"\ntotal time taken {(end_time - st_time).total_seconds() / 60} min")
    df['responses'] = analysis
    df.to_csv(out_path)

    print("\n----------------------------------------------------------\n")

yf_news_AAPL.csv - skipped

----------------------------------------------------------

yf_news_AMZN.csv - total news: 198
total time taken 136.94926070000002 minto enhance its India infrastructure                                                                                                   

----------------------------------------------------------

yf_news_BA.csv - total news: 200
total time taken 132.40044963333332 minomers, Innovation and Partnership at Paris Air Show 2025                                                                                       

----------------------------------------------------------

yf_news_BABA.csv - total news: 199
85 processing: Market Chatter: Apple, Alibaba AI Rollout in China Delayed by Regulators Amid Rising US-China Tensions                                                 

In [None]:
# Reload one labeled file from the output directory to inspect its `responses` column.
# NOTE(review): the labeling loop writes with to_csv's default encoding (UTF-8 in pandas),
# while this reads as latin1, which would garble any non-ASCII text — confirm the
# file's actual encoding.
df = pd.read_csv("./data/DSRONlabeledV2/yf_news_AAPL.csv", index_col=0, encoding='latin1')

In [26]:
# Take one stored model reply as a parsing example and display it.
# NOTE(review): assuming an integer index, [0] selects by index label (the index
# loaded via index_col=0), not by position — use .iloc[0] if position is meant.
json_str = df['responses'][0]
json_str

'Market Risk, "Financial Risk, "Credit Risk, "Regulatory & Compliance Risk, "Political Risk, "Technology Risk, "Environmental Risk, "Operational Risk, "Strategic Risk, "Reputational'

In [None]:
# Prototype of the reply parser: keep only the outermost {...} object, which drops
# the Markdown code fence and the "json" tag around it, then parse it as JSON.
# json.loads replaces eval so model-generated text is never executed as Python.
# The wrapper is cut by position so a "json" substring inside a key or value survives.
json_str = json_str[json_str.find("{"):json_str.rfind("}") + 1]
risk_dict = json.loads(json_str)
risk_dict

In [1]:
def jsonstr2dict(json_str):
    """Parse a model reply into a dict of risk ratings.

    The reply may be wrapped in a Markdown code fence (```json ... ```) or
    surrounded by other text; only the span from the first "{" to the last
    "}" is parsed.

    The text is parsed with `json.loads` rather than `eval`, so model output
    is never executed as code. A Python-literal dict (e.g. single-quoted keys)
    is still accepted through `ast.literal_eval`, which only evaluates literals.

    Args:
        json_str: Raw reply text containing one JSON object.

    Returns:
        The parsed object (a dict for a JSON object).

    Raises:
        TypeError: If `json_str` is not a string (e.g. NaN from an empty CSV cell).
        ValueError: If no "{...}" span is found or the span is not a valid literal.
        SyntaxError: If the fallback literal parser cannot parse the span.
    """
    if not isinstance(json_str, str):
        raise TypeError(f"expected a string reply, got {type(json_str).__name__}")

    start = json_str.find("{")
    end = json_str.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in reply")

    payload = json_str[start:end + 1]
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        # Fallback for replies written as Python dict literals instead of strict JSON.
        return ast.literal_eval(payload)

def dict2ans(rank_dict):
    """Render a rating dict as text, one "<risk>: <rating>" line per entry.

    Entries appear in the dict's own order and every line, including the
    last, ends with a newline. An empty dict yields an empty string.
    """
    return "".join(f"{risk}: {rank}\n" for risk, rank in rank_dict.items())

def jsonstr2ans(json_str):
    """Convert a raw model reply straight into its "<risk>: <rating>" text form."""
    return dict2ans(jsonstr2dict(json_str))

In [2]:
# Convert every stored model reply into its "<risk>: <rating>" text form.
# NOTE(review): the saved NameError below shows this cell was run in a kernel where
# `df` had not been loaded; run the cell that reads the labeled CSV first.
risk_answers = df['responses'].apply(jsonstr2ans)

NameError: name 'df' is not defined