In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix
from tqdm import tqdm

In [2]:
news_data=pd.read_csv('news_data.csv',encoding='utf-8',index_col=0)

In [3]:
news_data.head(3)

Unnamed: 0,뉴스 식별자,일자,키워드,특성추출(가중치순 상위 50개),본문
0,4104008.0,20100331,"경질,최중경,경제,참여,연대,MB,언급,최중경,실책,이명박,대통령,최중경,주필리핀,...","참여연대,이명박,최중경,강만수,한국은행,경제정책,고환율,내정자,김중수,경제수석,재임...",이명박 대통령이 30일 최중경 주필리핀 대사를 청와대 경제수석으로 내정한 것에 대해...
1,4104008.0,20100331,"1인,금융,부채,처분소득,대비,143%,생활고통지수,9년,최악,가계,금융기관,개인부...","1인,생활경제고통지수,금융부채,가처분소득,개인부채,울산,1754만,경북,국민총소득,...",금융기관에 대한 개인부채가 1인당 1754만 원으로 1인당 국민총소득의 80%를 넘...
2,4100958.0,20100331,"수출경기,경남,수출,경기,개선,전망,경남,지역,수출,기업,수출,경기,개선,전망,한국...","경남,무역협회,ebsi,호조세,회원사,원재료,수출산업경기전망,수출국,한국무역협회,1...",경남지역 수출기업들은 2분기 수출경기가 크게 개선될 것으로 전망했다.\n\n한국무역...


In [4]:
ex_rate=pd.read_csv('./data/원화의 대미달러, 원화의 대위안_대엔 환율_10093253.csv')[['날짜','원/달러(종가 15:30)']]

In [5]:
ex_rate.head(3)

Unnamed: 0,날짜,원/달러(종가 15:30)
0,2001-01-02,1276.4
1,2001-01-03,1270.1
2,2001-01-04,1255.0


In [6]:
news_data['날짜']=pd.to_datetime(news_data['일자'],format="%Y%m%d")
ex_rate['날짜']=pd.to_datetime(ex_rate['날짜'])

In [7]:
news_data['키워드']=news_data['키워드'].fillna('').apply(lambda x: x.replace(',',' '))
daily_news=news_data.groupby(['일자'])['키워드'].apply(lambda x: ' '.join(x)).reset_index()

In [8]:
ex_rate['원/달러(종가 15:30)']=ex_rate['원/달러(종가 15:30)'].str.replace(',','').astype(float)

In [9]:
tokenizer=AutoTokenizer.from_pretrained('snunlp/KR-FinBert-SC')
model=AutoModelForSequenceClassification.from_pretrained('snunlp/KR-FinBert-SC')

In [10]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(20000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
def predict_sentiment(text):
    if len(text)>300:
        text=text[:300]
    tokens=tokenizer(text,return_tensors='pt',truncation=True,padding=True)
    with torch.no_grad():
        outputs=model(**tokens)
        probs=torch.nn.functional.softmax(outputs.logits,dim=1)
        pred=torch.argmax(probs,dim=1).item()
    return pred

In [12]:
news_data['sentiment']=news_data['키워드'].apply(predict_sentiment)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [76]:
news_data.to_csv('newswise_sentiment.csv',index=False)

In [13]:
sentiment_daily=news_data.groupby(['날짜'])['sentiment'].mean().reset_index()

In [58]:
merged_data=pd.merge(sentiment_daily,ex_rate,on='날짜')
merged_data['변화율']=merged_data['원/달러(종가 15:30)'].pct_change().shift(-1)
merged_data['상승']=(merged_data['변화율']>0).astype(int)

  merged_data['변화율']=merged_data['원/달러(종가 15:30)'].pct_change().shift(-1)


In [59]:
x=merged_data[['sentiment']].values
y=merged_data['상승'].values

In [63]:
from sklearn.preprocessing import MinMaxScaler

mms=MinMaxScaler()
x=mms.fit_transform(x)

In [64]:
tr_x,tt_x,tr_y,tt_y=train_test_split(x,y,train_size=0.8,shuffle=False)
print(tr_x.shape,tt_x.shape,tr_y.shape,tt_y.shape)

(3064, 1) (766, 1) (3064,) (766,)


In [65]:
model=RandomForestClassifier(random_state=10)
model.fit(tr_x,tr_y)

In [66]:
y_pred=model.predict(tt_x)
print(accuracy_score(tt_y,y_pred))
print(roc_auc_score(tt_y,y_pred))
print(f1_score(tt_y,y_pred))
print(confusion_matrix(tt_y,y_pred))

0.4804177545691906
0.4818306010928962
0.47493403693931396
[[188 178]
 [220 180]]


In [69]:
y_pred,tt_y

(array([1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
        0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
        0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
        1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
        1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
        1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
        1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 

In [71]:
sentiment_daily.to_csv('sentimentdaily.csv',index=False)

In [72]:
merged_data.to_csv('merged_data.csv',index=False)

In [73]:
pd.read_csv('sentimentdaily.csv')

Unnamed: 0,날짜,sentiment
0,2010-01-01,0.983607
1,2010-01-02,1.000000
2,2010-01-03,1.072727
3,2010-01-04,1.070485
4,2010-01-05,0.918103
...,...,...
5655,2025-06-26,1.096296
5656,2025-06-27,1.032787
5657,2025-06-28,1.058824
5658,2025-06-29,0.920000


In [74]:
pd.read_csv('merged_data.csv')

Unnamed: 0,날짜,sentiment,원/달러(종가 15:30),변화율,상승
0,2010-01-04,1.070485,1154.8,-0.012383,0
1,2010-01-05,0.918103,1140.5,-0.003595,0
2,2010-01-06,1.003676,1136.4,-0.000880,0
3,2010-01-07,1.023810,1135.4,-0.004316,0
4,2010-01-08,0.992278,1130.5,-0.009465,0
...,...,...,...,...,...
3825,2025-06-24,1.052846,1360.2,0.001617,1
3826,2025-06-25,1.041176,1362.4,-0.004037,0
3827,2025-06-26,1.096296,1356.9,0.000368,1
3828,2025-06-27,1.032787,1357.4,-0.005452,0
