<a href="https://colab.research.google.com/github/yeonghun00/stock-notes/blob/main/useful/all_in_one_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
!pip install exchange_calendars



In [39]:
!pip install finance-datareader



In [40]:
import pandas as pd
import numpy as np
import requests
import datetime
import exchange_calendars as ecals # 개장일만
from io import StringIO
import matplotlib.pyplot as plt
import FinanceDataReader as fdr
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
import nltk
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

In [41]:
XKRX = ecals.get_calendar("XKRX") # Korea exchange calendar code

250일 등락률, 거래대금 90-99: 범인매매

60일 등락률 50-100, 거래대금 10-50 : 조용히 오르는 애들 (내꺼)



In [42]:
class StockList():
  """Screen KRX-listed stocks by return and traded-value quantiles, then score them.

  Construction downloads the market-wide table, filters it and builds the
  ranked result, so creating an instance performs network I/O.

  Attributes:
    period: look-back window in calendar days.
    increased: [low, high] quantile bounds applied to the '등락률' (change rate) column.
    traded: [low, high] quantile bounds applied to the '거래대금' (traded value) column.
    pre_period: if non-zero, the screen is run as of this many days ago and
      the outcome since then is measured ('mdd', 'Performance').
    price_dic: {stock code: price DataFrame} for the filtered stocks.
    start, today: 'YYYYMMDD' bounds of the window, snapped to trading sessions.
    df: the full downloaded table.
    filtered_df: up to 20 rows of ``df`` that pass both quantile filters.
    result_df: the ranked, rescaled table returned by ``get_result_df``.
  """

  _USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'

  # Tokens replaced by a space before headlines are split into words.
  # Applied in this order, one str.replace per token.
  _HEADLINE_NOISE = ('[', ']', '…', '·', '"', '“', ',', '...', "'", '‘', '’')

  # Words that are never useful as hashtags (used by get_hashtags).
  _STOP_WORDS = ('%', '에', '전망', '수혜', '추가', '주가', '주식', '특징주',
                 '코스닥', '코스피', '시총', '전년비', '1분기', '2분기', '3분기',
                 '4분기', '전년比', '↑', '지난해', '영업익', '대비', '증가', '영업이익',
                 '규모', '속', '주당', '매출', '/', '소', '전년', '1Q', '2Q', '3Q',
                 '4Q', '관련', '데이터로', '보는', '증시', '영향', '등')

  # Additional stop words used only by the Google News fallback (get_hashtags2).
  _EXTRA_STOP_WORDS = ('이번주', '주목', '하나증권', '목표주가', '리포트', '프리핑',
                       '목표가', '000원', '장중')

  def __init__(self, period=250, increased=(.9, .99), traded=(.9, .99), pre_period=0):
    """Download, filter and rank the market.

    Args:
      period: look-back window in calendar days.
      increased: (low, high) quantiles for the change-rate filter.
      traded: (low, high) quantiles for the traded-value filter.
      pre_period: start offset in days; 0 means "as of today".
    """
    self.period = period
    # Stored as fresh lists so instances never share (or alias) the defaults.
    self.increased = list(increased)
    self.traded = list(traded)
    self.pre_period = pre_period  # start offset

    self.price_dic = {}

    self.start, self.today = self.get_date()
    self.df = self.get_stock_df()
    self.filtered_df = self.get_filtered_df()
    self.result_df = self.get_result_df()

  def get_date(self):
    """Return (start, today) as 'YYYYMMDD' strings snapped to XKRX sessions.

    ``today`` is shifted back by ``pre_period`` days; ``start`` is ``period``
    days before the real current date. A non-session ``today`` moves to the
    previous open, a non-session ``start`` to the next open.
    """
    now = datetime.date.today()
    today = (now - datetime.timedelta(days=self.pre_period)).strftime('%Y%m%d')
    start = (now - datetime.timedelta(days=self.period)).strftime('%Y%m%d')

    if not XKRX.is_session(today):
      today = XKRX.previous_open(today).strftime('%Y%m%d')
    if not XKRX.is_session(start):
      start = XKRX.next_open(start).strftime('%Y%m%d')
    return start, today

  def get_stocks(self, market='STK'):
    """Download the KRX period table for one market as a DataFrame.

    Args:
      market: value sent as ``mktId`` ('STK' by default; ``get_stock_df``
        also requests 'KSQ').
    """
    data = {
      'mktId': market,
      'strtDd': self.start,
      'endDd': self.today,
      'money': '1',
      'adjStkPrc': '2',
      'adjStkPrc_check': 'Y',
      'share': '1',
      'csvxls_isNo': 'false',
      'name': 'fileDown',
      'url': 'dbms/MDC/STAT/standard/MDCSTAT01602'
    }
    # Two-step download: first obtain a one-time code, then exchange it for the CSV.
    gen_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'
    gen_key = requests.post(gen_url, data=data)

    down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'
    r = requests.post(down_url, data={'code': gen_key.text})
    r.encoding = 'EUC-KR'
    return pd.read_csv(StringIO(r.text))

  def get_stock_df(self):
    """Return the 'STK' and 'KSQ' tables stacked into one frame."""
    return pd.concat([self.get_stocks(), self.get_stocks('KSQ')]).reset_index(drop=True)

  def get_filtered_df(self):
    """Return up to 20 stocks inside both quantile bands, best change rate first.

    Both bands are open intervals: rows exactly on a quantile are excluded.
    """
    traded = self.df['거래대금']
    change = self.df['등락률']
    traded_mask = (traded > traded.quantile(self.traded[0])) & (traded < traded.quantile(self.traded[1]))
    increased_mask = (change > change.quantile(self.increased[0])) & (change < change.quantile(self.increased[1]))

    selected = np.intersect1d(self.df.loc[traded_mask, '종목명'].values,
                              self.df.loc[increased_mask, '종목명'].values)
    filtered = self.df[self.df['종목명'].isin(selected)].sort_values('등락률', ascending=False).head(20)
    # Copy so that get_result_df can add columns without writing into a slice of self.df.
    return filtered.copy()

  def get_sharpe(self, df):
    """Return mean / std of (1 + daily 'Change'); not annualised, no risk-free rate."""
    change = df['Change'] + 1
    return change.mean() / change.std()

  def get_sortino(self, df):
    """Like ``get_sharpe`` but the denominator uses losing days only (1 + Change < 1)."""
    change = df['Change'] + 1
    return change.mean() / (change[change < 1]).std()

  def get_position(self, df):
    """Return the last close as a fraction of the highest close in ``df``."""
    # .iloc: label-based [-1] on a date-indexed Series is a deprecated positional fallback.
    return df['Close'].iloc[-1] / df['Close'].max()

  def get_future_mdd(self, s):
    """Return the maximum drawdown of price series ``s`` as a fraction (0 if empty)."""
    if len(s) == 0:
      return 0
    peak = s.iloc[0]
    max_drawdown = 0
    for price in s:
      if price > peak:
        peak = price
      max_drawdown = max(max_drawdown, (peak - price) / peak)
    return max_drawdown

  def _performance(self, close):
    """Return last close / first close, or NaN when there is no data."""
    if len(close) == 0:
      return float('nan')
    return close.iloc[-1] / close.iloc[0]

  def _clean_headline(self, sentence):
    """Replace every token in ``_HEADLINE_NOISE`` with a space."""
    for token in self._HEADLINE_NOISE:
      sentence = sentence.replace(token, ' ')
    return sentence

  def _hashtags_from_words(self, words, stop_words):
    """Return the 2nd-11th most frequent non-stop words as '#word' joined by spaces.

    The single most frequent word is skipped (``[1:]``), as in the original
    logic — presumably because it is the company name itself.
    """
    kept = [word for word in words if word not in stop_words]
    keywords = nltk.FreqDist(kept).most_common(11)
    return ' '.join('#' + word for word, _ in keywords[1:])

  def get_hashtags(self, code):
    """Build hashtags for stock ``code`` from Naver Finance news headlines (pages 1-4)."""
    headlines = []
    for page in range(1, 5):
      url = 'https://finance.naver.com/item/news_news.naver?code=' + code + '&page=' + str(page) + '&sm=title_entity_id.basic&clusterId='
      result = requests.get(url, headers={'User-Agent': self._USER_AGENT})
      bs_obj = BeautifulSoup(result.content, "html.parser")
      tds = bs_obj.find_all('td', {'class': 'title'})
      headlines.extend(self._clean_headline(td.text.strip()) for td in tds)

    words = [word for sentence in headlines for word in sentence.split()]
    return self._hashtags_from_words(words, self._STOP_WORDS)

  # Fallback for when get_hashtags stops working.
  def get_hashtags2(self, code):
    """Build hashtags for stock ``code`` from the Google News RSS search feed."""
    name = self.df[self.df['종목코드'] == code]['종목명'].values[0]
    url = 'https://news.google.com/rss/search?q="' + name + '"&hl=ko&gl=KR&ceid=KR%3Ako'
    res = requests.get(url)
    res.encoding = 'utf-8'
    root = ET.fromstring(res.text)
    # An empty <title/> element has text None; treat it as an empty headline.
    titles = [self._clean_headline(tag.text or '') for tag in root.findall(".//title")]
    titles = [title for title in titles if name in title]
    # Keep only the part before the first '-' of each title.
    titles = [title.split('-')[0].strip() for title in titles]
    words = [word for sentence in titles for word in sentence.split()]
    return self._hashtags_from_words(words, self._STOP_WORDS + self._EXTRA_STOP_WORDS)

  def get_price_df(self, codes):
    """Return {code: price DataFrame} for ``codes`` between ``start`` and ``today``."""
    d = {}
    for i in codes:
      d[i] = fdr.DataReader(str(i), self.start, self.today)
    return d

  def get_result_df(self, hashtags=False):
    """Score the filtered stocks and return them ranked by Sharpe.

    Adds 'mdd', 'Performance', 'preperiod', 'Sharpe', 'Sortino', 'Position'
    and 'Hashtags' columns to ``self.filtered_df`` and refreshes
    ``self.price_dic``. Each score column is rescaled to 5-10 in the
    returned frame, and '합산' is the rescaled sum of the five scores.

    Args:
      hashtags: when true, fetch news hashtags per stock (slow; one batch of
        HTTP requests per stock). Otherwise the column is filled with 0.

    Returns:
      A frame indexed from 1 ('순위') with renamed display columns.
    """
    codes = list(self.filtered_df['종목코드'])
    self.price_dic = self.get_price_df(codes)

    if self.pre_period != 0:
      # Measure what happened after the (back-dated) screening day.
      future = (datetime.date.today() + datetime.timedelta(days=self.pre_period)).strftime('%Y%m%d')
      future_close = [fdr.DataReader(str(code), self.today, future)['Close'] for code in codes]
      self.filtered_df['mdd'] = [round(self.get_future_mdd(close), 4) for close in future_close]
      self.filtered_df['Performance'] = [round(self._performance(close), 4) for close in future_close]
    else:
      self.filtered_df['mdd'] = 0
      self.filtered_df['Performance'] = 0
    self.filtered_df['preperiod'] = self.pre_period

    # Iterate over `codes` (row order) so each value lands on its own row.
    self.filtered_df['Sharpe'] = [self.get_sharpe(self.price_dic[code]) for code in codes]
    self.filtered_df['Sortino'] = [self.get_sortino(self.price_dic[code]) for code in codes]
    self.filtered_df['Position'] = [self.get_position(self.price_dic[code]) for code in codes]

    if hashtags:
      self.filtered_df['Hashtags'] = [self.get_hashtags(code) for code in codes]
    else:
      self.filtered_df['Hashtags'] = 0

    t = self.filtered_df.sort_values('Sharpe', ascending=False)

    scaler = MinMaxScaler(feature_range=(5, 10))
    for column in ('Position', 'Sortino', 'Sharpe', '거래대금', '등락률'):
      t[column] = scaler.fit_transform(t[[column]]).round(1)
    t['합산'] = t['Position'] + t['Sortino'] + t['Sharpe'] + t['거래대금'] + t['등락률']
    t['합산'] = scaler.fit_transform(t[['합산']]).round(1)

    columns = ['종목명', '종료일 종가', '등락률', '거래대금', 'Sharpe', 'Sortino',
               'Position', '합산', 'Performance', 'Hashtags', 'preperiod', 'mdd']
    display_names = {'종료일 종가': '현재가', '등락률': '파워', '거래대금': '관심도',
                     'Sharpe': 'Risk1', 'Sortino': 'Risk2', 'Position': '모멘텀'}
    t = t[columns].reset_index(drop=True).rename(columns=display_names)
    t.index += 1
    t.index.name = '순위'

    return t

In [43]:
# k
# Screen over 365 days (originally 250) with the top-2% change and top-1% traded-value bands,
# then keep the three filtered stocks with the largest traded value.
stocklist = StockList(365, [.98, 1], [.99, 1])
t = stocklist.filtered_df.sort_values('거래대금', ascending=False).head(3)

In [44]:
# k trend

# Price history for the selected stocks, keyed by stock code.
d = stocklist.get_price_df(t['종목코드'].values)

merged_df = pd.DataFrame()
num_intervals = 15

for key, df in d.items():
    if df.empty:
        # No price rows for this code: nothing to sample.
        continue
    # Close relative to the first close of the window.
    increased_ratio = df['Close'] / df['Close'].iloc[0]
    # Step between sampled rows; at least 1 so that a history shorter than
    # num_intervals rows does not produce range(..., step=0).
    interval_size = max(1, (len(increased_ratio) - 1) // (num_intervals - 1))
    row_indices = list(range(0, len(increased_ratio), interval_size))
    # Always end on the most recent row.
    row_indices[-1] = len(increased_ratio) - 1
    selected_ratio = increased_ratio.iloc[row_indices]
    merged_df[key] = selected_ratio

merged_df

Unnamed: 0_level_0,086520,001570,022100
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-09-13,1.0,1.0,1.0
2022-10-07,0.957769,0.892655,0.850382
2022-11-02,1.072509,1.799435,0.885496
2022-11-25,1.037448,1.731638,0.974046
2022-12-20,0.918722,1.418079,0.970992
2023-01-13,0.943381,1.420904,0.978626
2023-02-09,1.567653,1.491525,1.025954
2023-03-07,2.395367,3.0,1.070229
2023-03-30,4.08935,4.463277,1.355725
2023-04-24,4.90558,3.59322,2.291603


In [46]:
t = d['086520']

In [47]:
t['Change']

Date
2022-09-13    0.006415
2022-09-14    0.070918
2022-09-15    0.082583
2022-09-16   -0.030928
2022-09-19   -0.068788
                ...   
2023-09-07   -0.041905
2023-09-08    0.014911
2023-09-11   -0.040157
2023-09-12   -0.051020
2023-09-13   -0.033333
Name: Change, Length: 251, dtype: float64

In [60]:
stocklist.start

'20230106'

In [62]:
stocklist.today

'20230913'

In [48]:
top5 = np.percentile(t['Change'], 95)
top5

0.1164071047791978

In [49]:
top_t = t[t['Change'] > top5]
top_t

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-02-07,147200,178100,147000,165900,3892423,0.134747
2023-02-16,220500,242000,212500,239000,2139755,0.119438
2023-02-27,238500,283000,238000,280000,2135095,0.157025
2023-03-08,285000,336000,284500,334000,2282624,0.143836
2023-03-13,308500,364000,308500,361500,2762413,0.171799
2023-03-15,405000,454500,388500,448000,3092531,0.207547
2023-03-29,431500,499000,423000,498500,2754691,0.149942
2023-04-10,591000,744000,586000,722000,3540854,0.246978
2023-04-27,599000,709000,592000,709000,2661112,0.183639
2023-06-07,570000,662000,569000,662000,1780225,0.169611


In [53]:
top_t.index.to_series().diff()

Date
2023-02-07       NaT
2023-02-16    9 days
2023-02-27   11 days
2023-03-08    9 days
2023-03-13    5 days
2023-03-15    2 days
2023-03-29   14 days
2023-04-10   12 days
2023-04-27   17 days
2023-06-07   41 days
2023-07-03   26 days
2023-07-18   15 days
2023-07-28   10 days
Name: Date, dtype: timedelta64[ns]

In [57]:
average_interval = top_t.index.to_series().diff().mean().days
print(average_interval)


14


In [None]:
# Top 10% Change average
# Average interval
# Average position
# Scoring

In [8]:
from google.colab import files

merged_df.to_csv('king_trend_df.csv', encoding="utf-8-sig")
files.download('king_trend_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
# Keep only the display columns and rank the rows from 1.
t = t.loc[:, ['종목명', '등락률', '거래대금']].reset_index(drop=True)
t.index = t.index + 1
t.index.name = '순위'
# Express each stock's traded value as a percentage of the group's total.
total_traded = t['거래대금'].sum()
t['거래대금'] = t['거래대금'].div(total_traded).mul(100)
t

Unnamed: 0_level_0,종목명,등락률,거래대금
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,에코프로,651.29,60.959269
2,금양,583.09,22.028834
3,포스코DX,718.04,17.011897


In [10]:
from google.colab import files

t.to_csv('king_df.csv', encoding="utf-8-sig")
files.download('king_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [58]:
# Ai
# Default screen, ranked with news hashtags; the back-test columns are not shown here.
stocklist = StockList()
ranked = stocklist.get_result_df(hashtags=True)
t = ranked.head(15).drop(columns=['preperiod', 'Performance', 'mdd'])
t

Unnamed: 0_level_0,종목명,현재가,파워,관심도,Risk1,Risk2,모멘텀,합산,Hashtags
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,칩스앤미디어,37450,5.0,5.1,10.0,7.2,9.0,7.6,#자사주 #16억 #결정 #AI #취득 #신탁계약 #감소 #확대 #IP #45.7%↑
2,하나마이크론,27450,6.0,7.3,10.0,10.0,9.3,10.0,#금융지원 #설비 #한국씨티은행 #반도체 #급등 #임대 #계약 #해외투자 #소식에 ...
3,가온칩스,42950,6.8,6.5,9.8,7.6,9.0,8.9,#ARM #삼성전자 #인수 #반도체 #가능성에 #관련주 #상승 #강세 #ASIC #설계
4,ISC,83800,5.1,6.1,9.6,6.7,8.5,7.4,#SKC #인수 #반도체 #테스트 #솔루션 #소켓 #기업 #아이에스시(ISC) #아...
5,큐렉소,19830,6.1,6.7,9.1,7.8,8.8,8.4,#의료로봇 #인공관절 #로봇 #판매 #수술로봇 #체결 #뉴로메카 #계약 #美 #수술
6,실리콘투,7830,7.4,6.0,8.8,8.3,9.1,8.8,#유럽 #소식에 #단기차입금 #글로벌이 #주목하는 #K-뷰티 #수익성 #안정성↑-한...
7,유진로봇,13500,8.0,5.7,8.4,9.5,9.0,9.2,#시장 #물류로봇 #진출 #고카트 #유럽 #유진로봇과 #자율주행 #LGU+ #체결 ...
8,레이크머티리얼즈,14930,7.4,10.0,7.9,6.2,7.2,8.5,#생산 #공시 #83억 #전고체 #핵심 #시설투자 #기관 #외국인 #7.7%↑ #케일럼
9,인벤티지랩,26550,6.3,5.0,7.5,6.8,10.0,7.3,#호주 #특허 #치료제 #LNP #마약 #등록 #주사제 #알코올 #임상1상 #장기지속형
10,삼부토건,3700,5.8,9.4,7.3,7.4,8.2,8.3,#재건 #우크라이나 #우크라 #디와이디 #민간임대주택 #강세 #재건사업 #참여 #신...


In [12]:
t.to_csv('ai_df.csv', encoding="utf-8-sig")
files.download('ai_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
# leading
# Updated every 2 / 4 weeks.
# Skip the three top-ranked names, then take the three best remaining by '합산'.
exclude_list = t.head(3)['종목명'].values
leading_df = t[~t['종목명'].isin(exclude_list)]
top_names = leading_df.sort_values('합산', ascending=False).head(3)['종목명'].values
data = {
    '날짜': ['2022'] * 3,
    '종목명': top_names,
    # Price targets are placeholders to be filled in by hand.
    '매수가': [0] * 3,
    '목표가': [0] * 3,
    '손절가': [0] * 3,
    '목표수익률': [0] * 3,
}
leading_df = pd.DataFrame(data)
leading_df

Unnamed: 0,날짜,종목명,매수가,목표가,손절가,목표수익률
0,2022,유진로봇,0,0,0,0
1,2022,실리콘투,0,0,0,0
2,2022,큐렉소,0,0,0,0


In [14]:
leading_df.to_csv('leading_df.csv', encoding="utf-8-sig")
files.download('leading_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
# performance
# Run the default screen as of 20, 60 and 120 days ago and collect how the picks did since.
all_df = []
for p in (20, 60, 120):
  stocklist = StockList(period=250, increased=[.9, .99], traded=[.9, .99], pre_period=p)
  ranked = stocklist.get_result_df(hashtags=False).head(15)
  outcome = ranked[['종목명', '현재가', 'mdd', 'Performance', 'preperiod']]
  all_df.append(outcome.reset_index(drop=True))
t = pd.concat(all_df, ignore_index=True)
t.index = t.index + 1
t.index.name = '순위'
t

Unnamed: 0_level_0,종목명,현재가,mdd,Performance,preperiod
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,이오테크닉스,167800,0.1514,0.9154,20
2,주성엔지니어링,27150,0.0866,0.9926,20
3,HPSP,32550,0.125,0.9785,20
4,가온칩스,43850,0.1078,0.9818,20
5,ISC,102200,0.18,0.82,20
6,큐렉소,24400,0.1803,0.8217,20
7,실리콘투,8300,0.1349,0.9349,20
8,삼아알미늄,103900,0.0896,0.9192,20
9,레이크머티리얼즈,16460,0.123,0.9052,20
10,삼부토건,3330,0.128,1.0931,20


In [16]:
t.to_csv('performance_df.csv', encoding="utf-8-sig")
files.download('performance_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
# sentiments

!pip install newsapi-python
!pip install vaderSentiment

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [18]:
from newsapi import NewsApiClient
import datetime
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from wordcloud import WordCloud
import re

import os

# NOTE(security): this key is committed in plain text. Rotate it and supply the
# replacement through the NEWSAPI_KEY environment variable; the literal is kept
# only as a fallback so the notebook keeps running unchanged.
api_key = os.environ.get('NEWSAPI_KEY', '6147ad35d10843b2949edc41cd955155')
api = NewsApiClient(api_key=api_key)

In [19]:
# business general

def _mean_title_sentiment(articles):
    """Return mean VADER (neg, pos, compound) of article titles, each as mean * 100 + 50.

    Args:
        articles: list of article dicts as found under the 'articles' key of a
            NewsAPI response; only the 'title' field is read.

    Returns:
        A 3-tuple (neg, pos, compound). With no scorable titles the neutral
        baseline (50.0, 50.0, 50.0) is returned instead of raising.
    """
    # Skip entries whose title is missing or not text; the analyzer needs a string.
    titles = [article.get('title') for article in articles]
    titles = [title for title in titles if isinstance(title, str)]
    if not titles:
        return 50.0, 50.0, 50.0

    analyzer = SentimentIntensityAnalyzer()
    scores = pd.DataFrame([analyzer.polarity_scores(title) for title in titles])
    return tuple(scores[column].mean() * 100 + 50 for column in ('neg', 'pos', 'compound'))


def get_headline_sentiment(category='business'):
    """Score the titles of up to 100 current US top headlines in ``category``.

    Returns:
        (neg, pos, compound) as described in ``_mean_title_sentiment``.
    """
    response = api.get_top_headlines(country='us', category=category, page_size=100)
    return _mean_title_sentiment(response['articles'])


def get_keyword_sentiment(keyword='nasdaq'):
    """Score the titles of up to 100 English articles matching ``keyword`` from the last 3 days.

    Returns:
        (neg, pos, compound) as described in ``_mean_title_sentiment``.
    """
    now = datetime.datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    start_date = (now - datetime.timedelta(days=3)).strftime('%Y-%m-%d')

    response = api.get_everything(q=keyword, language='en', page_size=100,
                                  from_param=start_date, to=end_date)
    return _mean_title_sentiment(response['articles'])


In [20]:
business_sentiment = get_headline_sentiment()
general_sentiment = get_headline_sentiment('general')
nasdaq_sentiment = get_keyword_sentiment('Nasdaq')
snp500_sentiment = get_keyword_sentiment('S&P 500')
dowjones_sentiment = get_keyword_sentiment('Dow Jones')

# Each source is a (neg, pos, compound) tuple; average position by position
# across the five sources and round to whole numbers.
sentiment_sources = (business_sentiment, general_sentiment, nasdaq_sentiment,
                     snp500_sentiment, dowjones_sentiment)
neg, pos, compound = (
    int(round(sum(source[position] for source in sentiment_sources) / len(sentiment_sources)))
    for position in range(3)
)

In [21]:
data = {'positive': [34, 42, 34, 33, 11, 77, pos],
        'negative': [12, 35, 12, 55, 11, 13, neg],
        'compound': [22, 10, 6, 10, 23, 84, compound]}

df = pd.DataFrame(data, index=['2022/11/11', '2022/11/12', '2022/11/13', '2022/11/14', '2022/11/15', '2022/11/16', '2022/11/17'])
df.index.name = 'date'
df

Unnamed: 0_level_0,positive,negative,compound
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022/11/11,34,12,22
2022/11/12,42,35,10
2022/11/13,34,12,6
2022/11/14,33,55,10
2022/11/15,11,11,23
2022/11/16,77,13,84
2022/11/17,58,56,54


In [22]:
df.to_csv('sentiment_df.csv', encoding="utf-8-sig")
files.download('sentiment_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
# word cloud
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [24]:
from multiprocessing import Pool
from konlpy.tag import Hannanum
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import html

class News:
  """Scraper for the Naver Finance news ranking list and its article pages."""

  def __init__(self):
    # Browser-like User-Agent sent with every request.
    self.headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

  def get_ranking(self, page=1, date='1'):
    """Return the article links found on ranking pages 1..``page``.

    Args:
      page: number of ranking pages to read.
      date: 'YYYYMMDD' string; the default '1' means today.

    Returns:
      A list of href strings (HTML entities unescaped). Pages without a
      'hotNewsList' block contribute nothing.
    """
    if date == '1':
      date = datetime.datetime.now().date().strftime("%Y%m%d")
    links = []
    for page_no in range(1, page + 1):
      url = 'https://finance.naver.com/news/news_list.nhn?mode=RANK&date=' + date + '&page=' + str(page_no)
      result = requests.get(url, headers = self.headers)
      bs_obj = BeautifulSoup(result.content, "html.parser")

      hot_news = bs_obj.find('div', {'class':'hotNewsList'})
      if hot_news is None:
        # e.g. a page past the end of the ranking; skip instead of raising IndexError.
        continue
      links += [html.unescape(a['href']) for a in hot_news.find_all('a', href=True)]
    return links

  @staticmethod
  def _trim_body(text, subheading):
    """Cut ``text`` at the first '@' (if any) and remove ``subheading`` from it.

    The '@' is presumably the start of the reporter's e-mail footer. When
    there is no '@' the whole text is kept.
    """
    return text.partition('@')[0].replace(subheading, '')

  def get_article(self, url):
    """Fetch one article and return (title, subheading, text).

    Args:
      url: link as returned by ``get_ranking``, appended to the site root.

    Raises:
      IndexError: if the page lacks the 'article_info' or 'articleCont' blocks.
    """
    url = 'https://finance.naver.com/' + url
    result = requests.get(url, headers = self.headers)
    bs_obj = BeautifulSoup(result.content, "html.parser")

    title = ' '.join(bs_obj.find_all('div', {'class': 'article_info'})[0].find_all('h3')[0].get_text().split())
    context = bs_obj.find_all('div', {'class': 'articleCont'})[0]
    # The first <strong> inside the body, when present, is used as the subheading.
    strong = context.find('strong')
    subheading = strong.get_text() if strong is not None else ''
    body = ' '.join(context.get_text().split())
    return title, subheading, self._trim_body(body, subheading)

# Collect links from the first 5 ranking pages and fetch the articles in parallel.
news = News()
news_list = news.get_ranking(5)
with Pool() as p:
    li = p.map(news.get_article, news_list)

# Each item is a (title, subheading, text) tuple. Building the frame directly
# also works when no article was found (empty frame with the three columns).
df = pd.DataFrame(li, columns=['Title', 'Subheading', 'Text'])
df

Unnamed: 0,Title,Subheading,Text
0,"""손주 학원비 그까이꺼""…'황혼 육아'에 지갑 여는 노인들 [조미현의 Fin코노미]",,60대 이상 노인층의 육아 관련 소비가 크게 늘어난 것으로 나타났습니다. 직장을 다...
1,"""부동산 투자 조심"" 경고도 안 먹혔다…영끌족 '우르르' [강진규의 데이터너머]",50년 주담대 빚잔치에 가계대출 급증,한 시중은행의 대출창구. 사진과 기사는 무관합니다. / 사진=연합뉴스은행권 가계대출...
2,나이키가 어쩌다가 이 지경…감당 안되는 재고에 시총 반토막,,中실적 부진에 재고 4년 평균치 웃돌아2년새 시총 46% 증발해 377조→197조 ...
3,“제발 상장폐지 해주세요”… 삼목에스폼 주주들은 왜 성수동 회장님 집으로 몰려갔나,,알루미늄 거푸집 제조사 삼목에스폼의 소액주주들이 회사 측에 자진 상장폐지를 요구하고...
4,대출 상환능력 문턱 높인다…스트레스 DSR 적용,"50년 주담대, DSR 산정시 40년 적용상환능력 고려해 50년 적용도 가능변동금리...","50년 주담대, DSR 산정시 40년 적용상환능력 고려해 50년 적용도 가능변동금리..."
...,...,...,...
120,"현대차, 노사 잠정 합의…주가 최대 악재 소멸 - 다올",,현대자동차 노사 교섭대표들이 지난 6월 13일 현대차 울산공장 본관 동행룸에서 '2...
121,국민연금 주식투자 늘린다는데…증시 호재될까,"전문가, 매년 11.9조 자금 유입 전망",[서울=뉴시스] 최진석 = 김용하 재정계산위원회 위원장이 지난 1일 서울 강남구 코...
122,테마 인버스 ETF 출시 본격화…“투자자 선택 확대 vs 하락 부추겨”,"KBSTAR 2차전지TOP10인버스, 상장 첫날 2%↑NH아문디운용, 반도체 인버스...",ⓒ게티이미지뱅크[데일리안 = 노성인 기자] 2차전지 등 업종·테마주의 주가 하락에 ...
123,"""연금리 7%·월 납입한도 300만원""…수협 '멸치적금' 대박 났다",,/사진=뉴스1상호금융조합인 수협에서 특판으로 선보인 이른바 '멸치 적금'이 판매 시...


In [25]:
# 위에 고장 시 사용
import re

# url = 'https://news.google.com/rss/topics/CAAqIggKIhxDQkFTRHdvSkwyMHZNR2RtY0hNekVnSnJieWdBUAE?hl=ko&gl=KR&ceid=KR%3Ako' 경제

url = 'https://news.google.com/rss/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNREpmTjNRU0FtdHZLQUFQAQ?hl=ko&gl=KR&ceid=KR%3Ako' # finance
res = requests.get(url)
res.encoding = 'utf-8'
content = res.text
root = ET.fromstring(content)
title_tags = root.findall(".//title")
# An empty <title/> element has text None; treat it as an empty headline.
titles = [title_tag.text or '' for title_tag in title_tags]

# Tokens replaced by a space, applied in order. The original replaced '↓' twice;
# the second occurrence is taken to be a typo for '↑'.
rss_title_noise = ('↓', '↑', '(', ')', '[', ']', '…', '·', '"', '“', ',', '...',
                   "'", '‘', '’', '”', '＂', '?', '‥', '!')
stripped_titles = []
for sentence in titles:
    for token in rss_title_noise:
        sentence = sentence.replace(token, ' ')
    stripped_titles.append(sentence)

# Keep only the part before the first '-' of each title.
titles = [item.split('-')[0].strip() for item in stripped_titles]

In [26]:
# Cut each title at the first '-', '/', '|' or 'ㅣ' separator.
title_separator = re.compile(r'\s*[-/|ㅣ]\s*')
outlet_markers = ('경제', '뉴스', '신문')
leading_parts = [title_separator.split(title)[0].strip() for title in titles]

# Skip the first entry, then drop every word that contains one of the outlet markers.
cleaned = []
for headline in leading_parts[1:]:
    kept_words = [word for word in headline.split()
                  if all(marker not in word for marker in outlet_markers)]
    cleaned.append(' '.join(kept_words))
# df = pd.DataFrame({'Title': cleaned})
# df

In [27]:
import random
import re

hannanum = Hannanum()

# Every character below is mapped to a single space by clean_title.
_TITLE_PUNCTUATION = str.maketrans({ch: ' ' for ch in ",()…?.'\"”“‘’·[]↓→↑"})


def clean_title(title):
    """Return ``title`` with punctuation, quotes, brackets and arrows replaced by spaces.

    Each listed character becomes exactly one space; nothing is collapsed or
    stripped, so the result has the same length as the input.
    """
    return title.translate(_TITLE_PUNCTUATION)

def rank_words(words_list):
  """Count the words in ``words_list`` and return {word: count}, most frequent first.

  Words with equal counts keep the order in which they first appeared.
  An empty input gives an empty dict.
  """
  from collections import Counter

  # most_common() sorts by count, descending, with a stable sort.
  return dict(Counter(words_list).most_common())

def standardize_values(dictionary):
    """Rescale the values of ``dictionary`` linearly onto 10..100, in place.

    The smallest value maps to 10 and the largest to 100; results are rounded
    to integers. When all values are equal they all map to 10 (the lower
    bound), and an empty dict is returned unchanged.

    Returns:
        The same dict object that was passed in.
    """
    if not dictionary:
        return dictionary

    max_value = max(dictionary.values())
    min_value = min(dictionary.values())
    value_range = max_value - min_value

    for key, value in dictionary.items():
        if value_range == 0:
            # No spread to scale by; avoid dividing by zero.
            dictionary[key] = 10
        else:
            dictionary[key] = round(10 + (value - min_value) * (90 / value_range))

    return dictionary

def custom_standardize_values(dictionary, new_min, new_max, noise_min, noise_max):
    """Return a new dict with values rescaled onto new_min..new_max plus random noise.

    Args:
        dictionary: {key: number}; not modified.
        new_min, new_max: target range for the smallest and largest value.
        noise_min, noise_max: bounds of the uniform noise added to each value.

    Returns:
        {key: float} rounded to 0 decimals. When all values are equal each
        one is placed at ``new_min`` before noise; an empty input gives ``{}``.
    """
    if not dictionary:
        return {}

    max_value = max(dictionary.values())
    min_value = min(dictionary.values())
    value_range = max_value - min_value

    standardized_dict = {}
    for key, value in dictionary.items():
        if value_range == 0:
            # No spread to scale by; avoid dividing by zero.
            standardized_value = new_min
        else:
            # Linear transformation to scale the values
            standardized_value = new_min + (value - min_value) * (new_max - new_min) / value_range

        # Add random noise
        noise = random.uniform(noise_min, noise_max)
        standardized_dict[key] = round(standardized_value + noise, 0)

    return standardized_dict

def extract_nouns(text, custom_nouns_list):
    """Count how often each entry of ``custom_nouns_list`` occurs as a whole word in ``text``.

    Args:
        text: string to scan; words are the matches of r'\\b\\w+\\b'.
        custom_nouns_list: iterable of words to look for.

    Returns:
        {word: count} for the listed words that occur at least once, in order
        of first appearance in ``text``.
    """
    # Build the lookup once; a list would be scanned in full for every word.
    wanted = set(custom_nouns_list)

    filtered_nouns = {}
    for word in re.findall(r'\b\w+\b', text):
        if word in wanted:
            filtered_nouns[word] = filtered_nouns.get(word, 0) + 1

    return filtered_nouns

In [28]:
df['Cleaned_Title'] = df['Title'].apply(clean_title)
df['Filtered_Pos'] = [hannanum.nouns(x) for x in df['Cleaned_Title']]
words = [word for sublist in df['Filtered_Pos'] for word in sublist]
word_frequency_dict = rank_words(words)
result = standardize_values(word_frequency_dict)

In [29]:
stop_words = ['의', '등', '때', '오늘', '이곳', '저', '월', '년', '일', '로', '어디', '데', '특징주', '장중', '전', '주', '위', '한', '수',' 속', '전', '추', '속', '내', '비트코']
# Keep a word only if it is neither a stop word nor made up purely of digits.
data_dict = {}
for key, value in result.items():
    if key in stop_words or str(key).isdigit():
        continue
    data_dict[key] = value

In [30]:
df

Unnamed: 0,Title,Subheading,Text,Cleaned_Title,Filtered_Pos
0,"""손주 학원비 그까이꺼""…'황혼 육아'에 지갑 여는 노인들 [조미현의 Fin코노미]",,60대 이상 노인층의 육아 관련 소비가 크게 늘어난 것으로 나타났습니다. 직장을 다...,손주 학원비 그까이꺼 황혼 육아 에 지갑 여는 노인들 조미현의 Fin코노미,"[손주, 학원비, 그까이꺼, 황혼, 육아, 지갑, 노인들, 조미현, Fin코노미]"
1,"""부동산 투자 조심"" 경고도 안 먹혔다…영끌족 '우르르' [강진규의 데이터너머]",50년 주담대 빚잔치에 가계대출 급증,한 시중은행의 대출창구. 사진과 기사는 무관합니다. / 사진=연합뉴스은행권 가계대출...,부동산 투자 조심 경고도 안 먹혔다 영끌족 우르르 강진규의 데이터너머,"[부동산, 투, 경고, 영끌족, 강진규, 데이터너머]"
2,나이키가 어쩌다가 이 지경…감당 안되는 재고에 시총 반토막,,中실적 부진에 재고 4년 평균치 웃돌아2년새 시총 46% 증발해 377조→197조 ...,나이키가 어쩌다가 이 지경 감당 안되는 재고에 시총 반토막,"[나이키, 지경, 감당, 재고, 시총, 반토막]"
3,“제발 상장폐지 해주세요”… 삼목에스폼 주주들은 왜 성수동 회장님 집으로 몰려갔나,,알루미늄 거푸집 제조사 삼목에스폼의 소액주주들이 회사 측에 자진 상장폐지를 요구하고...,제발 상장폐지 해주세요 삼목에스폼 주주들은 왜 성수동 회장님 집으로 몰려갔나,"[상장폐지, 삼목에스폼, 주주들, 성수동, 회장님, 집]"
4,대출 상환능력 문턱 높인다…스트레스 DSR 적용,"50년 주담대, DSR 산정시 40년 적용상환능력 고려해 50년 적용도 가능변동금리...","50년 주담대, DSR 산정시 40년 적용상환능력 고려해 50년 적용도 가능변동금리...",대출 상환능력 문턱 높인다 스트레스 DSR 적용,"[대출, 상환능력, 문턱, 스트레스, 적용]"
...,...,...,...,...,...
120,"현대차, 노사 잠정 합의…주가 최대 악재 소멸 - 다올",,현대자동차 노사 교섭대표들이 지난 6월 13일 현대차 울산공장 본관 동행룸에서 '2...,현대차 노사 잠정 합의 주가 최대 악재 소멸 - 다올,"[현대차, 노사, 잠정, 합, 주, 최대, 악재, 소멸, 다올]"
121,국민연금 주식투자 늘린다는데…증시 호재될까,"전문가, 매년 11.9조 자금 유입 전망",[서울=뉴시스] 최진석 = 김용하 재정계산위원회 위원장이 지난 1일 서울 강남구 코...,국민연금 주식투자 늘린다는데 증시 호재될까,"[국민연금, 주식투자, 증시, 호재]"
122,테마 인버스 ETF 출시 본격화…“투자자 선택 확대 vs 하락 부추겨”,"KBSTAR 2차전지TOP10인버스, 상장 첫날 2%↑NH아문디운용, 반도체 인버스...",ⓒ게티이미지뱅크[데일리안 = 노성인 기자] 2차전지 등 업종·테마주의 주가 하락에 ...,테마 인버스 ETF 출시 본격화 투자자 선택 확대 vs 하락 부추겨,"[테마, 인버스, 출시, 본격화, 투자자, 선택, 확대, 하락, 부추겨]"
123,"""연금리 7%·월 납입한도 300만원""…수협 '멸치적금' 대박 났다",,/사진=뉴스1상호금융조합인 수협에서 특판으로 선보인 이른바 '멸치 적금'이 판매 시...,연금리 7% 월 납입한도 300만원 수협 멸치적금 대박 났다,"[연금리, 7, 월, 납입한, 300만원, 수협, 멸치적금, 대박]"


In [31]:
#설정 종목명

title_as_sentence = ' '.join(df['Cleaned_Title'].values)
custom_nouns = stocklist.df['종목명'].values.tolist() + ['비트코인', '이더리움']
filtered_nouns = extract_nouns(title_as_sentence, custom_nouns)
ticker_dict = custom_standardize_values(filtered_nouns, 40, 80, 5, 15)

In [32]:
data_dict.update(ticker_dict)

In [33]:
t = pd.DataFrame(data_dict, index=['value']).T.sort_values(by='value', ascending=False).head(50)

In [34]:
t

Unnamed: 0,value
주담대,100.0
에코프로,93.0
현대차,82.0
50년,75.0
DSR,74.0
삼성전자,72.0
금양,67.0
비트코인,65.0
코스모신소재,63.0
특례보금자리론,59.0


In [35]:
from google.colab import files

t.to_csv('words_df.csv', header=False, encoding="utf-8-sig")
files.download('words_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>