<a href="https://colab.research.google.com/github/yeonghun00/stock-notes/blob/main/useful/all_in_one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install exchange_calendars

Collecting exchange_calendars
  Downloading exchange_calendars-4.2.8-py3-none-any.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.4/191.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyluach (from exchange_calendars)
  Downloading pyluach-2.2.0-py3-none-any.whl (25 kB)
Collecting korean-lunar-calendar (from exchange_calendars)
  Downloading korean_lunar_calendar-0.3.1-py3-none-any.whl (9.0 kB)
Installing collected packages: korean-lunar-calendar, pyluach, exchange_calendars
Successfully installed exchange_calendars-4.2.8 korean-lunar-calendar-0.3.1 pyluach-2.2.0


In [None]:
!pip install finance-datareader

Collecting finance-datareader
  Downloading finance_datareader-0.9.50-py3-none-any.whl (19 kB)
Collecting requests-file (from finance-datareader)
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, finance-datareader
Successfully installed finance-datareader-0.9.50 requests-file-1.5.1


In [2]:
import pandas as pd
import numpy as np
import requests
import datetime
import exchange_calendars as ecals # 개장일만
from io import StringIO
import matplotlib.pyplot as plt
import FinanceDataReader as fdr
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
import nltk
import requests
from bs4 import BeautifulSoup

ModuleNotFoundError: ignored

In [None]:
# Exchange calendar that StockList.get_date uses to check whether a date is a trading session.
XKRX = ecals.get_calendar("XKRX") # "XKRX" is the calendar code for Korea

250일 등락률, 거래대금 90-99: 범인매매

60일 등락률 50-100, 거래대금 10-50 : 조용히 오르는 애들 (내꺼)



In [None]:
class StockList():
  """Screen KRX stocks by period return and traded value, then score the picks.

  Creating an instance immediately performs network I/O: it resolves the date
  window, downloads the market tables for market ids 'STK' and 'KSQ' from
  data.krx.co.kr, keeps the rows inside the requested quantile bands and
  fetches daily prices for them through FinanceDataReader.

  Attributes:
    period: Look-back window in calendar days, counted back from the current date.
    increased: [low, high] quantile band applied to the '등락률' column.
    traded: [low, high] quantile band applied to the '거래대금' column.
    pre_period: Calendar days by which the end of the screening window is
      moved into the past (0 = screen up to the current date). When non-zero,
      the prices after that end date are used to measure how the picks did.
    start, today: Window boundaries as 'YYYYMMDD' strings.
    df: Combined table of both markets.
    filtered_df: Up to 20 selected rows; get_result_df() adds metric columns.
    price_dic: {stock code: price DataFrame} for the selected rows.
    result_df: Ranked, rescaled table returned by get_result_df().

  NOTE(review): both quantile bounds are exclusive, so an upper quantile of 1
  (the column maximum) drops the single largest row -- confirm this is intended.
  """

  # Seconds to wait for any HTTP response before giving up.
  REQUEST_TIMEOUT = 30

  NEWS_HEADERS = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

  # Single characters that get_hashtags replaces with a space in news titles.
  _TITLE_SEPARATORS = str.maketrans(dict.fromkeys('[]…·"“,\'‘’', ' '))

  # Tokens that get_hashtags never reports as a keyword.
  _STOP_WORDS = frozenset([
      '%', '에', '전망', '수혜', '추가', '주가', '주식', '특징주',
      '코스닥', '코스피', '시총', '전년비', '1분기', '2분기', '3분기',
      '4분기', '전년比', '↑', '지난해', '영업익', '대비', '증가', '영업이익',
      '규모', '속', '주당', '매출', '/', '소', '전년', '1Q', '2Q', '3Q',
      '4Q', '관련', '데이터로', '보는', '증시', '영향', '등'])

  # Metric columns rescaled to the 5-10 range before they are summed.
  _SCORE_COLUMNS = ('Position', 'Sortino', 'Sharpe', '거래대금', '등락률')

  def __init__(self, period=250, increased=(.9, .99), traded=(.9, .99), pre_period=0):
    self.period = period
    # Stored as fresh lists so that instances never share (or mutate) the defaults.
    self.increased = list(increased)
    self.traded = list(traded)
    self.pre_period = pre_period # starting point of the evaluation

    self.price_dic = {}

    self.start, self.today = self.get_date()
    self.df = self.get_stock_df()
    self.filtered_df = self.get_filtered_df()
    self.result_df = self.get_result_df()

  def get_date(self):
    """Return the (start, end) window boundaries as 'YYYYMMDD' strings.

    The end is the current date minus `pre_period` days and the start is the
    current date minus `period` days. If either date is not a session
    according to the XKRX calendar, the end is replaced by the date of
    XKRX.previous_open and the start by the date of XKRX.next_open.
    """
    now = datetime.date.today()
    today = (now - datetime.timedelta(days=self.pre_period)).strftime('%Y%m%d')
    start = (now - datetime.timedelta(days=self.period)).strftime('%Y%m%d')

    if not XKRX.is_session(today):
      today = XKRX.previous_open(today).strftime('%Y%m%d')
    if not XKRX.is_session(start):
      start = XKRX.next_open(start).strftime('%Y%m%d')
    return start, today

  def get_stocks(self, market='STK'):
    """Download the KRX table for one market id over [self.start, self.today].

    A one-time code is requested first and then exchanged for the CSV file.

    Args:
      market: Market id sent as 'mktId' ('STK' by default; get_stock_df also
        requests 'KSQ').

    Returns:
      The CSV parsed into a DataFrame.

    Raises:
      requests.HTTPError: If either request returns an error status.
    """
    data = {
      'mktId': market,
      'strtDd': self.start,
      'endDd': self.today,
      'money': '1',
      'adjStkPrc': '2',
      'adjStkPrc_check': 'Y',
      'share': '1',
      'csvxls_isNo': 'false',
      'name': 'fileDown',
      'url': 'dbms/MDC/STAT/standard/MDCSTAT01602'
    }
    gen_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'
    gen_key = requests.post(gen_url, data=data, timeout=self.REQUEST_TIMEOUT)
    # Fail here rather than later while parsing an error page as CSV.
    gen_key.raise_for_status()

    down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'
    r = requests.post(down_url, data={'code':gen_key.text}, timeout=self.REQUEST_TIMEOUT)
    r.raise_for_status()
    r.encoding = 'EUC-KR'
    return pd.read_csv(StringIO(r.text))

  def get_stock_df(self):
    """Return the 'STK' and 'KSQ' tables stacked into one DataFrame."""
    return pd.concat([self.get_stocks(), self.get_stocks('KSQ')]).reset_index(drop=True)

  def get_filtered_df(self):
    """Return up to 20 rows whose '거래대금' and '등락률' both fall in their bands.

    Rows are matched by '종목명', ordered by '등락률' descending. A copy is
    returned because get_result_df() later adds columns to it.
    """
    value = self.df['거래대금']
    change = self.df['등락률']
    names = self.df['종목명']

    traded_mask = (value > value.quantile(self.traded[0])) & (value < value.quantile(self.traded[1]))
    increased_mask = (change > change.quantile(self.increased[0])) & (change < change.quantile(self.increased[1]))

    selected = np.intersect1d(names[traded_mask].values, names[increased_mask].values)
    return self.df[names.isin(selected)].sort_values('등락률', ascending=False).head(20).copy()

  def get_sharpe(self, df):
    """Return mean / standard deviation of (df['Change'] + 1)."""
    change = df['Change']+1
    return change.mean()/change.std()

  def get_sortino(self, df):
    """Return mean of (df['Change'] + 1) over the std of its values below 1."""
    change = df['Change']+1
    return change.mean()/(change[change<1]).std()

  def get_position(self, df):
    """Return the last 'Close' divided by the highest 'Close' in `df`."""
    close = df['Close']
    # .iloc is positional whatever the index type; close[-1] is a label lookup
    # on integer indexes and a deprecated fallback on datetime indexes.
    return close.iloc[-1]/close.max()

  def get_future_mdd(self, s):
    """Return the maximum drawdown of price series `s` as a fraction of the peak.

    Each price is compared with the highest price seen so far; the largest
    relative drop is returned. An empty series, or one that never falls
    below its running peak, yields 0.
    """
    if len(s) == 0:
      return 0
    peak = s.cummax()
    max_drawdown = ((peak - s) / peak).max()
    # NaN appears when no drawdown could be computed (e.g. all prices missing).
    if pd.isna(max_drawdown):
      return 0
    return max(float(max_drawdown), 0)

  def get_hashtags(self, code):
    """Build a hashtag string from recent Naver Finance news titles of a stock.

    Titles from result pages 1-4 are split on whitespace, stop words are
    removed and the 11 most frequent tokens are taken; the most frequent one
    is dropped (presumably the company name -- TODO confirm) and the rest
    are returned as '#a #b ...'.

    Args:
      code: Stock code used in the news URL.
    """
    page = 5
    titles = []
    for i in range(1, page):
      url = 'https://finance.naver.com/item/news_news.naver?code=' + str(code) + '&page=' + str(i) + '&sm=title_entity_id.basic&clusterId='
      result = requests.get(url, headers=self.NEWS_HEADERS, timeout=self.REQUEST_TIMEOUT)
      bs_obj = BeautifulSoup(result.content, "html.parser")
      for td in bs_obj.find_all('td', {'class': 'title'}):
        sentence = td.text.strip().translate(self._TITLE_SEPARATORS)
        titles.append(sentence.replace('...', ' '))

    words = [word for sentence in titles for word in sentence.split() if word not in self._STOP_WORDS]
    keywords = nltk.FreqDist(words).most_common(11)
    return ' '.join('#' + word for word, _ in keywords[1:])

  def get_price_df(self, codes):
    """Return {code: price DataFrame} for [self.start, self.today], in input order."""
    return {code: fdr.DataReader(str(code), self.start, self.today) for code in codes}

  def get_result_df(self, hashtags=False):
    """Score the filtered stocks and return them as a ranked table.

    Side effects: refreshes self.price_dic and adds the columns 'mdd',
    'Performance', 'preperiod', 'Sharpe', 'Sortino', 'Position' and
    'Hashtags' to self.filtered_df.

    Args:
      hashtags: When True, scrape news keywords for every stock (one extra
        set of HTTP requests per stock); otherwise the column is filled with 0.

    Returns:
      DataFrame indexed by '순위' (1-based, ordered by 'Sharpe' descending)
      in which every score column, and their sum '합산', is min-max scaled
      to the 5-10 range and rounded to one decimal.
    """
    codes = list(self.filtered_df['종목코드'])
    self.price_dic = self.get_price_df(codes)

    if self.pre_period != 0:
      # Prices from the screening end date onwards measure how the picks did.
      future = (datetime.date.today() + datetime.timedelta(days=self.pre_period)).strftime('%Y%m%d')
      future_close = [fdr.DataReader(str(code), self.today, future)['Close'] for code in codes]
      self.filtered_df['mdd'] = [round(self.get_future_mdd(close), 4) for close in future_close]
      self.filtered_df['Performance'] = [round(close.iloc[-1]/close.iloc[0], 4) for close in future_close]
    else:
      self.filtered_df['mdd'] = 0
      self.filtered_df['Performance'] = 0
    self.filtered_df['preperiod'] = self.pre_period

    prices = [self.price_dic[code] for code in codes]
    self.filtered_df['Sharpe'] = [self.get_sharpe(price) for price in prices]
    self.filtered_df['Sortino'] = [self.get_sortino(price) for price in prices]
    self.filtered_df['Position'] = [self.get_position(price) for price in prices]

    if hashtags:
      self.filtered_df['Hashtags'] = [self.get_hashtags(code) for code in codes]
    else:
      self.filtered_df['Hashtags'] = 0

    t = self.filtered_df.sort_values('Sharpe', ascending=False)

    scaler = MinMaxScaler(feature_range=(5, 10))
    for column in self._SCORE_COLUMNS:
      t[column] = scaler.fit_transform(t[[column]]).round(1)
    t['합산'] = t['Position'] + t['Sortino'] + t['Sharpe'] + t['거래대금'] + t['등락률']
    t['합산'] = scaler.fit_transform(t[['합산']]).round(1)

    columns = ['종목명', '종료일 종가', '등락률', '거래대금', 'Sharpe', 'Sortino', 'Position', '합산', 'Performance', 'Hashtags']
    labels = {'종료일 종가':'현재가', '등락률':'파워', '거래대금':'관심도', 'Sharpe':'Risk1', 'Sortino':'Risk2', 'Position':'모멘텀'}
    t = t[columns].reset_index(drop=True).rename(labels, axis=1)
    t.index+=1
    t.index.name='순위'

    return t

In [None]:
# k
# Leaders screen over a 365-day window (originally 250, changed to 365):
# quantile band .98-1 on '등락률' and .99-1 on '거래대금', then the three
# rows with the highest '거래대금'.
stocklist = StockList(365, [.98,1], [.99,1])
t = stocklist.filtered_df.sort_values('거래대금', ascending=False).head(3)

In [None]:
# k trend
# For every stock in `t`, express the close relative to the first close of the
# window and keep roughly `num_intervals` evenly spaced observations.

d = stocklist.get_price_df(t['종목코드'].values)

merged_df = pd.DataFrame()
num_intervals = 15

for key, df in d.items():
    if df.empty:
        # No price rows for this code: nothing to sample.
        continue
    increased_ratio = df['Close'] / df['Close'].iloc[0]
    # At least 1, otherwise a series shorter than num_intervals gives range() a step of 0.
    interval_size = max(1, (len(increased_ratio) - 1) // (num_intervals - 1))
    row_indices = list(range(0, len(increased_ratio), interval_size))
    # The last sample is moved to the final row so each series ends on the latest close.
    row_indices[-1] = len(increased_ratio) - 1
    selected_ratio = increased_ratio.iloc[row_indices]
    merged_df[key] = selected_ratio

merged_df

Unnamed: 0_level_0,086520,247540,003670
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-08-04,1.0,1.0,1.0
2022-08-30,1.059569,0.875781,1.239852
2022-09-26,1.230465,0.721094,1.225092
2022-10-21,1.435545,0.83125,1.380074
2022-11-15,1.297857,0.910937,1.741697
2022-12-08,1.182619,0.808594,1.505535
2023-01-03,1.146143,0.728125,1.383764
2023-01-30,1.272822,0.807813,1.608856
2023-02-22,2.387799,1.253906,1.616236
2023-03-20,4.051717,1.59375,1.822878


In [None]:
from google.colab import files

# Write the sampled trend table to CSV (UTF-8 with BOM) and hand it to the
# browser as a download.
merged_df.to_csv('king_trend_df.csv', encoding="utf-8-sig")
files.download('king_trend_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Keep name, return and traded value of the leaders, rank them from 1 and
# express each traded value as a percentage of the three-stock total.
t = t[['종목명','등락률','거래대금']].reset_index(drop=True)
t.index = pd.RangeIndex(1, len(t) + 1, name='순위')
total_traded = t['거래대금'].sum()
t['거래대금'] = t['거래대금'].div(total_traded).mul(100)
t

Unnamed: 0_level_0,종목명,등락률,거래대금
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,에코프로,1122.1,41.963304
2,에코프로비엠,198.43,32.137681
3,포스코퓨처엠,252.01,25.899015


In [None]:
from google.colab import files

# Export the leaders table (UTF-8 with BOM) and trigger the browser download.
t.to_csv('king_df.csv', encoding="utf-8-sig")
files.download('king_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Ai
# Default screen (250 days, .9-.99 quantile bands). The constructor already
# runs get_result_df() once without hashtags; it is called again here with
# hashtags=True, which downloads the prices a second time and scrapes news.
stocklist = StockList()
t = stocklist.get_result_df(hashtags=True).head(15)
t

Unnamed: 0_level_0,종목명,현재가,파워,관심도,Risk1,Risk2,모멘텀,합산,Performance,Hashtags
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,한화오션,46500,5.5,5.8,10.0,8.2,9.9,8.9,0,#잠수함 #국산화 #첫 #음향장비 #수주 #핵심 #성공 #개발 #호위함 #후
2,이오테크닉스,165200,5.3,5.0,9.2,10.0,9.9,8.9,0,#310억 #0.8% #↓ #JYP엔터 #기관 #외국인 #순 #윈텍 #엑시트 #나선다
3,HPSP,36700,5.5,6.8,8.1,7.2,10.0,8.3,0,#한미반도체 #반도체 #실적 #결정 #100억 #자사주 #투자 #기관 #외국인 #업황
4,칩스앤미디어,37850,5.5,5.0,8.0,6.8,9.0,7.3,0,#16억 #취득 #신탁계약 #감소 #AI #20억 #자사주 #확대 #2.9% #45...
5,ISC,92700,6.3,5.6,7.8,6.5,9.1,7.6,0,#SKC #반도체 #인수 #테스트 #솔루션 #소켓 #기업 #아이에스시(ISC) #아...
6,한미반도체,46950,9.6,10.0,7.5,6.7,9.2,10.0,0,#반도체 #HBM #AI #장비 #본더팩토리 #10대 #수요 #실적 #세계 #오픈
7,미래컴퍼니,46450,5.2,5.1,7.0,7.5,9.5,7.3,0,#공급 #디스플레이 #공급계약 #계약 #장비 #제조장비 #레보아이 #체결 #258억...
8,삼아알미늄,105900,8.7,6.0,6.9,6.4,8.4,7.9,0,#알루미늄박 #6951억 #LG에너지솔루션에 #계약 #2차전지 #공시 #LG엔솔 #...
9,레이크머티리얼즈,18970,10.0,9.3,6.7,5.9,8.1,9.1,0,#83억 #시설투자 #94억 #기관 #외국인 #7.7%↑ #전고체 #핵심 #생산 #...
10,DB,1976,5.5,5.2,6.7,6.3,7.6,6.3,0,#DB그룹 #DB하이텍 #보험 #금융 #급등 #개편 #인사 #DB김준기문화재단 #제...


In [None]:
# Export the scored table (UTF-8 with BOM) and trigger the browser download.
# `files` is google.colab.files, imported in an earlier cell.
t.to_csv('ai_df.csv', encoding="utf-8-sig")
files.download('ai_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# leading
# Updated every 2 / 4 weeks.
# Skip the three top-ranked rows of `t`, then take the three best remaining
# stocks by '합산' and lay out an empty trade plan for them.
exclude_list = t.head(3)['종목명'].values
leading_df = t[~t['종목명'].isin(exclude_list)]
top_three = leading_df.sort_values('합산', ascending=False).head(3)

data = {'날짜': ['2022'] * 3, '종목명': top_three['종목명'].values}
for column in ('매수가', '목표가', '손절가', '목표수익률'):
    # Placeholder zeros; the notebook does not compute these values.
    data[column] = [0] * 3

leading_df = pd.DataFrame(data)
leading_df

Unnamed: 0,날짜,종목명,매수가,목표가,손절가,목표수익률
0,,한미반도체,0,0,0,0
1,,레이크머티리얼즈,0,0,0,0
2,,삼아알미늄,0,0,0,0


In [None]:
# Export the trade-plan template (UTF-8 with BOM) and trigger the browser download.
leading_df.to_csv('leading_df.csv', encoding="utf-8-sig")
files.download('leading_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# performance
# Re-run the default screen as it would have looked 20, 60 and 120 days ago
# and collect the drawdown / performance the picks showed afterwards.
SHUFFLE_SEED = 42  # fixed so the exported row order is the same on every run

all_df = []
for p in [20, 60, 120]:
  stocklist = StockList(period=250, increased=[.9, .99], traded=[.9, .99], pre_period=p)
  t = stocklist.filtered_df
  t = t[['종목명', '종료일 종가', 'mdd', 'Performance', 'preperiod']].reset_index(drop=True)
  t.index+=1
  t.index.name='순위'
  # filtered_df is ordered by '등락률' descending, so the first ten rows are
  # ranks 1-10 of this look-back.
  all_df.append(t.head(10))
t = pd.concat(all_df)
# When a stock shows up for several look-backs, keep its best-performing row.
t = t.sort_values('Performance', ascending=False)
t = t.drop_duplicates(subset=['종목명'], keep='first')
t = t.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
t.index.name='순위'

In [None]:
# Display the shuffled back-test table.
t

Unnamed: 0_level_0,종목명,종료일 종가,mdd,Performance,preperiod
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,이브이첨단소재,6600,0.2324,0.8152,20
1,지오릿에너지,13400,0.4266,0.6425,60
2,브리지텍,11100,0.4045,0.7252,120
3,덕양산업,8770,0.2611,0.74,20
4,DB,2100,0.1862,0.941,60
5,티로보틱스,17180,0.1634,1.6647,60
6,삼천당제약,84300,0.4903,0.7948,120
7,셀바스헬스케어,6290,0.1655,1.6057,20
8,삼부토건,4795,0.3014,0.7508,20
9,뉴로메카,39800,0.2852,0.902,60


In [None]:
# Export the back-test table (UTF-8 with BOM) and trigger the browser download.
t.to_csv('performance_df.csv', encoding="utf-8-sig")
files.download('performance_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# sentiments

!pip install newsapi-python
!pip install vaderSentiment

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
from newsapi import NewsApiClient
import datetime
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from wordcloud import WordCloud
import re

import getpass
import os

# The key is taken from the NEWSAPI_KEY environment variable, or typed in at
# a hidden prompt, so the secret is never stored in the notebook.
# NOTE(review): the key that used to be hardcoded on this line was published
# together with the notebook and should be revoked.
api_key = os.environ.get('NEWSAPI_KEY') or getpass.getpass('NewsAPI key: ')
api = NewsApiClient(api_key=api_key)

In [None]:
# business general

def _score_titles(articles):
    """Average the VADER scores of the article titles.

    Args:
        articles: List of article dicts from a NewsAPI response; the 'title'
            entry of each is scored.

    Returns:
        Tuple (neg, pos, compound): the mean of each score over all titles,
        multiplied by 100 and shifted by 50.

    Raises:
        ValueError: If `articles` is empty.
    """
    if not articles:
        raise ValueError('NewsAPI returned no articles to score')

    analyzer = SentimentIntensityAnalyzer()
    titles = pd.DataFrame(articles)['title']
    scores = pd.DataFrame([analyzer.polarity_scores(text) for text in titles])
    return tuple(scores[column].mean() * 100 + 50 for column in ('neg', 'pos', 'compound'))


def get_headline_sentiment(category='business'):
    """Score the current US top headlines of one NewsAPI category.

    Args:
        category: Category passed to `api.get_top_headlines`.

    Returns:
        Tuple (neg, pos, compound) as produced by `_score_titles`.
    """
    response = api.get_top_headlines(country='us', category=category, page_size=100)
    return _score_titles(response['articles'])


def get_keyword_sentiment(keyword='nasdaq', days=3):
    """Score English-language articles that match `keyword`.

    Args:
        keyword: Query string passed to `api.get_everything`.
        days: How many days back from now the search window starts.

    Returns:
        Tuple (neg, pos, compound) as produced by `_score_titles`.
    """
    now = datetime.datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    start_date = (now - datetime.timedelta(days=days)).strftime('%Y-%m-%d')

    response = api.get_everything(q=keyword, language='en', page_size=100, from_param=start_date, to=end_date)
    return _score_titles(response['articles'])


In [None]:
# Collect a (neg, pos, compound) tuple from each of the five news sources.
business_sentiment = get_headline_sentiment()
general_sentiment = get_headline_sentiment('general')
nasdaq_sentiment = get_keyword_sentiment('Nasdaq')
snp500_sentiment = get_keyword_sentiment('S&P 500')
dowjones_sentiment = get_keyword_sentiment('Dow Jones')

all_sentiments = [
    business_sentiment,
    general_sentiment,
    nasdaq_sentiment,
    snp500_sentiment,
    dowjones_sentiment,
]

# Add up each of the three scores across the sources, in the order listed above.
totals = [0.0, 0.0, 0.0]
for source in all_sentiments:
    for slot in range(3):
        totals[slot] += source[slot]

# Plain average per score, rounded to the nearest integer.
neg, pos, compound = (int(round(total / len(all_sentiments))) for total in totals)

In [None]:
# Hand-entered values for the first six dates; only the last row comes from
# the scores computed in this run.
dates = ['2022/11/11', '2022/11/12', '2022/11/13', '2022/11/14', '2022/11/15', '2022/11/16', '2022/11/17']

data = {
    'positive': [34, 42, 34, 33, 11, 77] + [pos],
    'negative': [12, 35, 12, 55, 11, 13] + [neg],
    'compound': [22, 10, 6, 10, 23, 84] + [compound],
}

df = pd.DataFrame(data, index=pd.Index(dates, name='date'))
df

Unnamed: 0_level_0,positive,negative,compound
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022/11/11,34,12,22
2022/11/12,42,35,10
2022/11/13,34,12,6
2022/11/14,33,55,10
2022/11/15,11,11,23
2022/11/16,77,13,84
2022/11/17,59,56,55


In [None]:
# Export the sentiment table (UTF-8 with BOM) and trigger the browser download.
df.to_csv('sentiment_df.csv', encoding="utf-8-sig")
files.download('sentiment_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
# word cloud
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [2]:
from multiprocessing import Pool
from konlpy.tag import Hannanum
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import html

class News:
  """Scraper for the Naver Finance news ranking list and its article pages."""

  # Seconds to wait for a response, so a stalled request cannot hang a worker.
  REQUEST_TIMEOUT = 10

  def __init__(self):
    self.headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

  def get_ranking(self, page=1, date='1'):
    """Collect the article links from the first `page` pages of the ranking.

    Args:
      page: Number of result pages to read, starting at page 1.
      date: Ranking date as 'YYYYMMDD'; the sentinel '1' means the current date.

    Returns:
      List of link targets (HTML entities unescaped) in page order.

    Raises:
      IndexError: If a page contains no <div class="hotNewsList">.
    """
    if date == '1':
      date = datetime.datetime.now().date().strftime("%Y%m%d")

    links = []
    for page_no in range(1, page + 1):
      url = 'https://finance.naver.com/news/news_list.nhn?mode=RANK&date=' + date + '&page=' + str(page_no)
      result = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
      bs_obj = BeautifulSoup(result.content, "html.parser")

      hot_list = bs_obj.find_all('div', {'class':'hotNewsList'})[0]
      # href=True skips anchors without a target instead of failing on them.
      links += [html.unescape(a['href']) for a in hot_list.find_all('a', href=True)]
    return links

  @staticmethod
  def _strip_after_at(text):
    """Return `text` up to its first '@', or all of it when there is none."""
    cut = text.find('@')
    # find() returns -1 when nothing matches; slicing with -1 would silently
    # drop the last character of the article.
    return text if cut == -1 else text[:cut]

  def get_article(self, url):
    """Download one article and return (title, subheading, text).

    Args:
      url: Link as returned by get_ranking; appended to the site root.

    Returns:
      title: Text of the first <h3> inside <div class="article_info">,
        whitespace-normalised.
      subheading: Text of the first <strong> in the article body, or ''.
      text: Whitespace-normalised body up to the first '@', with the
        subheading removed.

    Raises:
      IndexError: If the expected title or body containers are missing.
    """
    url = 'https://finance.naver.com/' + url
    result = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
    bs_obj = BeautifulSoup(result.content, "html.parser")

    title = ' '.join(bs_obj.find_all('div', {'class': 'article_info'})[0].find_all('h3')[0].get_text().split())
    context = bs_obj.find_all('div', {'class': 'articleCont'})[0]

    strong = context.find('strong')
    subheading = strong.get_text() if strong is not None else ''

    text = self._strip_after_at(' '.join(context.get_text().split()))
    text = text.replace(subheading, '')
    return title, subheading, text

# Read five pages of ranking links, then download the articles in parallel
# worker processes.
news = News()
news_list = news.get_ranking(5)
with Pool() as p:
    li = np.array(p.map(news.get_article, news_list))

# Each row of `li` is (title, subheading, text); give every column its name.
dic = {label: li[:, position] for position, label in enumerate(('Title', 'Subheading', 'Text'))}
df = pd.DataFrame.from_dict(dic)
df

Unnamed: 0,Title,Subheading,Text
0,불개미 vs 공매도…2차전지 뜨거운 고지전[최훈길의뒷담화],"지난달 코스피·코스닥 공매도 22조, 역대 최대2차전지 집중, 하락 베팅·줍줍·숏 ...",[이데일리 최훈길 기자] 최근에 서울 여의도 2번 출구 앞을 가보셨습니까. 2번 출...
1,"중학개미가 사랑한 배터리 공룡, 中 증시 제자리 걸음에도 '선방'",[마이마이 차이나],[편집자주] 중국은 가깝고도 먼 나라입니다. 수출 의존도가 높은 한국의 주요 교역국...
2,흑해곡물협정 파기에 세계 식량 가격 석달만에 상승,"FAO, 7월 가격지수 123.9…전월대비 1.3%↑밀 가격 9개월 만에 상승…美·...",서울 시내 한 대형할인마트에 식용유 상품이 진열돼 있다. ⓒ뉴시스[세종=데일리안 맹...
3,곳간지기의 배신…올해는 경남은행 560억,"[금융 핫앤뉴]초대형 횡령사건에 번지는 금융권 불신 IBK기업은행, 글로벌 ESG지...",그래픽=비즈워치횡령의 전모…PF 자금서 7년간 560억원 BNK금융지주 계열 BNK...
4,"롯데칠성음료도 대체유 시장 진출한다…""올해 안에 출시""",5년 새 국내 시장 33% 성장…약 6500억원 규모,롯데칠성음료 사옥 전경. (사진=롯데칠성음료 제공) *재판매 및 DB 금지[서울=뉴...
...,...,...,...
118,유리천장은 어디든 깨져야 한다 [편집인의 원픽],P.S. 취재한 이지민 기자에 물었습니다.,바닥에 깔린 많은 종이들 가운데 하나를 탁 집어 책상 위에 올려놓는 일. 흔히 언론...
119,문 '쾅쾅' 용인 흉기남 체포…살인 예고 글 10대도 검거,,【 앵커멘트 】 어젯밤 경기 용인시에서 흉기를 들고 배회하던 남성이 경찰에게 붙잡혔...
120,서민을 위한 감세는 없다,[이상민의 경제기사비평],"2023년 정부 세제개편안이 발표되었다. 서민, 실생활을 위한 핀셋감세에 방점이 찍..."
121,"애플, 실적 부진 전망에 10개월만에 최대 하락",,4일 4.8% 떨어져시총도 3조달러 밑으로3분기 매출 감소 전망 연합뉴스미국 IT ...


In [3]:
# Single KoNLPy Hannanum instance; its nouns() method is applied to every cleaned title in a later cell.
hannanum = Hannanum()

def clean_title(title):
    """Return `title` with punctuation, quotes, brackets and arrows blanked out.

    Each of the characters , ( ) … ? . ' " ” “ ‘ ’ · [ ] ↓ → ↑ is replaced by
    a single space; all other characters are kept as they are.
    """
    separators = ',()…?.\'"”“‘’·[]↓→↑'
    blank_out = str.maketrans(dict.fromkeys(separators, ' '))
    return title.translate(blank_out)

def rank_words(words_list):
  """Count the words in `words_list` and return {word: count}, most frequent first.

  Words with the same count keep the order in which they first appeared.
  """
  counts = {}
  for token in words_list:
      counts[token] = counts.get(token, 0) + 1
  # Sorting on the negated count is stable, so ties stay in first-seen order.
  ordered = sorted(counts.items(), key=lambda pair: -pair[1])
  return dict(ordered)

def standardize_values(dictionary):
    """Rescale the values of `dictionary` linearly onto 10-100, in place.

    The smallest value maps to 10 and the largest to 100; results are rounded
    to integers. When all values are equal there is no range to spread, so
    every value becomes 10. An empty dictionary is returned unchanged.

    Args:
        dictionary: Mapping of key -> number. It is modified in place.

    Returns:
        The same dictionary object, for convenience.
    """
    if not dictionary:
        return dictionary

    max_value = max(dictionary.values())
    min_value = min(dictionary.values())
    spread = max_value - min_value
    # A zero spread would divide by zero; a factor of 0 maps everything to 10.
    scale = 90 / spread if spread else 0

    for key, value in dictionary.items():
        dictionary[key] = round(10 + (value - min_value) * scale)

    return dictionary

In [4]:
# Clean every headline, extract its nouns, then count and rescale the nouns
# over all headlines.
df['Cleaned_Title'] = df['Title'].map(clean_title)
df['Filtered_Pos'] = df['Cleaned_Title'].map(hannanum.nouns)
words = [noun for nouns in df['Filtered_Pos'] for noun in nouns]
word_frequency_dict = rank_words(words)
result = standardize_values(word_frequency_dict)



In [5]:
# Inspect the rescaled word scores.
result

{'상승': 100,
 '등': 75,
 '가격': 67,
 '휘발유': 67,
 '3개월': 67,
 '물': 67,
 '2차전지': 59,
 '美': 59,
 '인도': 59,
 '흑해곡물협정': 51,
 '세계': 51,
 '애플': 51,
 '랩톱': 51,
 '수입제한': 51,
 '유예': 51,
 '하락': 51,
 '주식': 43,
 '4주': 43,
 '뉴욕증시': 43,
 '올해': 35,
 '부진': 35,
 '주': 35,
 '식량가격': 35,
 '기름값': 35,
 '영향': 35,
 '조치': 35,
 '예고': 35,
 '어디': 35,
 '불개미': 26,
 '배터리': 26,
 '증시': 26,
 '석달만': 26,
 '달': 26,
 '만': 26,
 '배터리아저씨': 26,
 '수익률': 26,
 '때': 26,
 '재점화': 26,
 '종료': 26,
 '명품': 26,
 '데': 26,
 '리터': 26,
 '11월': 26,
 '시행': 26,
 '중국': 26,
 '것': 26,
 '매출': 26,
 '조짐': 26,
 '8': 26,
 '리': 26,
 '전국': 26,
 '살': 26,
 '차익': 26,
 '실현': 26,
 '돌파': 26,
 '해외': 26,
 '전망': 26,
 '오늘': 26,
 '이곳': 26,
 '경유': 26,
 '급등': 26,
 '실적': 26,
 '공매': 18,
 '파': 18,
 '식량': 18,
 '회사': 18,
 '세': 18,
 '오름세': 18,
 '올드머니룩': 18,
 '중단': 18,
 '석달': 18,
 '공개': 18,
 '에코프': 18,
 '밧데리아저씨': 18,
 '얍삽한': 18,
 '생각': 18,
 '자이앤트TV': 18,
 '이유': 18,
 '40원': 18,
 '그리스': 18,
 '저': 18,
 '종목': 18,
 '소비': 18,
 '뒷북경제': 18,
 '4': 18,
 '시대': 18,
 '1일': 18,
 '종합': 18,
 '잼버

In [10]:
# Drop filler words and tokens that consist of digits only.
stop_words = ['의', '등', '때', '오늘', '이곳', '저', '월', '년', '일', '로', '어디', '데']
data_dict = {
    word: score
    for word, score in result.items()
    if word not in stop_words and not str(word).isdigit()
}

In [11]:
# One row per word with its score in a 'value' column; keep the first 50 rows.
t = pd.DataFrame(data_dict, index=['value']).T.head(50)

Unnamed: 0,value
상승,100
가격,67
휘발유,67
3개월,67
물,67
2차전지,59
美,59
인도,59
흑해곡물협정,51
세계,51


In [12]:
from google.colab import files

# Export the word scores without a header row (UTF-8 with BOM) and trigger
# the browser download.
t.to_csv('words_df.csv', header=False, encoding="utf-8-sig")
files.download('words_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>