<a href="https://colab.research.google.com/github/yeonghun00/stock-notes/blob/main/useful/all_in_one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install exchange_calendars

Collecting exchange_calendars
  Downloading exchange_calendars-4.2.8-py3-none-any.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.4/191.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyluach (from exchange_calendars)
  Downloading pyluach-2.2.0-py3-none-any.whl (25 kB)
Collecting korean-lunar-calendar (from exchange_calendars)
  Downloading korean_lunar_calendar-0.3.1-py3-none-any.whl (9.0 kB)
Installing collected packages: korean-lunar-calendar, pyluach, exchange_calendars
Successfully installed exchange_calendars-4.2.8 korean-lunar-calendar-0.3.1 pyluach-2.2.0


In [2]:
!pip install finance-datareader

Collecting finance-datareader
  Downloading finance_datareader-0.9.50-py3-none-any.whl (19 kB)
Collecting requests-file (from finance-datareader)
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, finance-datareader
Successfully installed finance-datareader-0.9.50 requests-file-1.5.1


In [3]:
import pandas as pd
import numpy as np
import requests
import datetime
import exchange_calendars as ecals # 개장일만
from io import StringIO
import matplotlib.pyplot as plt
import FinanceDataReader as fdr
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
import nltk
import requests
from bs4 import BeautifulSoup

In [4]:
XKRX = ecals.get_calendar("XKRX") # "XKRX" is the calendar code for Korea

250일 등락률, 거래대금 90-99: 범인매매

60일 등락률 50-100, 거래대금 10-50 : 조용히 오르는 애들 (내꺼)



In [35]:
class StockList():
  """Screen KRX-listed stocks by period return and traded value, then score the survivors.

  Constructing an instance immediately performs network I/O: it downloads the
  KRX period statistics for the market ids 'STK' and 'KSQ' (presumably KOSPI
  and KOSDAQ -- confirm against the KRX data portal), filters them by quantile
  bands and fetches per-stock price history through FinanceDataReader.

  Attributes:
    period: look-back window in calendar days, used to derive ``start``.
    increased: [low, high] quantile band applied to the '등락률' (change rate) column.
    traded: [low, high] quantile band applied to the '거래대금' (traded value) column.
    pre_period: calendar days by which the end date is moved into the past
      (0 keeps the end date at today).
    start, today: 'YYYYMMDD' strings bounding the screening window.
    df: concatenated KRX download for both market ids.
    filtered_df: at most 20 rows of ``df`` that fall inside both quantile bands;
      ``get_result_df`` adds its metric columns to this frame.
    price_dic: {stock code: price DataFrame} for the rows of ``filtered_df``.
    result_df: scored and renamed table returned by ``get_result_df()``.
  """

  # Seconds to wait on each HTTP request before giving up.
  REQUEST_TIMEOUT = 30

  def __init__(self, period=250, increased=[.9, .99], traded=[.9, .99], pre_period=0):
    self.period = period
    # Copy the bands so the shared default lists (or a caller's list) are never aliased.
    self.increased = list(increased)
    self.traded = list(traded)
    self.pre_period = pre_period # starting point (days back from today)

    self.price_dic = {}

    self.start, self.today = self.get_date()
    self.df = self.get_stock_df()
    self.filtered_df = self.get_filtered_df()
    self.result_df = self.get_result_df()

  def get_date(self):
    """Return ``(start, today)`` as 'YYYYMMDD' strings aligned to XKRX sessions.

    ``today`` is the current date minus ``pre_period`` days and ``start`` is the
    current date minus ``period`` days. A date that is not an XKRX session is
    replaced by the date of ``XKRX.previous_open`` (for ``today``) or
    ``XKRX.next_open`` (for ``start``).
    """
    now = datetime.date.today()
    today = (now - datetime.timedelta(days=self.pre_period)).strftime('%Y%m%d')
    start = (now - datetime.timedelta(days=self.period)).strftime('%Y%m%d')

    if not XKRX.is_session(today):
      today = XKRX.previous_open(today).strftime('%Y%m%d')
    if not XKRX.is_session(start):
      start = XKRX.next_open(start).strftime('%Y%m%d')
    return start, today

  def get_stocks(self, market='STK'):
    """Download the KRX statistics CSV for one market id over ``start``..``today``.

    The download is a two-step exchange: an OTP is generated for the query and
    then posted to the CSV download endpoint.

    Args:
      market: KRX market id sent as 'mktId'.

    Returns:
      DataFrame parsed from the CSV. '종목코드' (stock code) is forced to ``str``
      so codes keep their leading zeros even when every code in the file is numeric.

    Raises:
      requests.HTTPError: if either request returns an error status.
    """
    data = {
      'mktId': market,
      'strtDd': self.start,
      'endDd': self.today,
      'money': '1',
      'adjStkPrc': '2',
      'adjStkPrc_check': 'Y',
      'share': '1',
      'csvxls_isNo': 'false',
      'name': 'fileDown',
      'url': 'dbms/MDC/STAT/standard/MDCSTAT01602'
    }
    gen_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'
    gen_key = requests.post(gen_url, data=data, timeout=self.REQUEST_TIMEOUT)
    gen_key.raise_for_status()

    down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'
    r = requests.post(down_url, data={'code': gen_key.text}, timeout=self.REQUEST_TIMEOUT)
    r.raise_for_status()
    r.encoding = 'EUC-KR'
    return pd.read_csv(StringIO(r.text), dtype={'종목코드': str})

  def get_stock_df(self):
    """Return the 'STK' and 'KSQ' downloads stacked into one frame with a fresh index."""
    return pd.concat([self.get_stocks(), self.get_stocks('KSQ')]).reset_index(drop=True)

  def get_filtered_df(self):
    """Return up to 20 rows of ``df`` that sit strictly inside both quantile bands.

    A stock is kept when its name appears both among the rows whose '거래대금'
    lies strictly between the ``traded`` quantiles and among the rows whose
    '등락률' lies strictly between the ``increased`` quantiles. The result is
    sorted by '등락률' descending and returned as an independent copy, because
    ``get_result_df`` adds columns to it.
    """
    value = self.df['거래대금']
    change = self.df['등락률']
    traded_mask = (value > value.quantile(self.traded[0])) & (value < value.quantile(self.traded[1]))
    increased_mask = (change > change.quantile(self.increased[0])) & (change < change.quantile(self.increased[1]))

    # Intersect by stock name, not by row.
    selected = np.intersect1d(self.df.loc[traded_mask, '종목명'].values,
                              self.df.loc[increased_mask, '종목명'].values)
    picked = self.df[self.df['종목명'].isin(selected)]
    return picked.sort_values('등락률', ascending=False).head(20).copy()

  def get_sharpe(self, df):
    """Return mean / std of ``df['Change'] + 1`` (a Sharpe-like ratio on gross returns)."""
    change = df['Change'] + 1
    return change.mean() / change.std()

  def get_sortino(self, df):
    """Return mean of ``df['Change'] + 1`` divided by the std of its values below 1."""
    change = df['Change'] + 1
    return change.mean() / (change[change < 1]).std()

  def get_position(self, df):
    """Return the last 'Close' divided by the maximum 'Close' of the frame."""
    close = df['Close']
    # .iloc: the frame is date-indexed, so the last row must be taken by position.
    return close.iloc[-1] / close.max()

  def get_future_mdd(self, s):
    """Return the maximum peak-to-trough drawdown of price series ``s`` as a fraction.

    Returns 0 for an empty series.
    """
    if len(s) == 0:
      return 0
    peak = s.iloc[0]
    max_drawdown = 0
    for price in s:
      if price > peak:
        peak = price
      drawdown = (peak - price) / peak
      if drawdown > max_drawdown:
        max_drawdown = drawdown
    return max_drawdown

  def get_hashtags(self, code):
    """Build a '#word #word ...' string from Naver Finance news titles for ``code``.

    Titles from news pages 1-4 are split on whitespace after punctuation is
    blanked out, stop words are dropped, and the 2nd to 11th most frequent
    words are returned. The most frequent word is skipped (presumably it is
    the company name itself -- confirm).
    """
    page = 5
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    # Every single-character separator is replaced by a space.
    to_space = str.maketrans({ch: ' ' for ch in '[]…·"“,\'‘’'})

    li = []
    for i in range(1, page):
      url = 'https://finance.naver.com/item/news_news.naver?code=' + str(code) + '&page=' + str(i) + '&sm=title_entity_id.basic&clusterId='
      result = requests.get(url, headers={'User-Agent': user_agent}, timeout=self.REQUEST_TIMEOUT)
      bs_obj = BeautifulSoup(result.content, "html.parser")
      tds = bs_obj.find_all('td', {'class': 'title'})
      li.extend(td.text.strip().translate(to_space).replace('...', ' ') for td in tds)

    stop_words = {'%', '에', '전망', '수혜', '추가', '주가', '주식', '특징주',
                  '코스닥', '코스피', '시총', '전년비', '1분기', '2분기', '3분기',
                  '4분기', '전년比', '↑', '지난해', '영업익', '대비', '증가', '영업이익',
                  '규모', '속', '주당', '매출', '/', '소', '전년', '1Q', '2Q', '3Q',
                  '4Q', '관련', '데이터로', '보는', '증시', '영향', '등'}
    words = [word for sentence in li for word in sentence.split() if word not in stop_words]
    keywords = nltk.FreqDist(words).most_common(11)
    return ' '.join(['#' + x[0] for x in keywords[1:]])

  def get_price_df(self, codes):
    """Return ``{code: FinanceDataReader frame}`` for ``codes`` over ``start``..``today``."""
    return {code: fdr.DataReader(str(code), self.start, self.today) for code in codes}

  def get_result_df(self, hashtags=False):
    """Score ``filtered_df`` and return the ranked result table.

    Side effects: refreshes ``price_dic`` (network) and adds the columns 'mdd',
    'Performance', 'preperiod', 'Sharpe', 'Sortino', 'Position' and 'Hashtags'
    to ``filtered_df``.

    Args:
      hashtags: when true, scrape news keywords per stock (slow); otherwise the
        'Hashtags' column is filled with 0.

    Returns:
      DataFrame indexed from 1 (index name '순위'), ordered by raw Sharpe
      descending, with each metric min-max scaled to 5..10 and rounded to one decimal.
    """
    codes = list(self.filtered_df['종목코드'])
    self.price_dic = self.get_price_df(codes)

    if self.pre_period != 0:
      # Back-test mode: measure what happened after the (shifted) end date.
      future = (datetime.date.today() + datetime.timedelta(days=self.pre_period)).strftime('%Y%m%d')
      future_closes = [fdr.DataReader(str(code), self.today, future)['Close'] for code in codes]
      self.filtered_df['mdd'] = [round(self.get_future_mdd(close), 4) for close in future_closes]
      self.filtered_df['Performance'] = [round(close.iloc[-1] / close.iloc[0], 4) for close in future_closes]
    else:
      self.filtered_df['mdd'] = 0
      self.filtered_df['Performance'] = 0
    self.filtered_df['preperiod'] = self.pre_period

    prices = [self.price_dic[code] for code in codes]
    self.filtered_df['Sharpe'] = [self.get_sharpe(p) for p in prices]
    self.filtered_df['Sortino'] = [self.get_sortino(p) for p in prices]
    self.filtered_df['Position'] = [self.get_position(p) for p in prices]

    if hashtags:
      self.filtered_df['Hashtags'] = [self.get_hashtags(code) for code in codes]
    else:
      self.filtered_df['Hashtags'] = 0

    # Rank on the raw Sharpe value before it is rescaled.
    t = self.filtered_df.sort_values('Sharpe', ascending=False)

    scaler = MinMaxScaler(feature_range=(5, 10))
    for column in ['Position', 'Sortino', 'Sharpe', '거래대금', '등락률']:
      t[column] = scaler.fit_transform(t[[column]]).round(1)
    t['합산'] = t['Position'] + t['Sortino'] + t['Sharpe'] + t['거래대금'] + t['등락률']
    t['합산'] = scaler.fit_transform(t[['합산']]).round(1)

    t = t[['종목명', '종료일 종가', '등락률', '거래대금', 'Sharpe', 'Sortino', 'Position', '합산', 'Performance', 'Hashtags', 'preperiod', 'mdd']].reset_index(drop=True).rename({'종료일 종가':'현재가', '등락률':'파워', '거래대금':'관심도', 'Sharpe':'Risk1', 'Sortino':'Risk2', 'Position':'모멘텀'}, axis=1)
    t.index += 1
    t.index.name = '순위'

    return t

In [None]:
# k: the three screened stocks with the largest traded value
stocklist = StockList(period=365, increased=[.98, 1], traded=[.99, 1])  # look-back was originally 250, changed to 365
t = stocklist.filtered_df.sort_values('거래대금', ascending=False).head(3)

In [None]:
# k trend: for every selected stock, sample its close price (relative to the
# first close of the window) at roughly evenly spaced rows.

d = stocklist.get_price_df(t['종목코드'].values)

num_intervals = 15  # target number of sampled rows per stock
merged_df = pd.DataFrame()

for key, price_df in d.items():
    increased_ratio = price_df['Close'] / price_df['Close'].iloc[0]
    last_row = len(increased_ratio) - 1
    # A history shorter than num_intervals would give a step of 0, which
    # range() rejects; fall back to taking every row.
    interval_size = max(1, last_row // (num_intervals - 1))
    row_indices = list(range(0, len(increased_ratio), interval_size))
    # Always finish on the most recent row.
    row_indices[-1] = last_row
    # Column assignment aligns on the dates already in merged_df (those of the first stock).
    merged_df[key] = increased_ratio.iloc[row_indices]

merged_df

Unnamed: 0_level_0,086520,247540,003670
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-08-04,1.0,1.0,1.0
2022-08-30,1.059569,0.875781,1.239852
2022-09-26,1.230465,0.721094,1.225092
2022-10-21,1.435545,0.83125,1.380074
2022-11-15,1.297857,0.910937,1.741697
2022-12-08,1.182619,0.808594,1.505535
2023-01-03,1.146143,0.728125,1.383764
2023-01-30,1.272822,0.807813,1.608856
2023-02-22,2.387799,1.253906,1.616236
2023-03-20,4.051717,1.59375,1.822878


In [None]:
from google.colab import files

# Save the sampled trend table as CSV (utf-8-sig = UTF-8 with a BOM) and hand it to Colab's download helper.
merged_df.to_csv('king_trend_df.csv', encoding="utf-8-sig")
files.download('king_trend_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Keep name / change rate / traded value, rank from 1, and express traded
# value as each stock's percentage share of the group's total.
t = t.loc[:, ['종목명', '등락률', '거래대금']].reset_index(drop=True)
t.index = pd.RangeIndex(1, len(t) + 1, name='순위')
total_traded = t['거래대금'].sum()
t['거래대금'] = t['거래대금'].div(total_traded).mul(100)
t

Unnamed: 0_level_0,종목명,등락률,거래대금
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,에코프로,1122.1,41.963304
2,에코프로비엠,198.43,32.137681
3,포스코퓨처엠,252.01,25.899015


In [None]:
from google.colab import files

# Save the three-stock share table as CSV (utf-8-sig = UTF-8 with a BOM) and hand it to Colab's download helper.
t.to_csv('king_df.csv', encoding="utf-8-sig")
files.download('king_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [36]:
# Ai: default screen, scored with news hashtags; back-test columns are not shown.
stocklist = StockList()
scored = stocklist.get_result_df(hashtags=True)
t = scored.head(15).drop(columns=['preperiod', 'Performance', 'mdd'])
t

Unnamed: 0_level_0,종목명,현재가,파워,관심도,Risk1,Risk2,모멘텀,합산,Hashtags
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,이오테크닉스,165000,5.8,5.3,10.0,10.0,9.7,10.0,#이번주 #추천주 #HBM #유커 #귀환주도 #주목 #JYP엔터 #기관 #외국인 #순
2,ISC,98400,8.0,5.9,8.3,6.8,9.4,9.1,#SKC #인수 #반도체 #테스트 #솔루션 #소켓 #기업 #아이에스시(ISC) #아...
3,가온칩스,42550,5.8,6.4,8.2,7.2,8.8,8.4,#ARM #인수 #삼성전자 #반도체 #계약 #가능성에 #강세 #ASIC #설계 #설계개발
4,큐렉소,20250,7.7,6.0,8.2,7.7,9.6,9.4,#의료로봇 #인공관절 #로봇 #수술로봇 #판매 #체결 #뉴로메카 #계약 #美 #11억
5,실리콘투,8300,7.8,6.3,7.3,7.1,9.4,9.0,#유럽 #소식에 #단기차입금 #글로벌이 #주목하는 #K-뷰티 #수익성 #안정성↑-한...
6,삼아알미늄,103100,7.9,6.1,7.2,6.9,8.3,8.4,#알루미늄박 #6951억 #LG에너지솔루션에 #계약 #2차전지 #공시 #LG엔솔 #...
7,DB,1832,5.0,5.3,7.0,6.9,7.4,6.7,#DB메탈 #흡수합병 #DB하이텍 #급등 #DB그룹 #KCGI #DB아이엔씨 #DB...
8,레이크머티리얼즈,16010,8.9,10.0,7.0,6.3,7.4,9.6,#83억 #전고체 #핵심 #기관 #외국인 #7.7%↑ #생산 #194억원 #부지 #매입
9,덕양산업,5810,5.2,5.6,6.6,6.4,6.8,6.4,#수상 #배터리 #개발 #유공 #장관 #표창 #소재 #ESG #경영 #실천
10,삼부토건,3305,5.3,8.8,6.6,7.1,7.6,8.1,#재건 #우크라이나 #우크라 #디와이디 #민간임대주택 #재건사업 #강세 #참여 #신...


In [None]:
# Save the scored table as CSV (utf-8-sig = UTF-8 with a BOM) and hand it to Colab's download helper.
# `files` comes from an earlier `from google.colab import files` cell.
t.to_csv('ai_df.csv', encoding="utf-8-sig")
files.download('ai_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# leading
# refreshed at 2-week / 4-week intervals
exclude_list = t['종목명'].head(3).values
leading_df = t.loc[~t['종목명'].isin(exclude_list)]

# Template rows for the three best remaining names by combined score;
# the price fields are placeholders to be filled in by hand.
top_names = leading_df.sort_values('합산', ascending=False).head(3)['종목명'].values
data = {
    '날짜': ['2022'] * 3,
    '종목명': top_names,
    '매수가': [0] * 3,
    '목표가': [0] * 3,
    '손절가': [0] * 3,
    '목표수익률': [0] * 3,
}
leading_df = pd.DataFrame(data)
leading_df

Unnamed: 0,날짜,종목명,매수가,목표가,손절가,목표수익률
0,,한미반도체,0,0,0,0
1,,레이크머티리얼즈,0,0,0,0
2,,삼아알미늄,0,0,0,0


In [None]:
# Save the template table as CSV (utf-8-sig = UTF-8 with a BOM) and hand it to Colab's download helper.
# `files` comes from an earlier `from google.colab import files` cell.
leading_df.to_csv('leading_df.csv', encoding="utf-8-sig")
files.download('leading_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [44]:
# performance: rerun the default screen as of 20, 60 and 120 days ago and
# stack the realised drawdown / performance of each run's top 15.
all_df = []
for p in (20, 60, 120):
  stocklist = StockList(period=250, increased=[.9, .99], traded=[.9, .99], pre_period=p)
  t = stocklist.get_result_df(hashtags=False).head(15)
  all_df.append(t[['종목명', '현재가', 'mdd', 'Performance', 'preperiod']].reset_index(drop=True))

t = pd.concat(all_df, ignore_index=True)
t.index = pd.RangeIndex(1, len(t) + 1, name='순위')
t

In [None]:
# Save the back-test table as CSV (utf-8-sig = UTF-8 with a BOM) and hand it to Colab's download helper.
# `files` comes from an earlier `from google.colab import files` cell.
t.to_csv('performance_df.csv', encoding="utf-8-sig")
files.download('performance_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# sentiments

!pip install newsapi-python
!pip install vaderSentiment

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
from newsapi import NewsApiClient
import datetime
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from wordcloud import WordCloud
import re

import os

# Prefer a key supplied through the NEWSAPI_KEY environment variable so the
# credential does not have to live in the notebook.
# FIXME(security): the literal fallback is a secret committed to the notebook;
# rotate it and delete the fallback once the environment variable is set up.
api_key = os.environ.get('NEWSAPI_KEY') or '6147ad35d10843b2949edc41cd955155'
api = NewsApiClient(api_key=api_key)

In [None]:
# business general

def _score_titles(articles):
    """Return the mean VADER (neg, pos, compound) of the article titles, as ``mean * 100 + 50``.

    Articles without a title are ignored. When no title is left the neutral
    triple ``(50.0, 50.0, 50.0)`` (i.e. a mean score of 0) is returned instead
    of failing.
    """
    titles = [article.get('title') for article in articles]
    titles = [title for title in titles if title]
    if not titles:
        return 50.0, 50.0, 50.0

    analyzer = SentimentIntensityAnalyzer()
    scores = pd.DataFrame([analyzer.polarity_scores(title) for title in titles])
    return (scores['neg'].mean() * 100 + 50,
            scores['pos'].mean() * 100 + 50,
            scores['compound'].mean() * 100 + 50)


def get_headline_sentiment(category='business'):
    """Score the titles of up to 100 current US top headlines in ``category``.

    Returns:
        Tuple ``(neg, pos, compound)``; see ``_score_titles`` for the scale.
    """
    response = api.get_top_headlines(country='us', category=category, page_size=100)
    return _score_titles(response.get('articles') or [])


def get_keyword_sentiment(keyword='nasdaq'):
    """Score the titles of up to 100 English articles matching ``keyword`` from the last 3 days.

    Returns:
        Tuple ``(neg, pos, compound)``; see ``_score_titles`` for the scale.
    """
    now = datetime.datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    start_date = (now - datetime.timedelta(days=3)).strftime('%Y-%m-%d')

    response = api.get_everything(q=keyword, language='en', page_size=100, from_param=start_date, to=end_date)
    return _score_titles(response.get('articles') or [])


In [None]:
# One (neg, pos, compound) triple per source.
business_sentiment = get_headline_sentiment()
general_sentiment = get_headline_sentiment('general')
nasdaq_sentiment = get_keyword_sentiment('Nasdaq')
snp500_sentiment = get_keyword_sentiment('S&P 500')
dowjones_sentiment = get_keyword_sentiment('Dow Jones')

sources = (business_sentiment, general_sentiment, nasdaq_sentiment, snp500_sentiment, dowjones_sentiment)

# Average each component over the five sources, then round to the nearest integer.
neg, pos, compound = (int(round(sum(component) / 5)) for component in zip(*sources))

In [None]:
# Six hard-coded history rows followed by the values computed for the latest date.
dates = ['2022/11/11', '2022/11/12', '2022/11/13', '2022/11/14', '2022/11/15', '2022/11/16', '2022/11/17']
data = {
    'positive': [34, 42, 34, 33, 11, 77, pos],
    'negative': [12, 35, 12, 55, 11, 13, neg],
    'compound': [22, 10, 6, 10, 23, 84, compound],
}

df = pd.DataFrame(data, index=pd.Index(dates, name='date'))
df

Unnamed: 0_level_0,positive,negative,compound
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022/11/11,34,12,22
2022/11/12,42,35,10
2022/11/13,34,12,6
2022/11/14,33,55,10
2022/11/15,11,11,23
2022/11/16,77,13,84
2022/11/17,59,56,55


In [None]:
# Save the sentiment table as CSV (utf-8-sig = UTF-8 with a BOM) and hand it to Colab's download helper.
# `files` comes from an earlier `from google.colab import files` cell.
df.to_csv('sentiment_df.csv', encoding="utf-8-sig")
files.download('sentiment_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# word cloud
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [None]:
from multiprocessing import Pool
from konlpy.tag import Hannanum
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import html

class News:
  """Scraper for the Naver Finance ranking-news list and the articles it links to."""

  # Seconds to wait on each HTTP request before giving up.
  REQUEST_TIMEOUT = 30

  def __init__(self):
    self.headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

  def get_ranking(self, page=1, date='1'):
    """Return the article hrefs found on ranking pages 1..``page``.

    Args:
      page: number of ranking pages to read.
      date: 'YYYYMMDD' string; the sentinel '1' means "today".

    Returns:
      List of href strings (HTML entities unescaped) taken from every ``<a>``
      inside the first 'hotNewsList' div of each page.

    Raises:
      IndexError: if a page has no 'hotNewsList' div.
    """
    if date == '1':
      date = datetime.datetime.now().strftime("%Y%m%d")

    li = []
    for page_no in range(1, page + 1):
      url = 'https://finance.naver.com/news/news_list.nhn?mode=RANK&date=' + date + '&page=' + str(page_no)
      result = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
      bs_obj = BeautifulSoup(result.content, "html.parser")

      div = bs_obj.find_all('div', {'class':'hotNewsList'})
      li.extend(html.unescape(x['href']) for x in div[0].find_all('a'))
    return li

  def get_article(self, url):
    """Fetch one article and return ``(title, subheading, text)``.

    Args:
      url: href as returned by ``get_ranking``; it is appended to
        'https://finance.naver.com/'.

    Returns:
      title: whitespace-normalised text of the first ``<h3>`` in 'article_info'.
      subheading: text of the first ``<strong>`` in 'articleCont', or '' if none.
      text: whitespace-normalised body, cut at the first '@' when there is one
        (presumably the reporter's e-mail that starts the footer -- confirm),
        with the subheading removed.
    """
    url = 'https://finance.naver.com/' + url
    result = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
    bs_obj = BeautifulSoup(result.content, "html.parser")

    title = ' '.join(bs_obj.find_all('div', {'class': 'article_info'})[0].find_all('h3')[0].get_text().split())
    context = bs_obj.find_all('div', {'class': 'articleCont'})[0]

    strong_tags = context.find_all('strong')
    subheading = strong_tags[0].get_text() if strong_tags else ''

    text = ' '.join(context.get_text().split())
    marker = text.find('@')
    # find() returns -1 when there is no '@'; slicing with -1 would silently
    # drop the last character, so only cut when the marker exists.
    if marker != -1:
      text = text[:marker]
    text = text.replace(subheading, '')
    return title, subheading, text

# Collect the links on the first five ranking pages, then fetch the articles
# in parallel worker processes.
news = News()
news_list = news.get_ranking(5)
with Pool() as p:
    li = p.map(news.get_article, news_list)

# Each row of the array is one (title, subheading, text) tuple.
li = np.array(li)
dic = {name: li[:, position] for position, name in enumerate(['Title', 'Subheading', 'Text'])}
df = pd.DataFrame(dic)
df

Unnamed: 0,Title,Subheading,Text
0,휴가철도 막바지…'백캉스'로 빵지순례 다녀볼까,[주말쇼핑포인트]롯데 개성주악·더현대 뵈르뵈르 아이스크림신세계 가루쌀 빵 팝업…집객...,전통 개성주악 디저트 '세화연' 팝업 이미지(롯데백화점 제공)(서울=뉴스1) 서미선...
1,중국인 단체관광길 열렸지만…과거 영광 되찾기는 ‘글쎄’ [언박싱],유커의 귀환 ②,중국인의 한국행 단체관광이 전면 허용되면서 중국 관광객(유커·遊客)이 속속 입국하고...
2,나흘간 6500억 판 아트페어 또 열린다…미술시장 뛰어드는 유통가,"세계 3대 아트페어 프리즈, 오는 9월 두 번째 서울 행사국내 미술시장, 지난해 1...",지난해 9월 코엑스에서 열린 아트페어 '프리즈 서울'을 찾은 관람객들이 전시장 입장...
3,"디폴트 위기의 中 비구이위안, 홍콩 항셍지수서 제외",,중국 대형 부동산 개발업체 비구이위안(碧桂園·컨트리가든)이 채무불이행(디폴트) 위기...
4,나스닥 4일 연속 하락…휴가시즌 낙관론에 대한 의문 [뉴욕마감],,뉴욕 타임스퀘어 나스닥 마켓플레이스뉴욕증시에서 3대 지수가 이번주 마지막 거래일을 ...
...,...,...,...
118,"""너무 줄였나"" 증권사, 지점 감축 바람… 삼성 3년새 '반토막'",,국내 증권사들이 오프라인 지점을 줄이고 온라인 채널 확대에 나섰다. 사진은 서울 여...
119,보험권 '50년 만기 주담대' 시대 열리나… 삼성화재·생명도 합류,,대형 보험사들이 50년 만기 주담대를 속속 출시하며 보험권에 전반적으로 확산될지 이...
120,'헝다 위기감 고조' 中 펀드도 울상… 한달새 4000억 빠졌다,,중국 전에 있는 중국 에버그란데 그룹 본사./사진=로이터 국내 중국 주식형 펀드에서...
121,"7000만원대 GV80, 카드 할부로 살까… 신한 vs KB국민, 車할부 키운다",,사진=이미지투데이 신차 할부금융 시장에 카드사들이 드라이브를 걸고 있다. 카드사들은...


In [None]:
import random
import re

# Shared KoNLPy Hannanum tagger instance; later cells call its .nouns() on cleaned titles.
hannanum = Hannanum()

def clean_title(title):
    """Return ``title`` with every listed punctuation mark, quote and arrow replaced by one space."""
    separators = (',', '(', ')', '…', '?', '.', "'", '"', '”', '“',
                  '‘', '’', '·', '[', ']', '↓', '→', '↑')
    blank_out = str.maketrans({mark: ' ' for mark in separators})
    return title.translate(blank_out)

def rank_words(words_list):
  """Count the words and return ``{word: count}`` ordered by count, highest first.

  Words with equal counts keep the order in which they first appeared.
  """
  counts = {}
  for token in words_list:
    counts[token] = counts.get(token, 0) + 1
  # sorted() is stable, so ties stay in first-seen order.
  by_count_desc = sorted(counts.items(), key=lambda pair: -pair[1])
  return dict(by_count_desc)

def standardize_values(dictionary):
    """Rescale the values of ``dictionary`` linearly onto 10..100, in place.

    The smallest value maps to 10 and the largest to 100; results are rounded
    to integers. When every value is identical there is no range to stretch,
    so all values map to the lower bound 10. An empty dictionary is returned
    unchanged.

    Returns:
        The same dictionary object that was passed in.
    """
    if not dictionary:
        return dictionary

    max_value = max(dictionary.values())
    min_value = min(dictionary.values())
    span = max_value - min_value
    # Guard against division by zero when all values are equal.
    scale = 90 / span if span else 0

    for key in dictionary:
        dictionary[key] = round(10 + (dictionary[key] - min_value) * scale)

    return dictionary

def custom_standardize_values(dictionary, new_min, new_max, noise_min, noise_max):
    """Return a new dict with values rescaled onto ``new_min..new_max`` plus random noise.

    Each value is mapped linearly (smallest -> ``new_min``, largest ->
    ``new_max``), a noise term drawn from ``random.uniform(noise_min,
    noise_max)`` is added, and the sum is rounded to zero decimals (still a
    float). When every value is identical, all values map to ``new_min``
    before noise. An empty dictionary yields an empty dictionary. The input
    is not modified.
    """
    if not dictionary:
        return {}

    max_value = max(dictionary.values())
    min_value = min(dictionary.values())
    span = max_value - min_value

    standardized_dict = {}
    for key, value in dictionary.items():
        if span:
            # Linear transformation to scale the values
            standardized_value = new_min + (value - min_value) * (new_max - new_min) / span
        else:
            # No spread to stretch: avoid dividing by zero.
            standardized_value = new_min

        # Add random noise
        noise = random.uniform(noise_min, noise_max)
        standardized_dict[key] = round(standardized_value + noise, 0)

    return standardized_dict

def extract_nouns(text, custom_nouns_list):
    """Count how often each entry of ``custom_nouns_list`` occurs as a whole word in ``text``.

    Words are the ``\\w+`` runs of ``text``, so an entry containing a space or
    punctuation can never match.

    Returns:
        ``{noun: count}`` for the entries that occur at least once.
    """
    # Set lookup instead of scanning the whole noun list for every word.
    wanted = set(custom_nouns_list)

    filtered_nouns = {}
    for word in re.findall(r'\b\w+\b', text):
        if word in wanted:
            filtered_nouns[word] = filtered_nouns.get(word, 0) + 1

    return filtered_nouns

In [None]:
# Strip punctuation from every title, extract nouns with Hannanum, then count
# the nouns across all titles and rescale the counts.
df['Cleaned_Title'] = df['Title'].map(clean_title)
df['Filtered_Pos'] = [hannanum.nouns(title) for title in df['Cleaned_Title'].tolist()]
words = [noun for nouns in df['Filtered_Pos'] for noun in nouns]
word_frequency_dict = rank_words(words)
result = standardize_values(word_frequency_dict)

In [None]:
# Drop stop words and purely numeric tokens from the scaled word weights.
stop_words = ['의', '등', '때', '오늘', '이곳', '저', '월', '년', '일', '로', '어디', '데', '특징주', '장중', '전', '주', '위', '한', '수',' 속', '전', '추', '속', '내', '비트코']
data_dict = {
    word: weight
    for word, weight in result.items()
    if word not in stop_words and not str(word).isdigit()
}

In [None]:
# Set the stock names to look for: every listed name plus two crypto assets.

custom_nouns = stocklist.df['종목명'].tolist() + ['비트코인', '이더리움']
title_as_sentence = ' '.join(df['Cleaned_Title'])
filtered_nouns = extract_nouns(title_as_sentence, custom_nouns)
# Mentioned names are weighted on 40..80 plus 5..15 of random noise.
ticker_dict = custom_standardize_values(filtered_nouns, 40, 80, 5, 15)

In [None]:
# Merge the stock-name weights into the word weights; on a shared key the ticker_dict value wins.
data_dict.update(ticker_dict)

In [None]:
# One-column ('value') table indexed by word, sorted by weight descending, top 50 rows only.
t = pd.DataFrame(data_dict, index=['value']).T.sort_values(by='value', ascending=False).head(50)

In [None]:
# Display the word-weight table.
t

Unnamed: 0,value
中,100.0
비트코인,87.0
뉴욕증시,68.0
셀트리온,65.0
부동산,61.0
SK텔레콤,55.0
이마트,55.0
삼성화재,55.0
레인보우로보틱스,54.0
네패스아크,54.0


In [None]:
from google.colab import files

# Save the word weights as a header-less CSV (utf-8-sig = UTF-8 with a BOM) and hand it to Colab's download helper.
t.to_csv('words_df.csv', header=False, encoding="utf-8-sig")
files.download('words_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>