<a href="https://colab.research.google.com/github/yeonghun00/stock_public/blob/main/issue_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
from dateutil.relativedelta import *
import html
import re
import ast
from multiprocessing import Pool
from collections import Counter

In [2]:
class Issue_anlayser:
  def __init__(self, start_date, end_date):
    self.start_date = start_date
    self.end_date = end_date
    self.headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

    self.code_df = self.get_codes()
    self.issue_df = self.get_formatted(self.get_issues())
    self.freq_codes = self.get_freq_codes(self.issue_df)
  
  def get_codes(self):
    kospi_url = 'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13&marketType=%sstockMkt'
    kosdaq_url = 'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13&marketType=%skosdaqMkt'
    kospi_df = pd.read_html(kospi_url, header=0)[0]
    kosdaq_df = pd.read_html(kosdaq_url, header=0)[0]
    df = pd.concat([kospi_df, kosdaq_df], ignore_index=True)
    return df

  def get_issue(self, page):
    url = 'https://finance.naver.com/news/news_search.nhn?rcdate=&q=%C6%AF%C2%A1%C1%D6' +\
    '&sm=title.basic&pd=4&stDateStart=' + self.start_date + '&stDateEnd=' + self.end_date + '&page=' + str(page)
    result = requests.get(url, headers = self.headers)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    text = [x.get_text() for x in bs_obj.find_all(re.compile('d*'), {'class':"articleSubject"})] 
    return text

  # 위와 다름 유의
  def get_issues(self):
    url = 'https://finance.naver.com/news/news_search.nhn?rcdate=&q=%C6%AF%C2%A1%C1%D6' +\
    '&sm=title.basic&pd=4&stDateStart=' + self.start_date + '&stDateEnd=' + self.end_date + '&page=1'
    result = requests.get(url, headers = self.headers)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    href = bs_obj.find_all('td', {'class':'pgRR'})[0].find_all('a')[0]['href']
    href_index = href.find('page')
    end_page = int(href[href_index+5:]) + 1
    end_page = 201
    print(end_page)

    with Pool(1000) as p:
        li = p.map(self.get_issue, list(range(1, end_page)))
    return sum(li, [])

  def get_formatted(self, li):
    text_lis = [x.split() for x in li]
    word_lis = []

    for text_li in text_lis:
      for i in ['[',']', '·', '(', ')', '‘', '’','\'', '"', ',','...','…', '특징주']:
        text_li = [x.replace(i, ' ') for x in text_li]

      for j in ['↑', '↓']:
        text_li = [x.replace(j,(' '+j)) for x in text_li]
      word_lis.append((' '.join(text_li)).split())

    return word_lis

  def get_freq_codes(self, li):
    flatten_news_words = [j for sub in li for j in sub]
    news_words_frq = Counter(flatten_news_words).most_common()

    set_codes = set(self.code_df['회사명'].values)

    freq_codes = []
    for i in range(len([x[0] for x in news_words_frq])):
      if news_words_frq[i][0] in set_codes:
        freq_codes.append(news_words_frq[i])

    return freq_codes

In [3]:
def get_price(code, start:int='20180501', interval='day'):
    url ='https://api.finance.naver.com/siseJson.naver?symbol=' + code + ' &requestType=1&startTime=' \
    + start + '&endTime=' + str(datetime.datetime.now().date().strftime("%Y%m%d")) +  '&timeframe=' + interval

    result = requests.get(url, headers={'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'})
    bs_obj = BeautifulSoup(result.content, "html.parser")
    b = bs_obj.get_text()
    for i in ['\n', '\t', "\\", ' ']:
      b = b.replace(i,'')

    data = np.array(ast.literal_eval(b)[1:])

    dic = {'Date':list(map(lambda x: datetime.datetime.strptime(str(x), '%Y%m%d'), data[:,0])), \
      'Open':np.array(data[:,1], int), 'High':np.array(data[:,2], int), 'Low':np.array(data[:,3], int),\
       'Close':np.array(data[:,4], int), 'Volume':np.array(data[:,5], int)}

    df = pd.DataFrame(data=dic)
    df = df.set_index('Date')
    return df

def find_date(date, date_idx):
  date = date
  while date not in date_idx:
    date = date + datetime.timedelta(days=1)
  return date

def get_test_dates():
  li = []
  start = datetime.datetime(2016, 1, 1)
  end = datetime.datetime(2020, 1, 1)

  date = start
  while date != end:
    li.append(date)
    date += relativedelta(months=+1)

  return list(zip(li, [x+relativedelta(months=+1) for x in li]))

In [4]:
ia = Issue_anlayser('2000-01-01', '2021-06-14')
dic ={'text':ia.issue_df}
df = pd.DataFrame.from_dict(dic)

201


In [18]:
'''from google.colab import files
df.to_csv('news.csv', encoding='utf-8-sig')

files.download("news.csv")'''

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
df

Unnamed: 0,text
0,"[시총, 3, 4위, 네이버, 카카오, 엎치락뒤치락, 종합]"
1,"[화승코퍼레이션, 전기차, 업황, 호조, 기대감에, 급등세]"
2,"[일일, 확진자, 300명대, 리조트, 관련주, 서부T&D, 급등]"
3,"[우리손에프앤지, 돼지고기값, 폭등에, 상승, 6%, ↑]"
4,"[텔레칩스, 현대차, 반도체, 국산화, 추진, 소식에, 파운드리, 유력, 후보, 급부상]"
...,...
3995,"[윤석열, 검찰총장, 사퇴에, 테마주, 줄줄이, 상한가, 종합]"
3996,"[윤석열, 검찰총장직, 사의, 표명, 윤석열, 테마주, 급등]"
3997,"[SK케미칼, AZ, 백신, 접종, 후, 사망, 사례, 늘며, 8%대, 약세]"
3998,"[사퇴, 선언, 윤석열, 총장, 테마주, 급등세]"


In [30]:
from gensim.models import Word2Vec
model = Word2Vec(sentences = df['text'].values, size = 300, window = 10, min_count = 5, workers = 1, sg = 0)

In [31]:
model.wv.vectors.shape

(956, 300)

In [32]:
for i in (model.wv.most_similar("윤석열")):
  print(i)

('강세', 0.9999161958694458)
('급등', 0.9999116063117981)
('소식에', 0.9999104738235474)
('상승', 0.999908447265625)
('급락', 0.9999080896377563)
('부각', 0.9999035000801086)
('상장', 0.9999032020568848)
('상한가', 0.9999032020568848)
('↑', 0.9999027252197266)
('코로나19', 0.9999023675918579)


In [15]:
hold_period = 20
dic = {'date':[],'codes':[],'profit':[]}
for i in get_test_dates():
  print(i)
  dic['date'].append(i)
  start_date = i[0]
  end_date = i[1]
  result_date = end_date+datetime.timedelta(days=hold_period)

  ia = Issue_anlayser(start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))

  # format
  code_names = [x[0] for x in ia.freq_codes][:5]
  print(code_names)
  code_idx = list(map(lambda code: [x for x in ia.code_df['회사명']].index(code), code_names))
  codes = [str(x).rjust(6, '0') for x in ia.code_df['종목코드'][code_idx]]
  dic['codes'].append(codes)

  mean = []
  profits = []
  for code in codes:
    price_df = get_price(code, '20140501')
    idx = price_df.index
    start = find_date(start_date, idx)
    end = find_date(end_date, idx)
    result = find_date(result_date, idx)

    profit = price_df.loc[result]['Close'] / price_df.loc[end]['Close']
    profits.append(profit)
    print(code, ': ', profit)
    mean.append(price_df.loc[result]['Close'] / price_df.loc[end]['Close'])
  dic['profit'].append(profits)
  print(sum(mean) / len(mean))

(datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0))
201


Process ForkPoolWorker-2238:
Process ForkPoolWorker-2579:
Process ForkPoolWorker-2784:
Process ForkPoolWorker-2050:
Process ForkPoolWorker-2069:
Process ForkPoolWorker-2074:
Process ForkPoolWorker-2039:
Process ForkPoolWorker-2077:
Process ForkPoolWorker-2030:
Process ForkPoolWorker-2047:
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

Process ForkPoolWorker-2083:
Process ForkPoolWorker-2038:
Process ForkPoolWorker-2080:
Process ForkPoolWorker-2036:
Process ForkPoolWorker-2065:
Process ForkPoolWorker-2037:
Process ForkPoolWorker-2035:
Process ForkPoolWorker-2336:
Process ForkPoolWorker-2184:
Process ForkPoolWorker-2756:
Traceback (most recent call last):
Process ForkPoolWorker-2040:
Process ForkPoolWorker-2093:
Process ForkPoolWorker-2104:
Process ForkPoolWorker-2091:
Process ForkPoolWorker-2064:
Process ForkPoolWorker-2102:
Process ForkPoolWorker-2095:
Process ForkPoolWorker-2081:
Process ForkPoolWorker-2088:
Process ForkPoolWo

Traceback (most recent call last):
  File "<ipython-input-2-f2ec0b545fef>", line 38, in get_issues
    li = p.map(self.get_issue, list(range(1, end_page)))
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 651, in get
    self.wait(timeout)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 648, in wait
    self._event.wait(timeout)
  File "/usr/lib/python3.7/threading.py", line 552, in wait
    signaled = self._cond.wait(timeout)
  File "/usr/lib/python3.7/threading.py", line 296, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-7b57e14ccf27>", line 1

KeyboardInterrupt
Process ForkPoolWorker-2052:
Process ForkPoolWorker-2106:
  File "/usr/local/lib/python3.7/dist-packages/bs4/builder/__init__.py", line 163, in _replace_cdata_list_attribute_values
    for attr in list(attrs.keys()):
Traceback (most recent call last):
Process ForkPoolWorker-2033:
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-2-f2ec0b545fef>", line 22, in get_issue
    bs_obj = BeautifulSoup(result.content, "html.parser")
  File "/usr/local/lib/python3.7/dist-packages/bs4/__init__.py", line 343, in _feed
    self.builder.feed(self.markup)
  File "/usr/local/lib/python3.7/

TypeError: ignored

In [43]:
from scipy.stats.mstats import gmean
gmean([sum(x)/5 for x in pd.DataFrame.from_dict(dic)['profit'].values])

1.0179875166566366

In [None]:

# 한달, 20, 5개: 1.0179875166566366

In [8]:
li = ['PTC', '히터', '테슬러', '납품', '우리산업', '상한가', '도달', '30%↑']

In [11]:
s = ' '.join(li)
s

'PTC 히터 테슬러 납품 우리산업 상한가 도달 30%↑'

In [12]:
for i in s:
    for j in ['↑', '↓']:
      s = s.replace(j,(' '+j))

In [14]:
s.split()

['PTC', '히터', '테슬러', '납품', '우리산업', '상한가', '도달', '30%', '↑']