<a href="https://colab.research.google.com/github/yeonghun00/stock_public/blob/main/issue_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import html
import re
import ast
from collections import Counter

In [41]:
class Issue_anlayser:
  """Rank listed companies by how often they appear in news headlines.

  On construction the object:
    1. downloads the KOSPI and KOSDAQ company listings (``code_df``),
    2. scrapes Naver Finance news-search headlines for the date window and
       tokenises them (``issue_df`` - a list of word lists, despite the name),
    3. counts the words and keeps those that are listed company names
       (``freq_codes`` - ``(company_name, count)`` tuples, most frequent first).

  Args:
    start_date: first day of the search window as ``'YYYY-MM-DD'``; the
      string ``'1'`` is a shorthand for today.
    end_date: last day of the search window, same format and shorthand.
  """

  # Substrings blanked out of every headline before it is split into words.
  # Order matters only in that ',' is handled before '...'.
  _NOISE_TOKENS = ('[', ']', '·', '(', ')', '‘', '’', '\'', '"', ',', '...', '…', '특징주')

  _USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'

  # Seconds to wait for the news server before giving up on a request.
  _REQUEST_TIMEOUT = 10

  def __init__(self, start_date, end_date):
    self.start_date = start_date
    self.end_date = end_date

    # NOTE: all three steps below perform their work eagerly; the first two
    # hit the network.
    self.code_df = self.get_codes()
    self.issue_df = self.get_formatted(self.get_issues())
    self.freq_codes = self.get_freq_codes(self.issue_df)

  def get_codes(self):
    """Download the KOSPI and KOSDAQ listings and return them as one DataFrame.

    Returns:
      The two listing tables concatenated with a fresh integer index. The
      rest of the class relies on a '회사명' (company name) column.
    """
    # The original URLs had a stray, never-substituted '%s' in front of the
    # market name ('marketType=%sstockMkt'); it is dropped here so the query
    # carries the plain market identifier.
    listing_url = 'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13&marketType='
    frames = [
      pd.read_html(listing_url + market, header=0)[0]
      for market in ('stockMkt', 'kosdaqMkt')
    ]
    return pd.concat(frames, ignore_index=True)

  def get_issues(self, keyword='%C6%AF%C2%A1%C1%D6'):
    """Scrape headline texts from Naver Finance news search.

    Pages are fetched one after another until a page yields no headline.
    As a side effect, a ``'1'`` placeholder in ``self.start_date`` /
    ``self.end_date`` is replaced by today's date.

    Args:
      keyword: already percent-encoded search term inserted verbatim into
        the query string (the default is presumably the EUC-KR encoding of
        '특징주' - TODO confirm).

    Returns:
      A flat list with the text of every ``articleSubject`` element found.

    Raises:
      requests.exceptions.RequestException: on connection failure or timeout.
    """
    today = datetime.date.today().isoformat()
    if self.start_date == '1':
      self.start_date = today
    if self.end_date == '1':
      self.end_date = today

    # Everything except the page number is constant across requests.
    base_url = (
      'https://finance.naver.com/news/news_search.nhn?rcdate=&q=' + keyword
      + '&sm=title.basic&pd=4&stDateStart=' + self.start_date
      + '&stDateEnd=' + self.end_date + '&page='
    )
    headers = {'User-Agent': self._USER_AGENT}

    titles = []
    for page in range(1, 10000):
      result = requests.get(base_url + str(page), headers=headers,
                            timeout=self._REQUEST_TIMEOUT)
      bs_obj = BeautifulSoup(result.content, "html.parser")
      # The original filtered tag names with re.compile('d*'), which matches
      # every tag name; searching by class alone selects the same elements.
      page_titles = [tag.get_text() for tag in bs_obj.find_all(class_="articleSubject")]
      if not page_titles:
        break
      titles.extend(page_titles)
    return titles

  def get_formatted(self, li):
    """Tokenise headlines into words after stripping punctuation noise.

    Args:
      li: iterable of headline strings.

    Returns:
      One list of whitespace-separated words per headline, with every
      occurrence of the noise tokens replaced by a space beforehand.
    """
    word_lis = []
    for headline in li:
      # No noise token contains whitespace, so replacing on the whole
      # headline gives the same words as replacing word by word.
      for token in self._NOISE_TOKENS:
        headline = headline.replace(token, ' ')
      word_lis.append(headline.split())
    return word_lis

  def get_freq_codes(self, li):
    """Count words across all headlines and keep the listed company names.

    Args:
      li: list of word lists, as produced by ``get_formatted``.

    Returns:
      ``(word, count)`` tuples for every word that equals a value of
      ``self.code_df['회사명']``, ordered from most to least frequent.
    """
    listed_names = set(self.code_df['회사명'].values)
    word_counts = Counter(word for words in li for word in words)
    return [pair for pair in word_counts.most_common() if pair[0] in listed_names]

In [53]:
def get_price(code, start: str = '20180501', interval='day'):
    """Fetch price history for one symbol from Naver's siseJson endpoint.

    Args:
        code: symbol string inserted into the query (concatenated as text).
        start: first date of the range as ``'YYYYMMDD'``. The range always
            ends today.
        interval: value of the ``timeframe`` query parameter. Dates in the
            response are parsed as ``%Y%m%d``, so other timeframes work only
            if they use that date format - TODO confirm.

    Returns:
        DataFrame indexed by ``Date`` with integer columns ``Open``,
        ``High``, ``Low``, ``Close`` and ``Volume``, taken from response
        columns 0-5 in that order. Empty (same columns) when the response
        holds no data rows.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    price_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

    end = datetime.date.today().strftime("%Y%m%d")
    # The original literal had a stray space before '&requestType', which
    # ended up inside the symbol value; it is removed here.
    url = (
        'https://api.finance.naver.com/siseJson.naver?symbol=' + code
        + '&requestType=1&startTime=' + start
        + '&endTime=' + end + '&timeframe=' + interval
    )

    result = requests.get(
        url,
        headers={'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'},
        timeout=10,
    )
    bs_obj = BeautifulSoup(result.content, "html.parser")
    payload = bs_obj.get_text()
    # Strip layout characters so the body parses as a single Python literal.
    for junk in ['\n', '\t', "\\", ' ']:
        payload = payload.replace(junk, '')

    # literal_eval only accepts literals, so the remote text is never executed.
    # The first element is skipped - presumably a header row.
    rows = ast.literal_eval(payload)[1:]
    if not rows:
        # np.array([]) is 1-D, so the column slicing below would raise
        # IndexError; return an empty frame of the same shape instead.
        return pd.DataFrame(
            {name: np.array([], int) for name in price_columns},
            index=pd.DatetimeIndex([], name='Date'),
        )

    data = np.array(rows)

    dic = {'Date': [datetime.datetime.strptime(str(day), '%Y%m%d') for day in data[:, 0]]}
    for position, name in enumerate(price_columns, start=1):
        dic[name] = np.array(data[:, position], int)

    df = pd.DataFrame(data=dic)
    df = df.set_index('Date')
    return df

In [None]:
# Top five stocks above, "1주" (one week - or possibly one share; the original
# note is ambiguous).
#
# NOTE(review): the original cell looped over an unfilled `[#date]`
# placeholder, which is a syntax error, and handed the (name, count) tuples
# from `freq_codes` straight to `get_price`, which needs a symbol string.
# Both are fixed below; extend `date_ranges` to analyse more windows.

date_ranges = [('2021-06-09', '2021-06-11')]  # (start_date, end_date) pairs

close_prices = {}  # (start_date, end_date, company name) -> close on 2018-05-02
for start_date, end_date in date_ranges:
  ia = Issue_anlayser(start_date, end_date)

  # assumes code_df has a '종목코드' (ticker) column next to '회사명', and that
  # tickers are six digits whose leading zeros may have been lost when the
  # table was parsed - TODO confirm against the downloaded listing.
  listing = ia.code_df.drop_duplicates('회사명').set_index('회사명')

  for name, _count in ia.freq_codes[:5]:
    ticker = str(listing.loc[name, '종목코드']).zfill(6)
    close_prices[(start_date, end_date, name)] = \
        get_price(ticker, '20180501').loc['2018-05-02']['Close']
