<a href="https://colab.research.google.com/github/yeonghun00/stock_public/blob/main/issue_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [140]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import html
import re
import ast
from multiprocessing import Pool
from collections import Counter

In [168]:
class Issue_anlayser:
  """Scrape Naver Finance news-search headlines for a date range and count
  how often each listed company name appears in them.

  Construction performs all the work (network access included):
    * ``code_df``    - company list downloaded from KRX KIND (KOSPI + KOSDAQ),
    * ``issue_df``   - list of cleaned word lists, one per headline,
    * ``freq_codes`` - ``(company_name, count)`` tuples, most frequent first.

  Args:
    start_date: first day of the search window as ``'YYYY-MM-DD'``, or the
      sentinel ``'1'`` meaning "today".
    end_date: last day of the search window, same format / sentinel.
  """

  # Search endpoint with the fixed query string. The ``q`` value is
  # percent-encoded; presumably EUC-KR for '특징주' - TODO confirm.
  SEARCH_URL = 'https://finance.naver.com/news/news_search.nhn?rcdate=&q=%C6%AF%C2%A1%C1%D6' + \
      '&sm=title.basic&pd=4'

  def __init__(self, start_date, end_date):
    self.start_date = start_date
    self.end_date = end_date
    self.headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

    self.code_df = self.get_codes()
    self.issue_df = self.get_formatted(self.get_issues())
    self.freq_codes = self.get_freq_codes(self.issue_df)

  def get_codes(self):
    """Download the KOSPI and KOSDAQ company lists and return them as one DataFrame.

    The original URLs carried a leftover ``%s`` format placeholder in front of
    the market name (``marketType=%sstockMkt``); it is removed here so each
    request names its market explicitly.
    """
    kospi_url = 'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13&marketType=stockMkt'
    kosdaq_url = 'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13&marketType=kosdaqMkt'
    kospi_df = pd.read_html(kospi_url, header=0)[0]
    kosdaq_df = pd.read_html(kosdaq_url, header=0)[0]
    # ignore_index gives a clean 0..n-1 index, so positions and labels coincide.
    return pd.concat([kospi_df, kosdaq_df], ignore_index=True)

  def _resolve_dates(self):
    """Replace the ``'1'`` sentinel in start/end date with today's date."""
    today = datetime.datetime.now().date().strftime("%Y-%m-%d")
    if self.start_date == '1':
      self.start_date = today
    if self.end_date == '1':
      self.end_date = today

  def _search_url(self, page):
    """Build the news-search URL for result page ``page`` of the date window."""
    self._resolve_dates()
    return (self.SEARCH_URL + '&stDateStart=' + self.start_date +
            '&stDateEnd=' + self.end_date + '&page=' + str(page))

  @staticmethod
  def _last_page_from_href(href):
    """Extract the full ``page=N`` number from a pagination link.

    Returns 1 when the link carries no page number.
    """
    match = re.search(r'\bpage=(\d+)', href)
    return int(match.group(1)) if match else 1

  def get_issue(self, page):
    """Return the headline texts found on one page of search results."""
    result = requests.get(self._search_url(page), headers = self.headers)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    # Any tag carrying the class counts; the original name filter
    # re.compile('d*') matched every tag name, so this is equivalent.
    return [x.get_text() for x in bs_obj.find_all(class_="articleSubject")]

  # Note: unlike get_issue, this first looks up the page count, then fetches every page.
  def get_issues(self):
    """Return the headline texts from every result page of the date window."""
    result = requests.get(self._search_url(1), headers = self.headers)
    bs_obj = BeautifulSoup(result.content, "html.parser")

    # The 'pgRR' cell holds the link to the last page.
    # NOTE(review): assumes that cell is present whenever further pages
    # exist; without it only page 1 is read - confirm against the site.
    last_cell = bs_obj.find('td', {'class':'pgRR'})
    last_link = last_cell.find('a') if last_cell is not None else None
    if last_link is not None and last_link.get('href'):
      end_page = self._last_page_from_href(last_link['href'])
    else:
      end_page = 1

    # Pages are 1-based and the last page is included.
    pages = list(range(1, end_page + 1))
    # Never start more workers than there are pages to fetch.
    with Pool(min(100, len(pages))) as p:
      li = p.map(self.get_issue, pages)
    return [title for page_titles in li for title in page_titles]

  def get_formatted(self, li):
    """Turn raw headlines into word lists with punctuation and '특징주' removed.

    Args:
      li: list of headline strings.

    Returns:
      One list of whitespace-separated words per headline.
    """
    noise = ['[',']', '·', '(', ')', '‘', '’','\'', '"', ',','...','…', '특징주']
    word_lis = []

    for headline in li:
      # Each noise token becomes a space so the words around it stay separated.
      for token in noise:
        headline = headline.replace(token, ' ')
      word_lis.append(headline.split())

    return word_lis

  def get_freq_codes(self, li):
    """Count headline words and keep those that are company names in ``code_df``.

    Args:
      li: list of word lists, as produced by ``get_formatted``.

    Returns:
      ``(company_name, count)`` tuples in ``Counter.most_common`` order.
    """
    news_words_frq = Counter(word for words in li for word in words).most_common()
    set_codes = set(self.code_df['회사명'].values)
    return [entry for entry in news_words_frq if entry[0] in set_codes]

In [169]:
def get_price(code, start: str = '20180501', interval='day'):
    """Download price history for one stock from Naver's siseJson endpoint.

    Args:
        code: stock code string used as the ``symbol`` query value.
        start: first date to request, formatted ``YYYYMMDD``.
        interval: value for the ``timeframe`` query parameter.

    Returns:
        DataFrame indexed by ``Date`` (datetime) with integer columns
        ``Open``, ``High``, ``Low``, ``Close`` and ``Volume``. The request
        always ends at today's date. An empty frame with the same columns is
        returned when the response holds no data rows.
    """
    columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    today = datetime.datetime.now().date().strftime("%Y%m%d")
    # No space after the symbol: the original URL had a stray ' ' before '&requestType'.
    url = ('https://api.finance.naver.com/siseJson.naver?symbol=' + code +
           '&requestType=1&startTime=' + start + '&endTime=' + today +
           '&timeframe=' + interval)

    result = requests.get(url, headers={'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'})
    bs_obj = BeautifulSoup(result.content, "html.parser")
    b = bs_obj.get_text()
    # Strip whitespace and backslashes so the payload parses as one Python literal.
    for i in ['\n', '\t', "\\", ' ']:
        b = b.replace(i, '')

    # literal_eval only accepts literals, so the response is never executed.
    # The first row is skipped; presumably it holds the column headers.
    rows = ast.literal_eval(b)[1:]
    if not rows:
        # Without this guard the 2-D slicing below fails on an empty array.
        return pd.DataFrame({name: np.array([], int) for name in columns},
                            index=pd.DatetimeIndex([], name='Date'))

    data = np.array(rows)

    dic = {'Date': [datetime.datetime.strptime(str(x), '%Y%m%d') for x in data[:, 0]]}
    # Columns 1..5 of each row map to Open, High, Low, Close, Volume in that order.
    for position, name in enumerate(columns, start=1):
        dic[name] = np.array(data[:, position], int)

    df = pd.DataFrame(data=dic)
    df = df.set_index('Date')
    return df

In [170]:
class Backtester:
  """Measure how prices moved over a holding period after a news window.

  Args:
    start_date: start of the news search window (stored, not used by ``test``).
    search_period: stored for callers; not used by the methods below.
    hold_period: number of calendar days between the entry and exit dates.
    loop_num: stored for callers; not used by the methods below.
  """

  def __init__(self, start_date, search_period, hold_period, loop_num):
    self.start_date = start_date
    self.search_period = search_period
    # Previously hard-coded to 20 regardless of the argument.
    self.hold_period = hold_period
    self.loop_num = loop_num

  def find_date(self, date, date_idx):
    """Return ``date`` itself if it is in ``date_idx``, else the next later date that is.

    Raises:
      ValueError: if ``date_idx`` is empty or holds no date on or after
        ``date`` (the unguarded loop would otherwise never terminate).
    """
    if len(date_idx) == 0:
      raise ValueError('date_idx is empty')
    last_date = max(date_idx)
    while date not in date_idx:
      if date > last_date:
        raise ValueError(f'no date on or after {date} in date_idx')
      date = date + datetime.timedelta(days=1)
    return date

  def format_to_codes(self, code_names, code_df=None):
    """Translate company names into zero-padded 6-character stock codes.

    Args:
      code_names: iterable of company names found in the '회사명' column.
      code_df: DataFrame with '회사명' and '종목코드' columns. Defaults to the
        ``code_df`` of the notebook-level analyser ``ia``.

    Returns:
      List of code strings in the same order as ``code_names``.

    Raises:
      ValueError: if a name is not present in ``code_df``.
    """
    if code_df is None:
      code_df = ia.code_df

    # First occurrence wins, matching list.index() in the original lookup.
    code_by_name = {}
    for name, raw_code in zip(code_df['회사명'], code_df['종목코드']):
      code_by_name.setdefault(name, raw_code)

    codes = []
    for name in code_names:
      if name not in code_by_name:
        raise ValueError(f'{name!r} is not in the company list')
      codes.append(str(code_by_name[name]).rjust(6, '0'))
    return codes

  def test(self, start_date, end_date, codes=None):
    """Compute the close-price ratio over the holding period for each code.

    For every code, the close on the first available date on or after
    ``end_date + hold_period`` days is divided by the close on the first
    available date on or after ``end_date``.

    Args:
      start_date: start of the news window; kept for interface
        compatibility, it does not enter the calculation.
      end_date: entry date as a ``datetime.datetime``.
      codes: stock codes to evaluate. Defaults to the ten most frequent
        companies of the notebook-level analyser ``ia``.

    Returns:
      List of ``(code, ratio)`` tuples, one per code.
    """
    if codes is None:
      top_names = [name for name, _ in ia.freq_codes][:10]
      codes = self.format_to_codes(top_names)

    result_date = end_date + datetime.timedelta(days=self.hold_period)

    li = []
    for code in codes:
      # One download per code; it serves both the entry and exit lookups.
      prices = get_price(code, '20180501')
      idx = prices.index
      end = self.find_date(end_date, idx)
      result = self.find_date(result_date, idx)
      li.append((code, prices.loc[result, 'Close'] / prices.loc[end, 'Close']))
    return li

In [184]:
# Analyse headlines published between 2021-03-01 and 2021-04-01.
ia = Issue_anlayser(start_date='2021-03-01', end_date='2021-04-01')

In [185]:
def find_date(date, date_idx):
  """Return ``date`` itself if it is in ``date_idx``, else the next later date that is.

  Args:
    date: ``datetime.datetime`` to look up.
    date_idx: collection of dates (for example a price frame's index).

  Returns:
    ``date``, advanced one day at a time until it is found in ``date_idx``.

  Raises:
    ValueError: if ``date_idx`` is empty or holds no date on or after
      ``date`` (the unguarded loop would otherwise never terminate).
  """
  if len(date_idx) == 0:
    raise ValueError('date_idx is empty')
  last_date = max(date_idx)
  while date not in date_idx:
    if date > last_date:
      raise ValueError(f'no date on or after {date} in date_idx')
    date = date + datetime.timedelta(days=1)
  return date

In [186]:
# Company name -> zero-padded 6-character stock code, for the ten most mentioned companies.
code_names = [x[0] for x in ia.freq_codes][:10]
# Build the name list once instead of once per lookup.
company_names = list(ia.code_df['회사명'])
code_idx = [company_names.index(name) for name in code_names]
# code_idx holds positions, so select by position rather than by label.
codes = [str(x).rjust(6, '0') for x in ia.code_df['종목코드'].iloc[code_idx]]
codes

['096770',
 '000660',
 '066570',
 '333620',
 '005880',
 '287410',
 '003550',
 '005360',
 '263750',
 '003530']

In [188]:
# Holding-period check: close on (end_date + hold_period days) divided by close on end_date.
hold_period = 20
start_date = datetime.datetime(2021, 3, 1)
end_date = datetime.datetime(2021, 4, 1)
result_date = end_date+datetime.timedelta(days=hold_period)

mean = []
for code in codes:
  # Download the history once per code; it was previously fetched five times.
  prices = get_price(code, '20180501')
  idx = prices.index
  # Snap each target date to a date present in the price index.
  start = find_date(start_date, idx)  # computed as before; not used in the ratio
  end = find_date(end_date, idx)
  result = find_date(result_date, idx)

  ratio = prices.loc[result, 'Close'] / prices.loc[end, 'Close']
  print(code, ': ', ratio)
  mean.append(ratio)

096770 :  1.093167701863354
000660 :  0.9430604982206405
066570 :  1.0440251572327044
333620 :  0.839171974522293
005880 :  0.9738675958188153
287410 :  1.3112582781456954
003550 :  1.193859567758093
005360 :  1.049645390070922
263750 :  0.9894867037724181
003530 :  1.2079646017699115


In [189]:
# Arithmetic mean of the per-code price ratios collected in `mean`.
ratio_count = len(mean)
sum(mean) / ratio_count

1.0645507469174844