<a href="https://colab.research.google.com/github/yeonghun00/stock_public/blob/main/issue_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
from dateutil.relativedelta import *
import html
import re
import ast
from multiprocessing import Pool
from collections import Counter

In [57]:
class Issue_anlayser:
  def __init__(self, start_date, end_date):
    self.start_date = start_date
    self.end_date = end_date
    self.headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

    self.code_df = self.get_codes()
    self.issue_df = self.get_formatted(self.get_issues())
    self.freq_codes = self.get_freq_codes(self.issue_df)
  
  def get_codes(self):
    kospi_url = 'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13&marketType=%sstockMkt'
    kosdaq_url = 'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13&marketType=%skosdaqMkt'
    kospi_df = pd.read_html(kospi_url, header=0)[0]
    kosdaq_df = pd.read_html(kosdaq_url, header=0)[0]
    df = pd.concat([kospi_df, kosdaq_df], ignore_index=True)
    return df

  def get_issue(self, page):
    url = 'https://finance.naver.com/news/news_search.nhn?rcdate=&q=%C6%AF%C2%A1%C1%D6' +\
    '&sm=title.basic&pd=4&stDateStart=' + self.start_date + '&stDateEnd=' + self.end_date + '&page=' + str(page)
    result = requests.get(url, headers = self.headers)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    text = [x.get_text() for x in bs_obj.find_all(re.compile('d*'), {'class':"articleSubject"})] 
    return text

  # 위와 다름 유의
  def get_issues(self):
    url = 'https://finance.naver.com/news/news_search.nhn?rcdate=&q=%C6%AF%C2%A1%C1%D6' +\
    '&sm=title.basic&pd=4&stDateStart=' + self.start_date + '&stDateEnd=' + self.end_date + '&page=1'
    result = requests.get(url, headers = self.headers)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    href = bs_obj.find_all('td', {'class':'pgRR'})[0].find_all('a')[0]['href']
    href_index = href.find('page')
    end_page = int(href[href_index+5])

    with Pool(100) as p:
        li = p.map(self.get_issue, list(range(1, end_page)))
    return sum(li, [])

  def get_formatted(self, li):
    text_lis = [x.split() for x in li]
    word_lis = []

    for text_li in text_lis:
      for i in ['[',']', '·', '(', ')', '‘', '’','\'', '"', ',','...','…', '특징주']:
        text_li = [x.replace(i, ' ') for x in text_li]
      word_lis.append((' '.join(text_li)).split())

    return word_lis

  def get_freq_codes(self, li):
    flatten_news_words = [j for sub in li for j in sub]
    news_words_frq = Counter(flatten_news_words).most_common()

    set_codes = set(self.code_df['회사명'].values)

    freq_codes = []
    for i in range(len([x[0] for x in news_words_frq])):
      if news_words_frq[i][0] in set_codes:
        freq_codes.append(news_words_frq[i])

    return freq_codes

In [41]:
def get_price(code, start:int='20180501', interval='day'):
    url ='https://api.finance.naver.com/siseJson.naver?symbol=' + code + ' &requestType=1&startTime=' \
    + start + '&endTime=' + str(datetime.datetime.now().date().strftime("%Y%m%d")) +  '&timeframe=' + interval

    result = requests.get(url, headers={'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'})
    bs_obj = BeautifulSoup(result.content, "html.parser")
    b = bs_obj.get_text()
    for i in ['\n', '\t', "\\", ' ']:
      b = b.replace(i,'')

    data = np.array(ast.literal_eval(b)[1:])

    dic = {'Date':list(map(lambda x: datetime.datetime.strptime(str(x), '%Y%m%d'), data[:,0])), \
      'Open':np.array(data[:,1], int), 'High':np.array(data[:,2], int), 'Low':np.array(data[:,3], int),\
       'Close':np.array(data[:,4], int), 'Volume':np.array(data[:,5], int)}

    df = pd.DataFrame(data=dic)
    df = df.set_index('Date')
    return df

def find_date(date, date_idx):
  date = date
  while date not in date_idx:
    date = date + datetime.timedelta(days=1)
  return date

def get_test_dates():
  li = []
  start = datetime.datetime(2016, 1, 1)
  end = datetime.datetime(2020, 1, 1)

  date = start
  while date != end:
    li.append(date)
    date += relativedelta(months=+1)

  return list(zip(li, [x+relativedelta(months=+1) for x in li]))

In [58]:
ia = Issue_anlayser('2021-01-01', '2021-02-01')

In [59]:
len(ia.issue_df)

140

In [44]:
hold_period = 20
dic = {'date':[],'codes':[],'profit':[]}
for i in get_test_dates():
  print(i)
  dic['date'].append(i)
  start_date = i[0]
  end_date = i[1]
  result_date = end_date+datetime.timedelta(days=hold_period)

  ia = Issue_anlayser(start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))

  # format
  code_names = [x[0] for x in ia.freq_codes][:5]
  print(code_names)
  code_idx = list(map(lambda code: [x for x in ia.code_df['회사명']].index(code), code_names))
  codes = [str(x).rjust(6, '0') for x in ia.code_df['종목코드'][code_idx]]
  dic['codes'].append(codes)

  mean = []
  profits = []
  for code in codes:
    price_df = get_price(code, '20140501')
    idx = price_df.index
    start = find_date(start_date, idx)
    end = find_date(end_date, idx)
    result = find_date(result_date, idx)

    profit = price_df.loc[result]['Close'] / price_df.loc[end]['Close']
    profits.append(profit)
    print(code, ': ', profit)
    mean.append(price_df.loc[result]['Close'] / price_df.loc[end]['Close'])
  dic['profit'].append(profits)
  print(sum(mean) / len(mean))

(datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0))
['한샘', '현대로템', '호텔신라', '삼성에스디에스', '한국항공우주']
009240 :  1.016304347826087
064350 :  1.0702479338842976
008770 :  1.0419790104947526
018260 :  0.9035874439461884
047810 :  0.9546079779917469
0.9973453428286145
(datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 3, 1, 0, 0))
['큐리언트', '삼성물산', 'LG전자', '삼부토건', '웅진']
115180 :  0.9388412017167382
028260 :  0.9741100323624595
066570 :  0.9675925925925926
001470 :  1.0
016880 :  1.024390243902439
0.980986814114846
(datetime.datetime(2016, 3, 1, 0, 0), datetime.datetime(2016, 4, 1, 0, 0))
['SK머티리얼즈', '티씨케이', '일진다이아', 'KB금융', '코데즈컴바인']
036490 :  1.0305785123966942
064760 :  1.0352303523035231
081000 :  1.0150384315472876
105560 :  1.0015552099533438
047770 :  0.7103004291845494
0.9585405870770796
(datetime.datetime(2016, 4, 1, 0, 0), datetime.datetime(2016, 5, 1, 0, 0))


KeyboardInterrupt: ignored

In [43]:
from scipy.stats.mstats import gmean
gmean([sum(x)/5 for x in pd.DataFrame.from_dict(dic)['profit'].values])

1.0179875166566366

In [None]:

# 한달, 20, 5개: 1.0179875166566366