# 0. Default Setting

### Reference: http://blog.quantylab.com/crawling_naverfin_daycandle.html
#### Solution for "HTTP Error 404: Not Found": https://stackoverflow.com/questions/42441211/python-urllib-error-httperror-http-error-404-not-found

In [1]:
# !pip install bs4

In [2]:
import datetime
import traceback
import pandas as pd
import os

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request

In [3]:
### Hyper-parameters: crawl mode and the date window.
# `state` is either 'day' or 'time'.
state = 'time'
# Both bounds are stored as 'YYYY.MM.DD' strings.
_date_format = '%Y.%m.%d'
str_datefrom = datetime.datetime(2020, 6, 10).strftime(_date_format)
str_dateto = datetime.datetime.today().strftime(_date_format)
print(state, str_datefrom, '-', str_dateto)

time 2020.06.10 - 2021.01.13


In [4]:
# Directory that receives the crawled CSV files.
path_dir = './stock_data_new'
# exist_ok=True creates the directory when missing without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path_dir, exist_ok=True)

# Main-page URL templates; `{code}` is filled in with an index name
# (main_url_index) or an item code (main_url_item).
main_url_index = 'https://finance.naver.com/sise/sise_index.nhn?code={code}'
main_url_item = 'https://finance.naver.com/item/sise.nhn?code={code}'

# 1. Functions

## 1.1. Get html from url

In [5]:
def get_html(url, parser='html.parser'):
    """Download `url` and return it parsed as a BeautifulSoup document.

    The request is sent with a ``User-Agent: Mozilla/5.0`` header.
    NOTE(review): presumably this header is the workaround for the HTTP 404
    issue referenced at the top of the notebook -- confirm.

    Parameters
    ----------
    url : str
        Address of the page to download.
    parser : str, optional
        Parser name handed to BeautifulSoup. Naming it explicitly keeps the
        result independent of which optional parsers happen to be installed
        and avoids BeautifulSoup's "no parser was explicitly specified"
        warning. Defaults to the standard-library ``'html.parser'``.

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or ``None`` when building the request,
        downloading or parsing raised. In that case the traceback is printed
        and the exception is not propagated, so callers must check for
        ``None``.
    """
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the HTTP response even if parsing fails.
        with urlopen(req) as response:
            return BeautifulSoup(response, parser)
    except Exception:
        traceback.print_exc()
    return None

In [6]:
# Smoke test: list every <iframe> found on the KOSPI index main page.
kospi_main_url = main_url_index.format(code='KOSPI')
get_html(kospi_main_url).find_all('iframe')
# get_html(main_url_item.format(code='035420')).find_all('iframe')

[<iframe frameborder="0" height="0" id="autoFrame" marginheight="0" marginwidth="0" scrolling="no" src="/ac/reatcmp.nhn?menu=sise&amp;submenu=" style="display: none;" title="자동완성" width="400"></iframe>,
 <iframe frameborder="0" height="0" id="me_layers" name="test" scrolling="no" style="display:block;top: 22px; right: 209px; position: absolute; z-index: 15;" title="네이버미 영역" width="0"></iframe>,
 <iframe bottommargin="0" frameborder="0" height="265" marginheight="0" name="time" scrolling="no" src="/sise/sise_index_time.nhn?code=KOSPI&amp;thistime=20210113163400" title="시간별시세 영역" topmargin="0" width="100%"></iframe>,
 <iframe bottommargin="0" frameborder="0" height="265" marginheight="0" name="day" scrolling="no" src="/sise/sise_index_day.nhn?code=KOSPI" title="일별시세 영역" topmargin="0" width="100%"></iframe>,
 <iframe bottommargin="0" frameborder="0" height="305" marginheight="0" name="recent" scrolling="no" src="/template/right_item.nhn?code=KOSPI&amp;width=200" title="최근조회 영역" topmargin=

## 1.2. Get iframe url from the main page

In [7]:
def get_url(code, page=None):
    """Build the URL of the quote-table iframe for `code`.

    The main page for `code` is downloaded with `get_html`, and the ``src`` of
    the iframe whose ``title`` matches the current crawl mode is appended to
    ``https://finance.naver.com``. The mode is read from the module-level
    `state` variable: ``'day'`` selects one iframe title, any other value
    selects the other.

    Parameters
    ----------
    code : str
        ``'KOSPI'``, ``'KOSDAQ'``, ``'FUT'`` or ``'KPI200'`` use the index
        main page; any other value is formatted into the item main page.
    page : int, optional
        Page number appended as ``&page=<page>``. Falsy values (``None``,
        ``0``) return the URL without a page parameter.

    Returns
    -------
    str
        Absolute URL of the iframe document.

    Raises
    ------
    RuntimeError
        If the main page could not be downloaded (`get_html` returned
        ``None``) or it contains no iframe with the expected title.
    """
    if code in ['KOSPI', 'KOSDAQ', 'FUT', 'KPI200']:
        main_url = main_url_index
        iframe_title = '일별시세 영역' if state == 'day' else '시간별시세 영역'
    else:
        main_url = main_url_item
        iframe_title = '일별 시세' if state == 'day' else '주요 시세'
    attrs = {'title': iframe_title}
    main_page_url = main_url.format(code=code)

    soup = get_html(main_page_url)
    if soup is None:
        # get_html already printed the traceback; fail with a clear message
        # instead of an AttributeError on None.
        raise RuntimeError('could not download {url}'.format(url=main_page_url))

    iframe = soup.find('iframe', attrs=attrs)
    if iframe is None or not iframe.get('src'):
        raise RuntimeError('no iframe titled {title!r} with a src on {url}'.format(
                            title=iframe_title, url=main_page_url))
    url = 'https://finance.naver.com' + iframe['src']

    if page:
        return url + '&page={page}'.format(page=page)
    return url
    

In [8]:
# Demo: one index URL without a page and one item URL for page 1.
for demo_args in (('KOSPI',), ('035420', 1)):
    print(get_url(*demo_args))

https://finance.naver.com/sise/sise_index_time.nhn?code=KOSPI&thistime=20210113163501
https://finance.naver.com/item/sise_time.nhn?code=035420&thistime=20210113161104&page=1


## 1.3. Find last page

In [9]:
def _page_number_from_href(href):
    """Return the integer after the last '&page=' in `href`, or None if absent."""
    pieces = (href or '').rsplit('&page=', 1)
    if len(pieces) != 2:
        return None
    # Ignore any further query parameters that follow the page number.
    tail = pieces[1].split('&', 1)[0]
    try:
        return int(tail)
    except ValueError:
        return None


def find_last_pg(code):
    """Return the last page number of the quote table for `code`.

    The iframe URL from `get_url(code)` is printed and downloaded. The page
    number is read from the link inside the ``<td class="pgRR">`` cell. When
    that cell is missing or carries no usable link, the largest ``&page=N``
    value among all links on the page is used instead; a page without any such
    link is treated as having one page.

    Parameters
    ----------
    code : str
        Index name or item code, passed through to `get_url`.

    Returns
    -------
    int
        Number of the last page (at least 1 when no page link is found).

    Raises
    ------
    RuntimeError
        If the iframe document could not be downloaded.
    """
    url = get_url(code)
    print(url)
    soup = get_html(url)
    if soup is None:
        raise RuntimeError('could not download {url}'.format(url=url))

    el_td_last = soup.find("td", class_="pgRR")
    if el_td_last is not None and el_td_last.a is not None:
        pg_last = _page_number_from_href(el_td_last.a.get('href'))
        if pg_last is not None:
            return pg_last

    # No usable "pgRR" link: fall back to the highest page number linked
    # anywhere in the document.
    linked_pages = (_page_number_from_href(a.get('href')) for a in soup.find_all('a'))
    return max((pg for pg in linked_pages if pg is not None), default=1)

In [10]:
# Demo: last page number for one index and one item code.
for demo_code in ('KOSPI', '035420'):
    print(find_last_pg(demo_code))

https://finance.naver.com/sise/sise_index_time.nhn?code=KOSPI&thistime=20210113163501
66
https://finance.naver.com/item/sise_time.nhn?code=035420&thistime=20210113161104
40


## 1.4. Parse stock values from html table

In [11]:
def parse_page(code, page):
    """Download one page of the quote table for `code` as a DataFrame.

    The page URL comes from `get_url(code, page)` and is printed before it is
    downloaded. The first ``<table>`` of the document is parsed with
    ``pandas.read_html`` (first row as header, first column as index), and
    every row containing a missing value is dropped.

    Note that `get_url` downloads the main page on every call, so each call of
    this function performs two downloads.

    Parameters
    ----------
    code : str
        Index name or item code, passed through to `get_url`.
    page : int
        Page number, passed through to `get_url`.

    Returns
    -------
    pandas.DataFrame
        The parsed table without rows that contain NaN.

    Raises
    ------
    RuntimeError
        If the page could not be downloaded.
    ValueError
        If the page contains no ``<table>`` element.
    """
    # Local import so this cell does not depend on edits to the import cell.
    from io import StringIO

    url = get_url(code, page)
    print(url)

    _soup = get_html(url)
    if _soup is None:
        raise RuntimeError('could not download {url}'.format(url=url))

    _table = _soup.find("table")
    if _table is None:
        raise ValueError('No tables found on {url}'.format(url=url))

    # Recent pandas versions deprecate passing literal HTML to read_html;
    # a file-like wrapper is accepted by old and new versions alike.
    _df = pd.read_html(StringIO(str(_table)), header=0, index_col=0)[0]
    return _df.dropna()

In [12]:
# Demo: first page of the KOSPI index table.
parse_page(code='KOSPI', page=1)

https://finance.naver.com/sise/sise_index_time.nhn?code=KOSPI&thistime=20210113163501&page=1


Unnamed: 0_level_0,체결가,전일비,변동량(천주),거래량(천주),거래대금(백만)
체결시각,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15:30,3148.29,22.34,20149.0,1555037.0,23964683.0
15:29,3144.17,18.22,0.0,1534888.0,23262656.0
15:28,3144.17,18.22,0.0,1534888.0,23262656.0
15:27,3144.17,18.22,0.0,1534888.0,23262656.0
15:26,3144.17,18.22,0.0,1534888.0,23262656.0
15:25,3144.17,18.22,0.0,1534888.0,23262656.0


In [13]:
# Demo: first page of the table for item code 035420.
parse_page(code='035420', page=1)

https://finance.naver.com/item/sise_time.nhn?code=035420&thistime=20210113161104&page=1


Unnamed: 0_level_0,체결가,전일비,매도,매수,거래량,변동량
체결시각,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15:58,314000.0,10000.0,314500.0,314000.0,1836028.0,31.0
15:57,314000.0,10000.0,314500.0,314000.0,1835997.0,2.0
15:56,314000.0,10000.0,314500.0,314000.0,1835995.0,23.0
15:55,314000.0,10000.0,314500.0,314000.0,1835972.0,5.0
15:54,314000.0,10000.0,314500.0,314000.0,1835967.0,8.0
15:51,314000.0,10000.0,314500.0,314000.0,1835959.0,11.0
15:50,314000.0,10000.0,314500.0,314000.0,1835948.0,34.0
15:49,314000.0,10000.0,314500.0,314000.0,1835914.0,16.0
15:48,314000.0,10000.0,314500.0,314000.0,1835898.0,6.0
15:47,314000.0,10000.0,314500.0,314000.0,1835892.0,22.0


# 2. Main

- `state = 'time'`: intraday quotes by time for the current day (당일의 시간별 시세)
- `state = 'day'`: daily quotes (일별 시세)

In [14]:
### Hyper-parameters for the main crawl.

# Codes to download; each one produces its own CSV file.
codes = ['035420']
# codes = ['KOSPI', '207940', '035420', '006400'] # KOSPI, Samsung Biologics, NAVER, Samsung SDI


# Daily mode: bounds are 'YYYY.MM.DD' strings.
state = 'day' # 'day' or 'time'
_date_format = '%Y.%m.%d'
str_datefrom = datetime.datetime(2020, 6, 10).strftime(_date_format)
str_dateto = datetime.datetime.today().strftime(_date_format)
print(state, str_datefrom, '-', str_dateto)

# Alternative, time mode: bounds are 'HH:MM' strings.
# state = 'time' # 'day' or 'time'
# str_datefrom = '12:00'
# str_dateto = '14:00'
# print(state, str_datefrom,'-', str_dateto, )

day 2020.06.10 - 2021.01.13


In [15]:
# Timestamp of this crawl run; it becomes part of every output file name.
crawling_time = datetime.datetime.today().strftime('%Y%m%d-%H%M%S')
for code in codes:

    pg_last = find_last_pg(code)

    # Reset per code so a stale frame from a previous code is never reused.
    df = None
    # Per-page slices are collected here and concatenated once after the loop.
    frames = []
    for page in range(1, pg_last+1):
        _df = parse_page(code, page)

        # A page whose rows were all dropped has nothing to compare or keep.
        if _df.empty:
            continue

        # Index labels are compared as strings against the bounds.
        # NOTE(review): the skip/stop logic assumes pages and rows run from
        # newest to oldest, as in the sample outputs above -- confirm.
        if _df.index[-1] > str_dateto:
            continue  # whole page is newer than the window
        if _df.index[0] < str_datefrom:
            break     # whole page is older than the window

        frames.append(_df[(_df.index >= str_datefrom) & (_df.index <= str_dateto)])

    if not frames:
        # Nothing fell inside the window: report it instead of failing on None.
        print(code, 'no data between', str_datefrom, 'and', str_dateto)
        continue

    df = pd.concat(frames).sort_index()
    path = os.path.join(path_dir, '{code}_{date_from}_{date_to}_{crawl}.csv'.format(
                            code=code, date_from=str_datefrom, date_to=str_dateto, crawl=crawling_time))
    df.to_csv(path,encoding='utf-8')
    print(code, 'done')

https://finance.naver.com/item/sise_day.nhn?code=035420
https://finance.naver.com/item/sise_day.nhn?code=035420&page=1
https://finance.naver.com/item/sise_day.nhn?code=035420&page=2
https://finance.naver.com/item/sise_day.nhn?code=035420&page=3
https://finance.naver.com/item/sise_day.nhn?code=035420&page=4
https://finance.naver.com/item/sise_day.nhn?code=035420&page=5
https://finance.naver.com/item/sise_day.nhn?code=035420&page=6
https://finance.naver.com/item/sise_day.nhn?code=035420&page=7
https://finance.naver.com/item/sise_day.nhn?code=035420&page=8
https://finance.naver.com/item/sise_day.nhn?code=035420&page=9
https://finance.naver.com/item/sise_day.nhn?code=035420&page=10
https://finance.naver.com/item/sise_day.nhn?code=035420&page=11
https://finance.naver.com/item/sise_day.nhn?code=035420&page=12
https://finance.naver.com/item/sise_day.nhn?code=035420&page=13
https://finance.naver.com/item/sise_day.nhn?code=035420&page=14
https://finance.naver.com/item/sise_day.nhn?code=035420&p

In [16]:
# Display the frame left in `df` by the last code processed in the crawl loop.
df

Unnamed: 0_level_0,종가,전일비,시가,고가,저가,거래량
날짜,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020.06.10,244000.0,6500.0,241000.0,247500.0,240000.0,1431071.0
2020.06.11,240500.0,3500.0,244000.0,249000.0,239000.0,1423978.0
2020.06.12,242000.0,1500.0,229000.0,242500.0,228500.0,1318334.0
2020.06.15,231500.0,10500.0,240000.0,244500.0,230500.0,1311773.0
2020.06.16,243500.0,12000.0,239000.0,243500.0,236000.0,1155474.0
...,...,...,...,...,...,...
2021.01.07,289500.0,500.0,288500.0,292000.0,286500.0,1155734.0
2021.01.08,312000.0,22500.0,293500.0,312000.0,290000.0,3175396.0
2021.01.11,309000.0,3000.0,320000.0,323500.0,301000.0,2765169.0
2021.01.12,304000.0,5000.0,305000.0,308500.0,292500.0,1573299.0
