In [1]:
import sys
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pandas as pd

In [2]:
URL = 'https://finance.naver.com/item/sise_day.nhn'
HEADERS = {
    'User-Agent':
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4501.0 Safari/537.36 Edg/92.0.891.1'
}


In [24]:
def parse(bs: BeautifulSoup) -> dict:
    # ['date', 'close', 'delta', 'open', 'high', 'low', 'volume']
    clean = lambda s: s.strip().replace(',', '').replace('.', '-')
    values = [span.text for span in bs.findAll('span', class_='tah')]
    values = list(map(clean, values))

    def partition(line, n):
        for i in range(0, len(line), n):
            yield line[i:i + n]

    for row in partition(values, 7):
        yield {
            'Date': row[0],
            'Open': row[3],
            'High': row[4],
            'Low': row[5],
            'Close': row[1],
            'Adj Close': row[1],
            'Volume': row[6],
        }

In [4]:
symbol = '372330'
date_from = '2020-12-17'

class OutOfPeriod(Exception):
    pass

In [5]:
def session():
    s = requests.session()
    r = Retry(total=5,
              backoff_factor=0.2,
              status_forcelist=[413, 429, 500, 502, 503, 504])
    a = HTTPAdapter(max_retries=r)
    s.mount('http://', a)
    s.mount('https://', a)
    return s
    
s = session()
r = s.get(URL, 
          params={'code': symbol, 'page': 1}, 
          headers=HEADERS)


In [None]:
r.text

In [7]:
bs = BeautifulSoup(r.text, 'html.parser')

if bs.find('td', class_='pgRR') is None:
    print('[ERROR] Invalid symbol: {}'.format(symbol), file=sys.stderr)
    raise RuntimeError('No data found with symbol')

In [8]:
bs = BeautifulSoup(r.text, 'html.parser')
if bs.find('span', class_='tah').text == '':
    raise FileNotFoundError()

In [9]:
bs.findAll('span', class_='tah')

[<span class="tah p10 gray03">2022.02.11</span>,
 <span class="tah p11">7,415</span>,
 <span class="tah p11 nv01">
 				65
 				</span>,
 <span class="tah p11">7,455</span>,
 <span class="tah p11">7,550</span>,
 <span class="tah p11">7,385</span>,
 <span class="tah p11">174,788</span>,
 <span class="tah p10 gray03">2022.02.10</span>,
 <span class="tah p11">7,480</span>,
 <span class="tah p11 nv01">
 				15
 				</span>,
 <span class="tah p11">7,615</span>,
 <span class="tah p11">7,630</span>,
 <span class="tah p11">7,430</span>,
 <span class="tah p11">136,811</span>,
 <span class="tah p10 gray03">2022.02.09</span>,
 <span class="tah p11">7,495</span>,
 <span class="tah p11 red02">
 				245
 				</span>,
 <span class="tah p11">7,330</span>,
 <span class="tah p11">7,515</span>,
 <span class="tah p11">7,330</span>,
 <span class="tah p11">151,537</span>,
 <span class="tah p10 gray03">2022.02.08</span>,
 <span class="tah p11">7,250</span>,
 <span class="tah p11 nv01">
 				110
 				</span>,

In [21]:
l = []
for r in parse(bs):
    if date_from > r['Date']:
        raise OutOfPeriod()

    l.append(r)

df = pd.DataFrame(l)
df.set_index('Date')


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-02-11,7455,7550,7385,7415,7415,174788
2022-02-10,7615,7630,7430,7480,7480,136811
2022-02-09,7330,7515,7330,7495,7495,151537
2022-02-08,7420,7420,7180,7250,7250,150569
2022-02-07,7440,7490,7280,7360,7360,98933
2022-02-04,7500,7500,7310,7440,7440,156798
2022-02-03,7385,7470,7255,7455,7455,81682
2022-01-28,7265,7265,7030,7170,7170,193977
2022-01-27,7380,7570,7155,7160,7160,280062
2022-01-26,7550,7555,7345,7460,7460,263660
