In [None]:
import numpy as np
import pandas as pd 
import time
import requests
import lxml
from lxml.html import fromstring
from fake_useragent import UserAgent

# User-Agent provider from the fake_useragent package.
ua = UserAgent()
# Placeholder only ("주소" means "address"); the crawling loops below rebind
# `url` before every request they make.
url = "주소"
# Request headers holding a single User-Agent value taken from `ua.random`
# (presumably a randomly chosen browser UA string -- see fake_useragent docs).
# NOTE(review): the requests.get(...) calls visible in this notebook do not
# pass `headers`, so it currently has no effect -- confirm whether intended.
headers = {'User-Agent' : ua.random}

# 1. URL 수집

In [None]:
# Collect the detail-page URL of every entry on list pages 1..131.
#
# Links are accumulated in a plain list and converted to a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and, where it still exists,
# copies the whole frame on every call.
collected_urls = []

try:
    for PAGE in range(1, 132):

        # Fetch one list page. The timeout keeps a stalled connection from
        # hanging the crawl forever.
        url = f'https://terms.naver.com/list.naver?cid=40942&categoryId=33147&so=st3.asc&viewType=&categoryType=&page={PAGE}'
        req = requests.get(url, timeout=10)
        doc = fromstring(req.text)
        elements = doc.xpath('/html/body/div[1]/div[3]/div[1]/div[3]/ul/li/div/div[1]/strong/a[1]')

        # Prefix every entry's href with the site origin and keep it.
        collected_urls.extend(
            ['https://terms.naver.com' + element.get('href') for element in elements]
        )

        if PAGE % 10 == 0:
            print(f'{PAGE}페이지 완료!!')
finally:
    # Built in `finally` so that links gathered before an error or a manual
    # interrupt still end up in `df`, as they did with incremental appends.
    df = pd.DataFrame({'url': collected_urls})

# 2. 세부정보 수집
#### 산 이름, 한자 이름, 요약, 위치, 높이, 봉우리, 문화재, 본문(이미지 제외), 참조항목, 역참조항목, 카테고리

In [None]:
# Scrape the detail page of every URL in `df` and assemble `df_final`.
#
# Rows are collected as dicts and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and, where it still exists,
# copies the whole frame on every call.

# Output columns, in order. Every field defaults to '' when the page lacks it.
COLUMNS = ['산이름', '산이름_한자', '요약', '위치', '높이', '봉우리', '문화재',
           '본문', '참조항목', '역참조항목', '카테고리']

# Info-table headers that are collected; the header text is also the column name.
TABLE_FIELDS = ('위치', '높이', '봉우리', '문화재')

# The info table appears under one of two layouts depending on the page.
# Both are tried in this order, so a match in the second overrides the first.
TABLE_ROW_XPATHS = (
    '/html/body/div[1]/div[3]/div[1]/div[2]/div[2]/div/div/div/div/table/tbody/tr',
    '//*[@id="size_ct"]/div/div/div[2]/div/table/tbody/tr',
)

REFERENCE_LABEL = '참조항목'
REVERSE_REFERENCE_LABEL = '역참조항목'
CATEGORY_LABEL = '카테고리'


def _numbered(parts):
    """Return a dict mapping 1, 2, 3, ... to the items of *parts*."""
    return dict(enumerate(parts, start=1))


def _drop_breaks(text):
    """Return *text* with newline and tab characters removed."""
    return text.replace('\n', '').replace('\t', '')


def _split_numbered(text, separator):
    """Split *text* on *separator*, strip each piece and number the pieces from 1."""
    return _numbered(piece.strip() for piece in text.split(separator))


def _parse_mountain_page(doc):
    """Extract one result row from a parsed detail page.

    Args:
        doc: root element returned by ``lxml.html.fromstring``.

    Returns:
        dict keyed by ``COLUMNS``. Fields missing from the page stay ``''``;
        the 문화재 / 참조항목 / 역참조항목 / 카테고리 fields, when present, are
        dicts numbering their items from 1.
    """
    record = dict.fromkeys(COLUMNS, '')

    # Mountain name.
    title_nodes = doc.xpath('//*[@id="content"]/div[2]/div[1]/h2')
    if title_nodes:
        record['산이름'] = title_nodes[0].text

    # Mountain name in Chinese characters.
    hanja_nodes = doc.xpath('//*[@id="content"]/div[2]/div[1]/p[3]/span')
    if hanja_nodes:
        record['산이름_한자'] = hanja_nodes[0].text

    # Summary: the second text node directly under the <dl>.
    summary_texts = doc.xpath('/html/body/div[1]/div[3]/div[1]/div[2]/div[2]/dl/text()')
    if len(summary_texts) > 1:
        record['요약'] = _drop_breaks(summary_texts[1])

    # Location / height / peaks / cultural assets from the info table.
    for row_xpath in TABLE_ROW_XPATHS:
        header_cells = doc.xpath(row_xpath + '/th')
        value_cells = doc.xpath(row_xpath + '/td')
        for header_cell, value_cell in zip(header_cells, value_cells):
            label = ''.join(header_cell.itertext())
            if label not in TABLE_FIELDS:
                continue
            value = _drop_breaks(''.join(value_cell.itertext())).strip()
            if label == '문화재':
                # Cultural assets are a comma-separated list.
                value = _split_numbered(value, ',')
            record[label] = value

    # Body text: first <p> of the content container (images contribute no text).
    body_nodes = doc.xpath('//*[@id="size_ct"]/p')
    if body_nodes:
        record['본문'] = _drop_breaks(''.join(body_nodes[0].itertext()))

    # References, reverse references and categories.
    for section in doc.xpath('/html/body/div[1]/div[3]/div[1]/div[2]/div[3]/div'):
        text = ''.join(section.itertext())
        flattened = text.replace('\n', ' ').replace('\t', '')

        if text.startswith(REFERENCE_LABEL):
            record['참조항목'] = _split_numbered(text[len(REFERENCE_LABEL):], ',')

        elif text.startswith(REVERSE_REFERENCE_LABEL):
            record['역참조항목'] = _split_numbered(text[len(REVERSE_REFERENCE_LABEL):], ',')

        elif flattened[1:5] == CATEGORY_LABEL:
            # The label is matched one character into the flattened text;
            # after stripping, the 4-character label is dropped and the
            # remaining entries are split on runs of three spaces.
            entries = flattened.strip()[len(CATEGORY_LABEL):].strip()
            record['카테고리'] = _split_numbered(entries, '   ')

    return record


records = []
t1 = time.time()

try:
    for count, url in enumerate(df['url'], start=1):

        # Fetch and parse the page. The timeout keeps a stalled connection
        # from hanging the crawl forever.
        req = requests.get(url, timeout=10)
        doc = fromstring(req.text)

        records.append(_parse_mountain_page(doc))

        # Progress log every 100 pages.
        if count % 100 == 0:
            print(f'=== {count}개 수집 완료!! == 경과시간: {round(time.time()-t1)}초 ===')
finally:
    # Built in `finally` so that rows scraped before an error or a manual
    # interrupt still end up in `df_final`, as they did with incremental appends.
    df_final = pd.DataFrame(records, columns=COLUMNS)


print('크롤링 완료!')
df_final