# Crawling Naver Stock Reports

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# 1. URL
# The page number is a query parameter; change `page` to crawl another page.
page = 1
url = f'https://finance.naver.com/research/company_list.naver?&page={page}'

In [3]:
# 2. Request(URL) > Response(HTML)
# Without a timeout requests.get() can block forever if the server never
# answers; 10 seconds is generous for a single HTML page.
response = requests.get(url, timeout=10)
response

<Response [200]>

In [6]:
# 3. HTML > BeautifulSoup > css-selector > DataFrame
# Hand the raw response bytes to BeautifulSoup, using the 'html.parser' backend.
html_bytes = response.content
dom = BeautifulSoup(html_bytes, features='html.parser')
type(dom)  # the parsed document is queried with select() / select_one()

bs4.BeautifulSoup

In [25]:
# An earlier selector that went through <tbody>,
#   '#contentarea_left > div.box_type_m > table > tbody > tr:nth-child(3)',
# gave len(elements) == 0: viewing the page source (right-click > view page
# source) suggests the HTML has no <tbody>, even though one would normally be
# expected there. The rows are therefore selected directly under the table.
selector = 'table.type_1 > tr'
elements = dom.select(selector)
len(elements)

49

In [30]:
# Take the third matched <tr> as a sample row and collect its <td> cells.
element = elements[2]
tag = element.select('td')
# Show the number of cells followed by the cells themselves.
len(tag), tag

(6,
 [<td style="padding-left:10">
  <a class="stock_item" href="/item/main.naver?code=066570" title="LG전자">LG전자</a>
  </td>,
  <td><a href="company_read.naver?nid=77126&amp;page=1">AI데이터센터 냉각 시장 공략</a><img alt="NEW" class="ico_new" height="8" src="https://ssl.pstatic.net/imgstock/images5/ico_research_new.gif" width="8"/></td>,
  <td>교보증권</td>,
  <td class="file"><a href="https://stock.pstatic.net/stock-research/company/34/20240923_company_481199000.pdf" target="_blank"><img align="absmiddle" alt="pdf" src="https://ssl.pstatic.net/imgstock/images5/down.gif"/></a></td>,
  <td class="date" style="padding-left:5px">24.09.23</td>,
  <td class="date">218</td>])

In [40]:
# Look up each link once; cells 0, 1 and 3 carry an <a>, the rest are plain text.
stock_anchor = tag[0].select_one('a')
report_anchor = tag[1].select_one('a')
pdf_anchor = tag[3].select_one('a')

# One report as a flat record, keyed in the same order as the table columns.
data = {
    'stock_name': stock_anchor.text,
    'stock_link': stock_anchor.get('href'),
    'title': report_anchor.text,
    'title_link': report_anchor.get('href'),
    'writer': tag[2].text,
    'pdf_link': pdf_anchor.get('href'),
    'date': tag[4].text,
    'pv': tag[5].text,
}

data

{'stock_name': 'LG전자',
 'stock_link': '/item/main.naver?code=066570',
 'title': 'AI데이터센터 냉각 시장 공략',
 'title_link': 'company_read.naver?nid=77126&page=1',
 'writer': '교보증권',
 'pdf_link': 'https://stock.pstatic.net/stock-research/company/34/20240923_company_481199000.pdf',
 'date': '24.09.23',
 'pv': '218'}

In [49]:
def _link_text_and_href(cell):
    """Return ``(text, href)`` of the first <a> inside *cell*.

    Returns ``(None, None)`` when the cell contains no link, so that a single
    row with a missing link (for example an empty file cell) does not abort
    the whole crawl with an AttributeError on ``None``.
    """
    anchor = cell.select_one('a')
    if anchor is None:
        return None, None
    return anchor.text, anchor.get('href')


rows = []
for element in elements:
    tag = element.select('td')

    # Only rows with exactly six <td> cells are parsed as reports; every other
    # <tr> matched by the selector is skipped.
    if len(tag) != 6:
        continue

    stock_text, stock_href = _link_text_and_href(tag[0])
    report_text, report_href = _link_text_and_href(tag[1])
    _, pdf_href = _link_text_and_href(tag[3])

    rows.append({
        'stock_name': stock_text,
        'stock_link': stock_href,
        'title': report_text,
        'title_link': report_href,
        'writer': tag[2].text,
        'pdf_link': pdf_href,
        'date': tag[4].text,
        'pv': tag[5].text,
    })


In [52]:
# Turn the list of row dicts into a table and preview its last two rows.
df = pd.DataFrame(data=rows)
df.tail(n=2)

Unnamed: 0,stock_name,stock_link,title,title_link,writer,pdf_link,date,pv
28,동국S&C,/item/main.naver?code=100130,금리 하락으로 미국에서 훈풍이 불어온다,company_read.naver?nid=77098&page=1,iM증권,https://stock.pstatic.net/stock-research/compa...,24.09.19,1501
29,SOOP,/item/main.naver?code=067160,"경쟁사 스트리머 이적, 트래픽 유입 기대",company_read.naver?nid=77097&page=1,신한투자증권,https://stock.pstatic.net/stock-research/compa...,24.09.19,1455


In [53]:
# file download

In [56]:
# os package: file-system management
import os
# List the entries of the current working directory
os.listdir('.')

['08_gmarket.ipynb',
 '08_naver_stock_report.ipynb',
 '09_selenium.ipynb',
 '10_xpath.ipynb',
 '11_iterator_generator.ipynb',
 '12_scrapy.ipynb']

In [57]:
path = 'reports'
# Check whether the directory (or file) already exists
os.path.exists(path)

False

In [58]:
# Create the directory when it is missing. exist_ok=True replaces the separate
# os.path.exists() check, which left a gap between checking and creating.
os.makedirs(path, exist_ok=True)

os.listdir()

['08_gmarket.ipynb',
 '08_naver_stock_report.ipynb',
 '09_selenium.ipynb',
 '10_xpath.ipynb',
 '11_iterator_generator.ipynb',
 '12_scrapy.ipynb',
 'reports']

In [63]:
# 1. url
# Read the title and the PDF link from the row labelled 0.
first_report = df.loc[0]
title = first_report['title']
pdf_link = first_report['pdf_link']
(title, pdf_link)

('AI데이터센터 냉각 시장 공략',
 'https://stock.pstatic.net/stock-research/company/34/20240923_company_481199000.pdf')

In [65]:
# 2. request(url) > response(pdf)
# Without a timeout requests.get() can block forever if the server never
# answers.
response = requests.get(pdf_link, timeout=10)
response

<Response [200]>

In [67]:
# 3. pdf > save(reports)
import re

# Titles are free text. Replace characters that are not valid in file names
# (path separators, ':' and similar) so open() neither fails nor writes into
# a different directory.
safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
filename = f'{path}/{safe_title}.pdf'
print(filename)

# Stop here on an HTTP error instead of storing an error page as a .pdf file.
response.raise_for_status()

# wb : write binary, wt : write text
with open(filename, 'wb') as file:
    file.write(response.content)

reports/AI데이터센터 냉각 시장 공략.pdf


In [None]:
# pickle: serialize objects from memory (RAM) to disk (SSD), and load them back from disk into memory

In [68]:
# List the files currently stored in the reports directory
os.listdir('reports')

['AI데이터센터 냉각 시장 공략.pdf']

In [72]:
import shutil
path = 'reports'
# Optional reset: uncomment to delete the folder and everything in it first.
#shutil.rmtree(path)
# exist_ok=True lets this cell be re-run while the folder already exists;
# a plain os.makedirs(path) raises FileExistsError in that case.
os.makedirs(path, exist_ok=True)

In [74]:
import re

# Saving is switched off on purpose so that running this cell does not write
# any files; set to True to store every PDF under `path`.
SAVE_REPORTS = False

for idx, row in df.iterrows():
    print(idx, end=' ')
    title, pdf_link = row['title'], row['pdf_link']

    # Nothing to download when the row carries no usable link.
    if not isinstance(pdf_link, str):
        continue

    # A timeout keeps one unresponsive download from stalling the whole loop.
    response = requests.get(pdf_link, timeout=10)

    # Titles are free text; replace characters that are not valid in file
    # names (path separators, ':' and similar).
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    filename = f'{path}/{safe_title}.pdf'

    if SAVE_REPORTS:
        with open(filename, 'wb') as file:
            file.write(response.content)

0 AI데이터센터 냉각 시장 공략 https://stock.pstatic.net/stock-research/company/34/20240923_company_481199000.pdf
1 설계와 시공능력으로 액침냉각 사업 확대 https://stock.pstatic.net/stock-research/company/34/20240923_company_279682000.pdf
2 액침냉각, 기술적 강점을 확보해 나가는 중 https://stock.pstatic.net/stock-research/company/62/20240923_company_647832000.pdf
3 3Q24 Preview: 국내 수주 강세 VS 중국 부진 .. https://stock.pstatic.net/stock-research/company/57/20240923_company_621805000.pdf
4 Re-rating 구간 돌입 https://stock.pstatic.net/stock-research/company/63/20240923_company_747255000.pdf
5 높아지는 Peak sales https://stock.pstatic.net/stock-research/company/29/20240923_company_582032000.pdf
6 Metsera, 너는 계획이 다 있구나 https://stock.pstatic.net/stock-research/company/39/20240923_company_387902000.pdf
7 의대 열풍, 나만 믿어 https://stock.pstatic.net/stock-research/company/21/20240923_company_352867000.pdf
8 속도가 느려도, 방향성은 맞다 https://stock.pstatic.net/stock-research/company/39/20240923_company_584932000.pdf
9 빅파마들의 RPT 방향, 우리도 간다 https://stock.pstatic.net/stoc

In [None]:
# Apache Tika (Java-based): extract text from PDF files (pdf > text)