# In [0]:
import bs4
from urllib.request import urlopen
import pandas as pd
import re
import json


class Naver:
    """Scrapers for Naver Finance daily-price pages and company snapshot pages.

    The price collectors fill a caller-supplied dict keyed by
    ``datetime.date`` with closing values and also return that dict.
    Paging is done with a loop (not recursion), so the number of pages
    visited is not bounded by the interpreter's recursion limit.
    """

    # index_global keeps paging only while a page holds exactly this many rows.
    _GLOBAL_PAGE_SIZE = 10

    def date_format(self, d=''):
        """Return *d* as a ``datetime.date``; an empty string means today."""
        if d != '':
            return pd.to_datetime(d).date()
        return pd.Timestamp.today().date()

    def _read_soup(self, url):
        """Download *url* and parse it with the lxml parser, closing the response."""
        with urlopen(url) as response:
            markup = response.read()
        return bs4.BeautifulSoup(markup, 'lxml')

    @staticmethod
    def _to_number(text):
        """Convert a number rendered with thousands separators to ``float``."""
        return float(text.replace(',', ''))

    @staticmethod
    def _page_from_href(href):
        """Return the integer ``page`` query parameter of a pager link."""
        from urllib.parse import parse_qs, urlparse

        # Look the parameter up by name instead of by position in the query.
        return int(parse_qs(urlparse(href).query)['page'][0])

    def _last_page(self, nav, current_page):
        """Return the page number the 'pgRR' pager link inside *nav* points to.

        When there is no such link (presumably a listing with no further
        pages -- TODO confirm), *current_page* is treated as the last page
        instead of failing on ``None``.
        """
        cell = nav.find('td', class_='pgRR')
        link = cell.find('a') if cell is not None else None
        if link is None:
            return current_page
        return self._page_from_href(link['href'])

    @staticmethod
    def _parse_share_cell(text):
        """Split an '<shares>주 / <pct>%' cell into ``(int shares, float pct)``."""
        for ch in '\r\n\t':
            text = text.replace(ch, '')
        fields = text.split('/')
        outstanding = int(fields[0].replace(',', '').replace('주', '').replace(' ', ''))
        floating = float(fields[1].replace(' ', '').replace('%', ''))
        return outstanding, floating

    def stock_price(self, historical_prices, stock_cd, start_date='', end_date='', page_n=1, last_page=0):
        """Collect daily closing prices of one stock into *historical_prices*.

        Args:
            historical_prices: dict updated in place, ``date -> close``.
            stock_cd: stock code appended to the page URL.
            start_date, end_date: inclusive range; '' means today.
            page_n: first page to read.
            last_page: last page to read; 0 means read it from the pager.

        Returns:
            The same *historical_prices* dict.
        """
        start_date = self.date_format(start_date)
        end_date = self.date_format(end_date)

        while True:
            url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stock_cd + '&page=' + str(page_n)
            source = self._read_soup(url)

            dates = source.find_all('span', class_='tah p10 gray03')   # date cells
            prices = source.find_all('td', class_='num')   # numeric cells

            for n, cell in enumerate(dates):
                this_date = self.date_format(cell.text)
                if this_date < start_date:
                    # Reached a row older than the requested range: done.
                    return historical_prices
                if this_date <= end_date:
                    # The code takes every 6th 'num' cell as the close.
                    historical_prices[this_date] = self._to_number(prices[n * 6].text)

            if last_page == 0:
                tables = source.find_all('table')
                # The pager is looked up in the second table when there is one.
                nav = tables[1] if len(tables) > 1 else source
                last_page = self._last_page(nav, page_n)

            if page_n >= last_page:
                return historical_prices
            page_n += 1

    def index_korea(self, historical_prices, index_cd, start_date='', end_date='', page_n=1, last_page=0):
        """Collect daily closing values of a domestic index into *historical_prices*.

        Args:
            historical_prices: dict updated in place, ``date -> close``.
            index_cd: index code appended to the page URL.
            start_date, end_date: inclusive range; '' means today.
            page_n: first page to read.
            last_page: last page to read; 0 means read it from the pager.

        Returns:
            The same *historical_prices* dict.
        """
        start_date = self.date_format(start_date)
        end_date = self.date_format(end_date)

        while True:
            url = 'http://finance.naver.com/sise/sise_index_day.nhn?code=' + index_cd + '&page=' + str(page_n)
            source = self._read_soup(url)

            dates = source.find_all('td', class_='date')   # <td class="date"> cells
            prices = source.find_all('td', class_='number_1')   # <td class="number_1"> cells

            for n, cell in enumerate(dates):
                # Skip cells whose text does not start with a numeric field.
                if not cell.text.split('.')[0].isdigit():
                    continue
                this_date = self.date_format(cell.text)
                if this_date < start_date:
                    # Reached a row older than the requested range: done.
                    return historical_prices
                if this_date <= end_date:
                    # Cells 0, 4, 8, ... of 'number_1' are taken as the close.
                    historical_prices[this_date] = self._to_number(prices[n * 4].text)

            if last_page == 0:
                last_page = self._last_page(source, page_n)

            if page_n >= last_page:
                return historical_prices
            page_n += 1

    def stock_info(self, stock_cd):
        """Return ``(name, outstanding, floating)`` scraped from the company page.

        ``outstanding`` is the integer before the '/' in the share cell and
        ``floating`` is the number before '%' after the '/'.
        """
        url_float = 'http://companyinfo.stock.naver.com/v1/company/c1010001.aspx?cmp_cd=' + stock_cd
        soup = self._read_soup(url_float)

        cell_text = soup.find(id='cTB11').find_all('tr')[6].td.text
        outstanding, floating = self._parse_share_cell(cell_text)

        name = soup.find(id='pArea').find('div').find('div').find('tr').find('td').find('span').text

        return (name, outstanding, floating)

    def index_global(self, d, symbol, start_date='', end_date='', page=1):
        """Collect daily closes of a world index from the JSON endpoint into *d*.

        Args:
            d: dict updated in place, ``date -> close``.
            symbol: symbol appended to the request URL.
            start_date: inclusive start; '' means one year before *end_date*.
            end_date: inclusive end; '' means today.
            page: first page to request.

        Returns:
            The same dict *d*.
        """
        end_date = self.date_format(end_date)
        if start_date == '':
            start_date = end_date - pd.DateOffset(years=1)
        start_date = self.date_format(start_date)

        while True:
            url = 'https://finance.naver.com/world/worldDayListJson.nhn?symbol=' + symbol + '&fdtc=0&page=' + str(page)
            with urlopen(url) as raw:
                data = json.load(raw)

            for row in data:
                date = pd.to_datetime(row['xymd']).date()
                if date < start_date:
                    # Reached a row older than the requested range: done.
                    return d
                if date <= end_date:
                    d[date] = float(row['clos'])

            # Only a page of exactly _GLOBAL_PAGE_SIZE rows triggers another request.
            if len(data) != self._GLOBAL_PAGE_SIZE:
                return d
            page += 1
    
    
class NaverStockInfo:
    """Read company information from the Naver company snapshot page."""

    def read_src(self, stock_cd):
        """Download the company page for *stock_cd* and return it as a parsed soup."""
        url_float = 'http://companyinfo.stock.naver.com/v1/company/c1010001.aspx?cmp_cd=' + stock_cd
        with urlopen(url_float) as response:
            source = response.read()
        return bs4.BeautifulSoup(source, 'lxml')

    @staticmethod
    def _share_fields(soup):
        """Return the '/'-separated parts of the share cell (row 6 of #cTB11)."""
        text = soup.find(id='cTB11').find_all('tr')[6].td.text
        for ch in '\r\n\t':
            text = text.replace(ch, '')
        return text.split('/')

    @staticmethod
    def _parse_outstanding(field):
        """Parse a field such as '1,234주 ' into an ``int``."""
        return int(field.replace(',', '').replace('주', '').replace(' ', ''))

    @staticmethod
    def _parse_floating(field):
        """Parse a field such as ' 74.9%' into a ``float`` (the '%' is just dropped)."""
        return float(field.replace(' ', '').replace('%', ''))

    def stock_info(self, stock_cd):
        """Return ``(name, outstanding, floating)`` for *stock_cd* from one page load."""
        soup = self.read_src(stock_cd)
        fields = self._share_fields(soup)
        outstanding = self._parse_outstanding(fields[0])
        floating = self._parse_floating(fields[1])

        name = soup.find(id='pArea').find('div').find('div').find('tr').find('td').find('span').text

        return (name, outstanding, floating)

    def outstanding(self, stock_cd):
        """Return the integer before the '/' in the share cell."""
        fields = self._share_fields(self.read_src(stock_cd))
        return self._parse_outstanding(fields[0])

    def floating(self, stock_cd):
        """Return the number before '%' after the '/' in the share cell."""
        fields = self._share_fields(self.read_src(stock_cd))
        return self._parse_floating(fields[1])

    def float_convert(self, s):
        """Convert a formatted figure to ``float``, applying its unit multiplier.

        '억' multiplies by 100,000,000, '백만' by 1,000,000 and '%' by 0.01
        ('%' takes precedence when it appears together with a unit); spaces,
        commas and '원' are dropped. A figure without any unit uses a
        multiplier of 1.

        Returns:
            The float value, the cleaned text when it is not numeric, or *s*
            itself when it is not a string.
        """
        try:
            text = s.replace(' ', '').replace(',', '')
        except (AttributeError, TypeError):
            # Not a str (e.g. None): hand the value back untouched.
            return s

        m = 1   # default multiplier so unit-less numbers still convert
        if '억' in text:
            m = 100000000
            text = text.replace('억', '')
        elif '백만' in text:
            m = 1000000
            text = text.replace('백만', '')
        if '%' in text:
            m = 0.01
            text = text.replace('%', '')
        text = text.replace('원', '')

        try:
            return float(text) * m
        except ValueError:
            return text

    def fundamentals(self, stock_cd, f):
        """Return ``[current, estimated]`` for the row titled *f*.

        Reads every row after the first in the first 'fund fl_le' block,
        prints each parsed row, and looks *f* up among the row titles.

        Raises:
            KeyError: if no row is titled *f*.
        """
        soup = self.read_src(stock_cd)
        table_rows = soup.find_all('div', class_='fund fl_le')[0].find_all('tr')

        factors = dict()
        for row in table_rows[1:]:   # row 0 is skipped, as before
            title = row.find_all('th')[0].text
            cells = row.find_all('td')
            value_current = self.float_convert(cells[0].text)
            value_estimated = self.float_convert(cells[1].text)
            factors[title] = [value_current, value_estimated]
            print(title, value_current, value_estimated)
        return (factors[f])

# In [0]:
import pandas as pd
import random
import time
from selenium import webdriver
from datetime import datetime
class NaverCrawler:
    """Collect 'power link' ads and place listings from Naver search results.

    Results accumulate across keywords in ``power_link_df`` and ``place_df``
    (both ``None`` until the first rows are stored). Failures per keyword are
    recorded in ``errIdx`` as ``('power' | 'place', keyword position)``.
    """

    def __init__(self, keywords, page_cnt):
        """Start a headless Chrome driver.

        Args:
            keywords: iterable of search keywords.
            page_cnt: number of place-list pages to read per keyword.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument('window-size=1920x1080')
        options.add_argument("disable-gpu")
        options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
        # NOTE(review): `chrome_options=` and the `find_element(s)_by_*` calls
        # below are the Selenium 3 API; newer Selenium releases dropped them.
        self.driver = webdriver.Chrome(chrome_options=options)
        self.keywords = keywords
        self.hos2url = {}
        self.url2hos = {}
        self.power_link_df = None
        self.place_df = None
        self.errIdx = []
        self.page_cnt = page_cnt

    @staticmethod
    def _merge_frames(existing, new_rows):
        """Return *new_rows* appended to *existing* (which may be ``None``)."""
        if existing is None:
            return new_rows
        # DataFrame.append no longer exists in pandas 2.x; concat is the equivalent.
        return pd.concat([existing, new_rows], ignore_index=True)

    @staticmethod
    def _file_prefix(now=None):
        """Return the 'Y-M-D-' file-name prefix (no zero padding) for *now*."""
        now = datetime.now() if now is None else now   # read the clock once
        return '{}-{}-{}-'.format(now.year, now.month, now.day)

    def crawl(self):
        """Search every keyword and collect its power links and places.

        Exceptions from either collector are printed and recorded in
        ``errIdx``; the crawl then moves on.
        """
        from urllib.parse import quote_plus

        url = 'https://search.naver.com/search.naver?sm=top_hty&fbm=1&ie=utf8&query={}'
        # Iterate values directly so a Series with a non-default index works,
        # and percent-encode the keyword so '&', '#' or spaces cannot break the query.
        for idx, keyword in enumerate(self.keywords):
            self.driver.get(url.format(quote_plus(str(keyword))))
            self.driver.implicitly_wait(random.randrange(2, 4))
            self.scroll()
            try:
                self.power_link_crawl()
            except Exception as e:
                print(idx, 'no power link : ', e)
                self.errIdx.append(('power', idx))

            try:
                self.place_crawl()
            except Exception as e:
                print(idx, 'no place: ', e)
                self.errIdx.append(('place', idx))

    def power_link_crawl(self):
        """Append the current page's power-link ads to ``power_link_df``.

        Each ad gets a score starting at 1 and decreasing by 1/15 per position.
        """
        body = self.driver.find_element_by_id('power_link_body')
        link_titles = body.find_elements_by_class_name('lnk_tit')
        link_urls = body.find_elements_by_class_name('lnk_url')
        link_texts = body.find_elements_by_class_name('ad_dsc_inner')

        title_list = []
        url_list = []
        text_list = []
        score_list = []
        score = 1
        for title, url, text in zip(link_titles, link_urls, link_texts):
            title_list.append(title.text)
            url_list.append(url.text)
            text_list.append(text.text)
            score_list.append(score)
            score = score - 1 / 15

        tmp_df = pd.DataFrame({'power_title': title_list, 'power_url_list': url_list, 'power_text': text_list, 'power_score': score_list})
        self.power_link_df = self._merge_frames(self.power_link_df, tmp_df)

    def place_crawl(self):
        """Append place listings from up to ``page_cnt`` pages to ``place_df``.

        Rows on page ``idx`` get the score ``1 - idx * 0.1``. The title is the
        second space-separated token of the link text. Rows gathered before an
        exception (e.g. no next button) are still stored, and the exception
        then propagates to the caller.
        """
        rows = []
        try:
            for idx in range(self.page_cnt):
                score = 1 - idx * 0.1
                anchors = self.driver.find_elements_by_xpath("*//div[@class='list_area']//a[@class='name']")
                for element in anchors:
                    rows.append((element.text.split(' ')[1], element.get_attribute('href'), score))
                # No need to page forward once the last requested page is read.
                if idx < self.page_cnt - 1:
                    self.click_next_btn()
                    time.sleep(random.random() / 2)
        finally:
            if rows:
                tmp_df = pd.DataFrame(rows, columns=['place_title', 'place_url', 'place_score'])
                self.place_df = self._merge_frames(self.place_df, tmp_df)

    def scroll(self):
        """Issue ten stepped ``window.scrollTo`` calls, 0.05 s apart."""
        # NOTE(review): scrollTo takes (x, y); the first expression ends up as
        # the horizontal offset -- confirm that is intended.
        for i in range(1, 11):
            self.driver.execute_script("window.scrollTo(document.body.scrollHeight/10*" + str(i - 1) + ", document.body.scrollHeight/11*" + str(i) + ");")
            time.sleep(0.05)

    def click_next_btn(self):
        """Click the 'next' button of the place list."""
        self.driver.find_element_by_xpath("*//a[@class='btn_direction btn_next ']").click()

    def get_power_link_df(self):
        """Return the accumulated power-link frame, or ``None`` if empty."""
        return self.power_link_df

    def get_place_df(self):
        """Return the accumulated place frame, or ``None`` if empty."""
        return self.place_df

    def save_power_link_as_csv(self):
        """Write ``power_link_df`` to '<Y-M-D->power.csv' in euc-kr."""
        self.power_link_df.to_csv(self._file_prefix() + 'power.csv', encoding='euc-kr')

    def save_place_as_csv(self):
        """Write ``place_df`` to '<Y-M-D->place.csv' in euc-kr."""
        self.place_df.to_csv(self._file_prefix() + 'place.csv', encoding='euc-kr')

    def close(self):
        """Quit the browser started in ``__init__``."""
        self.driver.quit()
if __name__ == "__main__":
    # Read today's related-keyword CSV, crawl every keyword, then write the
    # place / power-link results and the error log next to it.
    now = datetime.now()   # read the clock once so all file names agree
    pre_file_name = '{}-{}-{}-'.format(now.year, now.month, now.day)
    df = pd.read_csv(pre_file_name+'연관검색어.csv', encoding='euc-kr', index_col=0)
    nc = NaverCrawler(df['키워드'], 5)
    try:
        nc.crawl()
    finally:
        # Always shut the headless browser down, even if the crawl aborts.
        nc.driver.quit()

    # Either frame is still None when nothing was collected; skip it
    # instead of failing on None.to_csv.
    place_df = nc.get_place_df()
    if place_df is not None:
        place_df.to_csv(pre_file_name+'place.csv', encoding='euc-kr')
    power_link_df = nc.get_power_link_df()
    if power_link_df is not None:
        power_link_df.to_csv(pre_file_name+'power.csv', encoding='euc-kr')
    pd.Series(nc.errIdx).to_csv('err_log.csv')