In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.action_chains import ActionChains

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import requests

import time
import re

from blueribbon_crawling import BlueRibbonCrawler
from instagram_crawling import InstagramCrawler

In [25]:
class DatePopCrawler:
    def __init__(self, location, keyword):
        self.location = location
        self.keyword = keyword
        self.search_word = location + " " + keyword

        self.data = pd.DataFrame(columns=['name', 'category', 'is_food', 'instagram_link', 'instagram_post', 'instagram_follower', 'visitor_review_count', 
                           'blog_review_count', 'distance_from_subway', 'on_tv','parking_available' , 'no_kids', 'pet_available', 'seoul_michelin',
                           'age-2030', 'gender-balance', 'on_blue_ribbon', 'image_urls'])
        self.empty_searchIframe = """//*[@id="_pcmap_list_scroll_container"]"""
        self.empty_entryIframe = """//*[@id="app-root"]"""
        self.empty_root = """//*[@id="root"]"""

        self.search_iframe = """//*[@id="searchIframe"]"""
        self.entry_iframe = """//*[@id="searchIframe"]"""

        self.store_dict = {
            'name': "",
            'category': "",
            'is_food': False,
            "instagram_link": None,
            "instagram_post": None,
            "instagram_follower": None,
            "visitor_review_count": 0,
            "blog_review_count": 0,
            "distance_from_subway": None,
            "on_tv": False,
            "parking_available": False,
            "no_kids": False,
            "pet_available": False,
            "seoul_michelin": False,
            "age-2030" : None,
            "gender-balance": None,
            "on_blue_ribbon": None,
            "image_urls": [],
        }

        if keyword == "맛집":
            self.blue_ribbon_crawler = BlueRibbonCrawler(self.location)
            self.store_dict["is_food"] = True
            self.blue_ribbon_crawler.crawling()

        self.instagram_crawler = InstagramCrawler()


        self.driver = self.initialize_driver()
        self.wait = WebDriverWait(self.driver, 10)
        self.driver.implicitly_wait(10)


    def initialize_driver(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--enable-logging")
        options.add_argument("--v=1")  # 로그 레벨 설정

        driver = webdriver.Chrome(options=options)
        driver.get("https://map.naver.com/")

        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[0])

        return driver
    
    def search_keyword(self):
        
        self.driver.find_element(By.XPATH, self.empty_root)
        css_selector = ".input_search"
        elem = self.driver.find_element(By.CSS_SELECTOR, css_selector)
        time.sleep(1.5)
        elem.send_keys(self.search_word)
        time.sleep(1.5)
        elem.send_keys(Keys.RETURN)

    def get_into_store(self, i):
        self.driver.switch_to.default_content()
        self.driver.find_element(By.XPATH, self.empty_root)
        searchIframe = self.driver.find_element(By.XPATH, self.search_iframe)
        self.driver.switch_to.frame(searchIframe)
        self.driver.find_element(By.XPATH, self.empty_searchIframe)

        store_xpath = f"""//*[@id="_pcmap_list_scroll_container"]/ul/li[{i}]/div[1]/a[1]"""
        elem = self.wait.until(EC.element_to_be_clickable((By.XPATH, store_xpath)))
        time.sleep(2)
        if i!=1:
            self.driver.execute_script("arguments[0].scrollIntoView(true);", elem)
        self.driver.execute_script("arguments[0].click()", elem)

        time.sleep(1)

        self.driver.switch_to.default_content()
        self.driver.find_element(By.XPATH, self.empty_root)
        self.wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, "entryIframe")))

        # # 가끔 매장 클릭 시 "요청하신 페이지를 찾을 수 없습니다"라는 메시지가 뜸
        # # 이럴 경우, "새로고침" 버튼 실행
        self.driver.implicitly_wait(1)
        try:
            self.driver.find_element(By.XPATH, "//div[contains(text(), '요청하신 페이지를 찾을 수 없습니다.')]")
            reset_xpath = """//a[contains(text(), "새로고침)]"""
            reset_elem = self.driver.find_element(By.XPATH, reset_xpath)
            self.driver.execute_script("arguments[0].click()", reset_elem)
        except:
            pass

        self.driver.implicitly_wait(10)

    def get_store_details(self):
        # 매장 이름, 카테고리
        store_name_xpath = """//*[@id="_title"]/div/span"""
        elem = self.wait.until(EC.presence_of_all_elements_located((By.XPATH, store_name_xpath)))
        # elem = self.driver.find_elements(By.XPATH, store_name_xpath)

        self.store_dict['name'] = elem[0].text
        self.store_dict['category'] = elem[1].text

        self.driver.implicitly_wait(1)

        # 인스타그램 확인 및 크롤링
        try:
            elem = self.driver.find_element(By.XPATH, value= "//a[contains(text(), '인스타그램')]")
            instagram_url = elem.get_attribute('href')

            if "?" in instagram_url:
                instagram_url = instagram_url.split("?")[0]

            if instagram_url.count("/") >= 4:
                instagram_url = "/".join(instagram_url.split("/", 4)[:4])

            self.store_dict['instagram_link'] = instagram_url

        except NoSuchElementException:
            print("인스타 계정 없음")
            self.store_dict['instagram_link'] = None
            self.store_dict['instagram_post'] = None
            self.store_dict['instagram_follower'] = None
        except Exception as e:
            print(e)
            self.store_dict['instagram_link'] = None
            self.store_dict['instagram_post'] = None
            self.store_dict['instagram_follower'] = None

        # 방문자 리뷰, 블로그 리뷰 개수
        try:
            elem_visitor = self.driver.find_element(By.XPATH, value="//a[contains(text(), '방문자리뷰')]")
            elem_blog = self.driver.find_element(By.XPATH, value="//a[contains(text(), '블로그리뷰')]")

            visitor_review_count = int(re.findall(r'\d+', elem_visitor.text.replace(",", ""))[0])
            blog_review_count = int(re.findall(r'\d+', elem_blog.text.replace(",", ""))[0])

            self.store_dict['visitor_review_count'] = visitor_review_count
            self.store_dict['blog_review_count'] = blog_review_count
        except NoSuchElementException as e:
            print(e)
            self.store_dict['visitor_review_count'] = None
            self.store_dict['blog_review_count'] = None


        # 서울 미쉐린 가이드
        try:
            michelin_xpath = """//div[a[contains(text(), '미쉐린 가이드 서울')]]"""
            self.driver.find_element(By.XPATH, michelin_xpath)
            self.store_dict['seoul_michelin'] = True
        except:
            self.store_dict['seoul_michelin'] = False
            
        # 지하철역 출구로부터 거리
        try:
            subway_xpath = "/html/body/div[3]/div/div/div/div[5]/div/div[2]/div[1]/div/div[1]/div/div"
            elem = self.driver.find_element(By.XPATH, subway_xpath)
            text = elem.text

            numbers = re.findall(r'\d+', text)
            if numbers:
                self.store_dict["distance_from_subway"] = numbers[-1]
        except:
            self.store_dict["distance_from_subway"] = None

        # 방송 출연 여부
        try:
            tv_xpath =  """//strong[descendant::span[text()='TV방송정보']]"""
            self.driver.find_element(By.XPATH, tv_xpath)
            self.store_dict['on_tv'] = True
        except NoSuchElementException:
            self.store_dict['on_tv'] = False
        

        # 주차 가능, 반려동물 동반, 노키즈존
        try:
            convenient_xpath = "//strong[descendant::span[text()='편의']]/ancestor::div[1]/div/div"
            elem = self.driver.find_element(By.XPATH, convenient_xpath)
            convenients = elem.text

            for parking in ["주차", "발렛파킹"]:
                if parking in convenients:
                    self.store_dict["parking_available"] = True
                    break

            if "반려동물 동반" in convenients:
                self.store_dict["pet_available"] = True

            if "노키즈존" in convenients:
                self.store_dict["no_kids"] = True
        except NoSuchElementException as e:
            self.store_dict["parking_available"] = None
            self.store_dict["no_kids"] = None
            self.store_dict["pet_available"] = None

        # DataLab: 연령별 / 성별 검색 인기도
        try:
            last_height = self.driver.execute_script("return document.body.scrollHeight")

            while True:
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.5)
                new_height = self.driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            datalab_xpath = """//div[h2/span[contains(text(), '데이터랩')]]"""

            datalab_elem = self.driver.find_element(By.XPATH, datalab_xpath)
            self.driver.execute_script("arguments[0].scrollIntoView(true);", datalab_elem)

            try:
                theme_keyword_xpath = """.//div/div/div/h3[contains(text(), '테마키워드')]"""
                datalab_elem.find_element(By.XPATH, theme_keyword_xpath)

                # 더보기 버튼이 존재하는 경우
                button_elem = datalab_elem.find_element(By.XPATH, ".//div[2]/div/a")
                self.driver.execute_script("arguments[0].click()", button_elem)
            except:
                pass
            
            age_elements = self.driver.find_elements(By.XPATH, """//*[@id="bar_chart_container"]/ul/li/div[1]/span/span[1]""")
            percentage_by_age = [round(float(item.text.replace('%', '')), 2) for item in age_elements]

            top_two = sorted(percentage_by_age, reverse=True)[:2]
            is_in_top_two = percentage_by_age[1] in top_two and percentage_by_age[2] in top_two

            if is_in_top_two:
                self.store_dict["age-2030"] = True
            else:
                self.store_dict["age-2030"] = False

            gender_elements = self.driver.find_elements(By.XPATH, """//*[@id="pie_chart_container"]/div/*[local-name()='svg']/*[local-name()='g'][1]/*[local-name()='g'][3]/*[local-name()='g'][4]/*[local-name()='g']/*[local-name()='text'][2]""")

            female, male = [round(float(item.text.replace("%", "")), 0) for item in gender_elements]
            
            if male > 50:
                self.store_dict["gender-balance"] = False
            else:
                self.store_dict["gender-balance"] = True
        except NoSuchElementException:
            print("DataLab 없음")
            self.store_dict["age-2030"] = None
            self.store_dict["gender-balance"] = None

        if self.store_dict["is_food"] == True:
            if self.store_dict["name"] in self.blue_ribbon_crawler.data["name"].values:
                self.store_dict["on_blue_ribbon"] = True
            else:
                self.store_dict["on_blue_ribbon"] = False

        # 대표사진 크롤링
        try:
            imgtab_xpath = "//a[.//span[contains(text(),'사진')]]"
            elem = self.driver.find_element(By.XPATH, imgtab_xpath)
            elem.click()
            time.sleep(2)

            images_xpath = """/html/body/div[3]/div/div/div/div[6]/div[4]/div/div/div/div/a/img"""
        
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_all_elements_located((By.XPATH, images_xpath))
            )

            images = self.driver.find_elements(By.XPATH, images_xpath)
            image_urls = [img.get_attribute('src') for img in images][:10]

            self.store_dict["image_urls"] = image_urls
        except:
            pass

        # 인스타그램 크롤링
        if self.store_dict['instagram_link'] != None:
            try:
                instagram_embed_url = self.store_dict['instagram_link'] + "/embed"

                self.driver.switch_to.window(self.driver.window_handles[1])
                self.driver.get(instagram_embed_url)

                time.sleep(2)
            except Exception as e:
                print(e)
            self.driver.switch_to.window(self.driver.window_handles[0])
        
        print(self.store_dict)

        self.driver.implicitly_wait(5)
        
    def insert_into_dataframe(self):
        new_data = pd.DataFrame([self.store_dict])
        self.data = pd.concat([self.data, new_data], ignore_index=True)

    def crawling_one_page(self, page):
        # 첫 페이지는 54개의 매장
        # 다른 페이지는 50개의 매장 =
        store_count = 51
        if page == 1:
            store_count = 55
        for i in range(1, store_count):
            print("="*3+f"{i} 번째 매장"+ "="*3)
            self.get_into_store(i=i)
            self.get_store_details()
            self.insert_into_dataframe()

    def move_to_next_page(self):
        self.driver.switch_to.default_content()
        self.driver.find_element(By.XPATH, self.empty_root)
        self.wait.until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, self.search_iframe)))
        next_page_button = self.driver.find_element(By.XPATH, "//a[span[contains(text(),'다음페이지')]]")
        next_page_button.click()

        time.sleep(2)

In [28]:
if __name__ == "__main__":
    
    location = "신사역"
    keyword = "맛집"
    search_word = location + " " + keyword

    crawler = DatePopCrawler(location=location, keyword= keyword)

    crawler.search_keyword()
    for page in range(1, 7):
        print("="*10+f"page {page}"+ "="*10)
        crawler.crawling_one_page(page)
        crawler.move_to_next_page()
        print(crawler.data)

Crawling page 0......
Crawling page 1......
Crawling page 2......
Crawling page 3......
Crawling page 4......
Crawling page 5......
===1 번째 매장===
{'name': '대봉집 신사본점', 'category': '육류,고기요리', 'is_food': True, 'instagram_link': 'https://www.instagram.com/', 'instagram_post': None, 'instagram_follower': None, 'visitor_review_count': 647, 'blog_review_count': 607, 'distance_from_subway': '230', 'on_tv': False, 'parking_available': False, 'no_kids': False, 'pet_available': False, 'seoul_michelin': False, 'age-2030': True, 'gender-balance': True, 'on_blue_ribbon': False, 'image_urls': ['https://search.pstatic.net/common/?autoRotate=true&type=w560_sharpen&src=https%3A%2F%2Fnaverbooking-phinf.pstatic.net%2F20221011_152%2F1665459388566HDM7i_JPEG%2Fimage.jpg', 'https://search.pstatic.net/common/?autoRotate=true&type=w560_sharpen&src=https%3A%2F%2Fnaverbooking-phinf.pstatic.net%2F20221011_296%2F1665459681755e2Abv_JPEG%2Fimage.jpg', 'https://search.pstatic.net/common/?autoRotate=true&type=w560_shar

TimeoutException: Message: 


In [22]:
crawler.data

Unnamed: 0,name,category,is_food,instagram_link,instagram_post,instagram_follower,visitor_review_count,blog_review_count,distance_from_subway,on_tv,parking_available,no_kids,pet_available,seoul_michelin,age-2030,gender-balance,on_blue_ribbon,image_urls
0,봄의정원 가로수길점,양식,False,https://www.instagram.com/spring__garden_,,,845,1863,239,False,False,False,False,False,False,True,,[https://search.pstatic.net/common/?autoRotate...
1,따우전드 신사점,"카페,디저트",False,https://www.instagram.com/thousand_official_,,,72,130,578,False,False,False,True,False,False,True,,[https://search.pstatic.net/common/?autoRotate...
2,심퍼티쿠시 가로수길점,양식,False,https://www.instagram.com/szimpatikus.seoul,,,555,1583,184,False,False,False,True,False,True,True,,[https://search.pstatic.net/common/?autoRotate...
3,가로수길,"거리,골목",False,,,,3,5373,454,False,,,,False,False,True,,[https://search.pstatic.net/common/?autoRotate...
4,메리고라운드 스테이크,양식,False,https://www.instagram.com/merry_go_round_steak,,,3162,4993,561,True,True,,,False,False,True,,[https://search.pstatic.net/common/?autoRotate...
5,포사로,이자카야,False,https://www.instagram.com/fosaro.izakaya,,,190,647,295,False,True,,,False,True,True,,[https://search.pstatic.net/common/?autoRotate...
6,페페신사,이탈리아음식,False,https://www.instagram.com/pepe_sinsa,,,288,696,217,False,True,,True,False,True,True,,[https://search.pstatic.net/common/?autoRotate...
7,쇼토,"카페,디저트",False,http://instagram.com/shoto_patisserie,,,760,624,563,False,True,,True,False,True,True,,[https://search.pstatic.net/common/?autoRotate...
8,육회담,이자카야,False,,,,357,163,350,False,True,,True,False,,,,[https://search.pstatic.net/common/?autoRotate...
9,어썸로즈,양식,False,http://www.instagram.com/AwesomeRose_garosu,,,3650,4173,613,False,True,,True,False,,,,[https://search.pstatic.net/common/?autoRotate...
