# In [None]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import re
import urllib.parse

class RestaurantRatingCrawler:
    """Scrape restaurant listings from Naver Place, KakaoMap and Siksin.

    Every crawl method returns a list of plain dicts with at least the keys
    ``name``, ``rating``, ``category``, ``address`` and ``source``; fields
    that cannot be located on the page are filled with the string ``"N/A"``.
    The crawl methods need a Selenium driver and return an empty list when
    none is available.

    The instance can be used as a context manager so that the browser and
    HTTP session are always released::

        with RestaurantRatingCrawler() as crawler:
            rows = crawler.crawl_kakao_map("...")
    """

    # Ordered (field, CSS selector) pairs for the optional fields of each
    # site. The order defines the key order of the result dicts (and thus
    # the CSV column order).
    # NOTE(review): the selectors mirror the sites' markup at the time of
    # writing — confirm they still match the live pages.
    _NAVER_FIELDS = (
        ('rating', ".rating_real"),
        ('review_count', ".rating_real + span"),
        ('category', ".category"),
        ('address', ".addr"),
    )
    _SIKSINHOT_FIELDS = (
        ('rating', ".rate_point"),
        ('category', ".category"),
        ('address', ".address"),
    )
    _KAKAO_FIELDS = (
        ('rating', ".rating .num"),
        ('category', ".subcategory"),
        ('address', ".addr"),
    )

    def __init__(self, use_selenium=True):
        """Create the HTTP session and, optionally, the Selenium driver.

        Args:
            use_selenium: When true, try to start a headless Chrome driver.
                If that fails ``self.driver`` stays ``None``.
        """
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        self.driver = None
        if use_selenium:
            self.setup_selenium()

    def __enter__(self):
        """Return the crawler itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the driver and session; exceptions are not suppressed."""
        self.close()
        return False

    def setup_selenium(self):
        """Start a headless Chrome driver and store it in ``self.driver``.

        Any failure is reported on stdout and leaves ``self.driver`` unset,
        so the crawl methods degrade to returning empty lists.
        """
        try:
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

            self.driver = webdriver.Chrome(options=chrome_options)
            self.driver.implicitly_wait(10)
            print("Selenium 드라이버 초기화 완료")
        except Exception as e:
            print(f"Selenium 설정 실패: {e}")

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _text_or_default(element, selector, default="N/A"):
        """Return the stripped text of ``selector`` under ``element``.

        Returns ``default`` when no matching child element exists.
        """
        try:
            return element.find_element(By.CSS_SELECTOR, selector).text.strip()
        except NoSuchElementException:
            return default

    def _extract_info(self, element, name_selector, optional_fields, source, label):
        """Build one result dict from a list-item element.

        The name is mandatory; every optional field falls back to "N/A".
        Returns ``None`` (after printing the error) if extraction fails.
        """
        try:
            info = {'name': element.find_element(By.CSS_SELECTOR, name_selector).text.strip()}
            for field, selector in optional_fields:
                info[field] = self._text_or_default(element, selector)
            info['source'] = source
            return info
        except Exception as e:
            print(f"{label} 정보 추출 실패: {e}")
            return None

    @staticmethod
    def _collect_from(elements, extractor, restaurants, max_results, label):
        """Append extracted records to ``restaurants`` until it is full.

        Returns the number of elements consumed (successful or not), so the
        caller can resume after the last element it has already looked at
        instead of re-processing items that follow a failed extraction.
        """
        consumed = 0
        for element in elements:
            if len(restaurants) >= max_results:
                break
            consumed += 1
            try:
                info = extractor(element)
            except Exception as e:
                print(f"{label} 정보 추출 실패: {e}")
                continue
            if info:
                restaurants.append(info)
        return consumed

    @staticmethod
    def _ensure_parent_dir(filename):
        """Create the directory that will contain ``filename`` if needed."""
        import os

        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

    @staticmethod
    def _parse_rating(rating_str):
        """Return the first number found in ``rating_str`` as a float, else None."""
        match = re.search(r'\d+\.?\d*', str(rating_str))
        return float(match.group()) if match else None

    # ------------------------------------------------------------------
    # Naver Place
    # ------------------------------------------------------------------
    def crawl_naver_place(self, search_keyword, max_results=50, max_scroll=10):
        """Collect restaurants from the Naver Map search result list.

        Args:
            search_keyword: Search text; it is URL-quoted before use.
            max_results: Maximum number of records to return.
            max_scroll: Maximum number of times the result list is scrolled
                to load more entries.

        Returns:
            A list of result dicts (possibly empty). Errors are printed and
            whatever was collected so far is returned.
        """
        restaurants = []

        try:
            if not self.driver:
                print("Selenium 드라이버가 필요합니다.")
                return restaurants

            search_url = f"https://map.naver.com/p/search/{urllib.parse.quote(search_keyword)}"
            self.driver.get(search_url)
            time.sleep(3)

            # The result list is looked up inside the "searchIframe" frame.
            try:
                search_iframe = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.ID, "searchIframe"))
                )
                self.driver.switch_to.frame(search_iframe)
            except (TimeoutException, NoSuchElementException):
                print("검색 프레임을 찾을 수 없습니다.")
                return restaurants

            try:
                processed = 0  # elements already examined, successful or not
                scroll_count = 0
                while len(restaurants) < max_results and scroll_count < max_scroll:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, "li[data-id]")
                    processed += self._collect_from(
                        elements[processed:],
                        self.extract_naver_restaurant_info,
                        restaurants,
                        max_results,
                        "네이버 플레이스",
                    )
                    if len(restaurants) >= max_results:
                        break

                    # Scroll the list container to its bottom to load more rows.
                    self.driver.execute_script(
                        "arguments[0].scrollTop = arguments[0].scrollHeight",
                        self.driver.find_element(By.CSS_SELECTOR, ".scroll_box"),
                    )
                    time.sleep(2)
                    scroll_count += 1
            finally:
                # Always leave the iframe, even if scrolling/extraction failed.
                self.driver.switch_to.default_content()

        except Exception as e:
            print(f"네이버 플레이스 크롤링 오류: {e}")

        return restaurants

    def extract_naver_restaurant_info(self, element):
        """Extract one Naver Place record from a result list item.

        Returns a dict with keys ``name``, ``rating``, ``review_count``,
        ``category``, ``address`` and ``source`` ('naver'), or ``None`` when
        the name element cannot be read.
        """
        return self._extract_info(
            element, ".place_bluelink", self._NAVER_FIELDS, 'naver', "네이버"
        )

    # ------------------------------------------------------------------
    # Siksin
    # ------------------------------------------------------------------
    def crawl_siksinhot(self, search_keyword, max_results=30):
        """Collect restaurants from the Siksin search page.

        Keeps clicking the "more" button while it is displayed and new list
        items keep appearing, until ``max_results`` records are collected.

        Returns:
            A list of result dicts (possibly empty). Errors are printed and
            whatever was collected so far is returned.
        """
        restaurants = []

        try:
            if not self.driver:
                print("Selenium 드라이버가 필요합니다.")
                return restaurants

            search_url = f"https://www.siksinhot.com/search?keywords={urllib.parse.quote(search_keyword)}"
            self.driver.get(search_url)
            time.sleep(3)

            processed = 0  # elements already examined, successful or not
            clicked_more = False
            while len(restaurants) < max_results:
                elements = self.driver.find_elements(By.CSS_SELECTOR, ".item_list li")
                fresh = elements[processed:]
                # A click that produced no new items means the list is
                # exhausted; stop instead of clicking forever.
                if clicked_more and not fresh:
                    break

                processed += self._collect_from(
                    fresh, self.extract_siksinhot_info, restaurants, max_results, "식신"
                )
                if len(restaurants) >= max_results:
                    break

                try:
                    more_button = self.driver.find_element(By.CSS_SELECTOR, ".btn_more")
                except NoSuchElementException:
                    break
                if not more_button.is_displayed():
                    break
                more_button.click()
                time.sleep(2)
                clicked_more = True

        except Exception as e:
            print(f"식신 크롤링 오류: {e}")

        return restaurants

    def extract_siksinhot_info(self, element):
        """Extract one Siksin record from a result list item.

        Returns a dict with keys ``name``, ``rating``, ``category``,
        ``address`` and ``source`` ('siksinhot'), or ``None`` when the name
        element cannot be read.
        """
        return self._extract_info(
            element, ".store_name", self._SIKSINHOT_FIELDS, 'siksinhot', "식신"
        )

    # ------------------------------------------------------------------
    # KakaoMap
    # ------------------------------------------------------------------
    def crawl_kakao_map(self, search_keyword, max_results=40, max_pages=5):
        """Collect restaurants from the KakaoMap search result pages.

        Args:
            search_keyword: Search text; it is URL-quoted before use.
            max_results: Maximum number of records to return.
            max_pages: Maximum number of result pages to visit.

        Returns:
            A list of result dicts (possibly empty). Errors are printed and
            whatever was collected so far is returned.
        """
        restaurants = []

        try:
            if not self.driver:
                print("Selenium 드라이버가 필요합니다.")
                return restaurants

            search_url = f"https://map.kakao.com/?q={urllib.parse.quote(search_keyword)}"
            self.driver.get(search_url)
            time.sleep(3)

            page = 1
            while len(restaurants) < max_results and page <= max_pages:
                # Every element found on the current page is processed.
                elements = self.driver.find_elements(By.CSS_SELECTOR, ".placelist > .PlaceItem")
                self._collect_from(
                    elements, self.extract_kakao_restaurant_info, restaurants, max_results, "카카오맵"
                )
                if len(restaurants) >= max_results:
                    break

                try:
                    next_button = self.driver.find_element(By.ID, "info\\.search\\.page\\.next")
                except NoSuchElementException:
                    break
                if not next_button.is_enabled():
                    break
                next_button.click()
                time.sleep(2)
                page += 1

        except Exception as e:
            print(f"카카오맵 크롤링 오류: {e}")

        return restaurants

    def extract_kakao_restaurant_info(self, element):
        """Extract one KakaoMap record from a result list item.

        Returns a dict with keys ``name``, ``rating``, ``category``,
        ``address`` and ``source`` ('kakaomap'), or ``None`` when the name
        element cannot be read.
        """
        return self._extract_info(
            element, ".link_name", self._KAKAO_FIELDS, 'kakaomap', "카카오맵"
        )

    # ------------------------------------------------------------------
    # Aggregation, persistence and analysis
    # ------------------------------------------------------------------
    def crawl_all_sources(self, search_keyword, max_results_per_source=30):
        """Run every crawler in turn and return the concatenated results.

        The order is Naver Place, KakaoMap, Siksin; progress is printed for
        each source.
        """
        all_restaurants = []
        sources = (
            ("네이버 플레이스", self.crawl_naver_place),
            ("카카오맵", self.crawl_kakao_map),
            ("식신", self.crawl_siksinhot),
        )

        for label, crawl in sources:
            print(f"=== {label} 크롤링 시작 ===")
            found = crawl(search_keyword, max_results_per_source)
            all_restaurants.extend(found)
            print(f"{label}: {len(found)}개 수집")

        return all_restaurants

    def remove_duplicates(self, restaurants):
        """Return ``restaurants`` without entries whose name was seen before.

        Names are compared after removing every character that is not a
        word character or Hangul syllable and lower-casing the rest; the
        first occurrence wins and input order is preserved.
        """
        seen_names = set()
        unique_restaurants = []

        for restaurant in restaurants:
            name_normalized = re.sub(r'[^\w가-힣]', '', restaurant['name']).lower()
            if name_normalized in seen_names:
                continue
            seen_names.add(name_normalized)
            unique_restaurants.append(restaurant)

        return unique_restaurants

    def save_to_csv(self, data, filename="data/restaurant_ratings.csv"):
        """Write ``data`` to ``filename`` as CSV (UTF-8 with BOM).

        The target directory is created when it does not exist yet.
        """
        self._ensure_parent_dir(filename)
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"데이터가 {filename}에 저장되었습니다.")

    def save_to_json(self, data, filename="data/restaurant_ratings.json"):
        """Write ``data`` to ``filename`` as indented, non-ASCII-escaped JSON.

        The target directory is created when it does not exist yet.
        """
        self._ensure_parent_dir(filename)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"데이터가 {filename}에 저장되었습니다.")

    def analyze_ratings(self, restaurants):
        """Print summary statistics and return the data as a DataFrame.

        A float column ``rating_numeric`` is added, holding the first number
        found in each ``rating`` value (NaN when there is none). An empty
        input, or input without ``rating`` / ``source`` keys, is handled
        without raising.

        Returns:
            The DataFrame built from ``restaurants`` plus ``rating_numeric``.
        """
        df = pd.DataFrame(restaurants)

        if 'rating' in df.columns:
            # to_numeric guarantees a float column even when every value is None.
            df['rating_numeric'] = pd.to_numeric(
                df['rating'].apply(self._parse_rating), errors='coerce'
            )
        else:
            df['rating_numeric'] = pd.Series(dtype='float64')

        print("\n=== 평점 데이터 분석 ===")
        print(f"전체 음식점 수: {len(df)}")
        print(f"평점 데이터가 있는 음식점: {df['rating_numeric'].notna().sum()}개")

        if df['rating_numeric'].notna().any():
            print(f"평균 평점: {df['rating_numeric'].mean():.2f}")
            print(f"최고 평점: {df['rating_numeric'].max():.2f}")
            print(f"최저 평점: {df['rating_numeric'].min():.2f}")

            if 'source' in df.columns:
                print("\n=== 소스별 통계 ===")
                source_stats = df.groupby('source')['rating_numeric'].agg(['count', 'mean']).round(2)
                print(source_stats)

        return df

    def close(self):
        """Quit the browser (if any) and close the HTTP session.

        Safe to call more than once: the driver reference is cleared after
        the first call.
        """
        try:
            if self.driver:
                self.driver.quit()
                print("드라이버가 종료되었습니다.")
        finally:
            self.driver = None
            self.session.close()

# 메인 실행 함수
def main():
    """Crawl all sources for one keyword, then preview, analyse and save.

    Analysis and saving are skipped when nothing was collected, so an empty
    crawl ends with a zero count rather than an analysis error. Any
    exception is printed, and the crawler is always closed.
    """
    crawler = RestaurantRatingCrawler()

    try:
        # Change this to the search phrase you want.
        search_keyword = "강남 맛집"

        print(f"'{search_keyword}' 검색 시작...")

        all_restaurants = crawler.crawl_all_sources(search_keyword, max_results_per_source=20)
        unique_restaurants = crawler.remove_duplicates(all_restaurants)

        print(f"\n총 {len(all_restaurants)}개 수집, 중복 제거 후 {len(unique_restaurants)}개")

        # Preview the first five records.
        print("\n=== 수집된 음식점 예시 ===")
        for rank, restaurant in enumerate(unique_restaurants[:5], start=1):
            print(f"\n{rank}. {restaurant['name']}")
            print(f"   평점: {restaurant['rating']}")
            print(f"   카테고리: {restaurant['category']}")
            print(f"   주소: {restaurant['address']}")
            print(f"   출처: {restaurant['source']}")

        # Only analyse and persist when there is something to work with.
        if unique_restaurants:
            crawler.analyze_ratings(unique_restaurants)
            crawler.save_to_csv(unique_restaurants)
            crawler.save_to_json(unique_restaurants)

    except Exception as e:
        print(f"크롤링 실행 중 오류: {e}")

    finally:
        crawler.close()

# Run the crawl only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

# 간단한 사용 예시
"""
# 특정 소스만 사용하는 경우
crawler = RestaurantRatingCrawler()

# 네이버 플레이스만 크롤링
naver_restaurants = crawler.crawl_naver_place("홍대 맛집", 30)

# 카카오맵만 크롤링  
kakao_restaurants = crawler.crawl_kakao_map("이태원 맛집", 30)

# 결과 저장
all_data = naver_restaurants + kakao_restaurants
crawler.save_to_csv(all_data, "my_restaurants.csv")

crawler.close()
"""