In [50]:
import csv
import re
import time
from datetime import datetime, timedelta

import selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Set up the Selenium Chrome driver. `driver` is a module-level object that
# scrape_flights() uses for every page interaction and quits when it finishes.
options = Options()
options.add_argument("--headless")  # Remove this line to show the browser window
service = Service("/opt/homebrew/bin/chromedriver")  # Path to the ChromeDriver executable
driver = webdriver.Chrome(service=service, options=options)

# Output file. It is opened in append mode, so rows accumulate across dates
# (and across runs); the header is only written when the file is still empty.
_CSV_PATH = 'google_flights_data_nrt.csv'

_CSV_HEADER = [
    "出發日期", "出發時間", "出發機場代號",
    "抵達時間", "抵達機場代號", "航空公司",
    "停靠站數量", "停留時間", "飛行時間",
    "是否過夜", "機型", "航班代碼", "艙等", "價格歷史",
]

# Search page that is loaded for every date; the departure date is then
# changed through the on-page date picker rather than through the URL.
_SEARCH_URL = "https://www.google.com/travel/flights/search?tfs=CBwQAholEgoyMDI0LTEyLTE5KABqDAgCEggvbS8wZnRreHIHCAESA05SVEABSAFwAYIBCwj___________8BmAEC&tfu=EgIIASIA&authuser=0"

# Exact class attributes tried, in order, for a calendar day cell.
# NOTE(review): per the original author's notes the second and third variants
# were needed for the 12/31 and 01/01 cells -- confirm against the live page.
_DAY_CELL_CLASSES = (
    'WhDFk Io4vne',
    'WhDFk Io4vne Xu6rJc',
    'WhDFk Io4vne inxqCf',
)

# Values of the stop-count field that mean "no layover": the label compared
# against by the original code, and the fallback used when the label is missing.
_NON_STOP_LABELS = ("直達航班。", "Non-stop")

_PRICE_POINT_RE = re.compile(r"(\d+)\s*天前\s*-\s*\$([\d,]+)")
_FLIGHT_DURATION_RE = re.compile(r'(\d+ 小時 \d+ 分鐘)')
_LAYOVER_TIME_RE = re.compile(r'(\d+\s*小時\s*\d+\s*分鐘|\d+\s*分鐘)')


def _find_attribute(xpath, attribute):
    """Return *attribute* of the first element matching *xpath*, or None.

    None is returned both when no element matches and when the matched
    element does not carry the attribute, so callers need a single check.
    """
    try:
        return driver.find_element(By.XPATH, xpath).get_attribute(attribute)
    except NoSuchElementException:
        return None


def _price_history_with_dates(price_history_text, today):
    """Rewrite every "<N> 天前 - $<price>" entry as "<MM/DD> - $<price>".

    The date is *today* minus N days. Entries are joined with ", "; text
    without any matching entry yields an empty string.
    """
    return ", ".join(
        f"{(today - timedelta(days=int(days_ago))).strftime('%m/%d')} - ${price}"
        for days_ago, price in _PRICE_POINT_RE.findall(price_history_text)
    )


def _select_departure_date(iso_date, display_date):
    """Click the calendar cell for *iso_date*; return True on success.

    Each known cell class is tried in turn with a 20 second wait. If none
    becomes clickable, the failure and the last error are printed and False
    is returned.
    """
    last_error = None
    for cell_class in _DAY_CELL_CLASSES:
        xpath = (
            f"//div[@class='{cell_class}' and @data-iso='{iso_date}']"
            "//div[@role='button']"
        )
        try:
            day_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )
            day_button.click()
        except Exception as e:
            # Broad on purpose (kept from the original): any wait or click
            # failure just means "try the next class variant".
            last_error = e
        else:
            print(f"成功選擇出發日期 {display_date}")
            return True

    print(f"無法選擇出發日期 {display_date}", last_error)
    return False


def _scrape_flight_row():
    """Read the currently open flight-detail view and return one CSV row.

    The returned list follows the column order of ``_CSV_HEADER``. A field
    whose element cannot be found keeps "null" or receives a field-specific
    "not found" text, exactly as the messages below show; a missing field
    never aborts the row.
    """
    # Fields that keep the "null" placeholder when their element is missing.
    departure_date = departure_time = arrival_time = "null"
    airline = flight_duration = aircraft = "null"

    # Departure date: only the first 9 characters of the element are kept.
    raw_date = _find_attribute("//span[contains(@class, 'mv1WYe')]", "innerHTML")
    if raw_date is None:
        print("出發日期抓取失敗")
    else:
        departure_date = raw_date[:9].strip()

    # Departure / arrival time: the part of the aria-label after the last "：".
    departure_label = _find_attribute(
        "//div[@class='wtdjmc YMlIz ogfYpf tPgKwe']", "aria-label"
    )
    if departure_label is None:
        print("出發時間抓取失敗")
    else:
        departure_time = departure_label.split("：")[-1].strip()

    arrival_label = _find_attribute(
        "//div[@class='XWcVob YMlIz ogfYpf tPgKwe']", "aria-label"
    )
    if arrival_label is None:
        print("抵達時間抓取失敗")
    else:
        arrival_time = arrival_label.split("：")[-1].strip()

    # Airport codes: first match is the departure airport, second the arrival.
    airport_elements = driver.find_elements(
        By.XPATH,
        "//span[contains(@class, 'qeoz6e HKHSfd')]/following-sibling::span[@dir='ltr']",
    )
    if airport_elements:
        departure_airport = airport_elements[0].get_attribute("innerHTML").strip("()")
    else:
        departure_airport = "未找到出發機場代碼"
    if len(airport_elements) > 1:
        arrival_airport = airport_elements[1].get_attribute("innerHTML").strip("()")
    else:
        arrival_airport = "未找到抵達機場代碼"

    # Airline name.
    try:
        airline = driver.find_element(
            By.XPATH, "//div[contains(@class, 'sSHqwe')]/span[1]"
        ).text
    except NoSuchElementException:
        print("航空公司抓取失敗")

    # Total travel time ("<h> 小時 <m> 分鐘").
    duration_html = _find_attribute(
        "//div[@class='gvkrdb AdWm1c tPgKwe ogfYpf']", "innerHTML"
    )
    if duration_html is None:
        print("飛行時間抓取失敗")
    else:
        duration_match = _FLIGHT_DURATION_RE.search(duration_html)
        flight_duration = duration_match.group(1) if duration_match else "未找到飛行時間"

    # Number of stops. A missing label is treated as a non-stop flight.
    # NOTE(review): the " flight." split assumes an English label while the
    # non-stop comparison uses a Chinese one -- confirm the page locale.
    layover_label = _find_attribute(
        "//div[@class='EfT7Ae AdWm1c tPgKwe']//span[@class='ogfYpf']", "aria-label"
    )
    layover = "Non-stop" if layover_label is None else layover_label.split(" flight.")[0]

    # Layover duration is only looked up when the flight actually has a stop.
    if layover in _NON_STOP_LABELS:
        layover_time = "Non-stop"
    else:
        layover_html = _find_attribute(
            '//div[@class = "tvtJdb eoY5cb y52p7d"]', "innerHTML"
        )
        layover_match = (
            _LAYOVER_TIME_RE.search(layover_html) if layover_html is not None else None
        )
        layover_time = layover_match.group(1) if layover_match else "未找到停留時間"

    # Overnight marker: only its presence matters.
    # NOTE(review): matches the English text "Overnight" -- confirm it also
    # appears under the locale the other selectors assume.
    try:
        driver.find_element(
            By.XPATH, '//div[@class="qj0iCb" and contains(text(), "Overnight")]'
        )
        overnight = "Yes"
    except NoSuchElementException:
        overnight = "No"

    # Aircraft type.
    aircraft_html = _find_attribute(
        '//div[@class="MX5RWe sSHqwe y52p7d"]/span[@class = "Xsgmwe"][last()]',
        "innerHTML",
    )
    if aircraft_html is None:
        print("機型抓取失敗")
    else:
        aircraft = aircraft_html

    # Flight number, with HTML non-breaking spaces turned into plain spaces.
    flight_number_html = _find_attribute(
        '//div[@class="MX5RWe sSHqwe y52p7d"]/span[contains(@class, "Xsgmwe")][2]',
        "innerHTML",
    )
    if flight_number_html is None:
        flight_number = "未找到航班代碼"
    else:
        flight_number = flight_number_html.replace('&nbsp;', ' ').strip()

    # Cabin class.
    cabin_html = _find_attribute('//span[contains(@class, "Xsgmwe")]/div', "innerHTML")
    cabin_class = "未找到艙等" if cabin_html is None else cabin_html

    # Price history: one aria-label per chart element, with "N days ago"
    # converted to a calendar date relative to today. An element without an
    # aria-label contributes an empty entry instead of raising.
    today = datetime.today()
    chart_elements = driver.find_elements(
        By.XPATH, "//*[name()='g' and @class='ke9kZe-LkdAo-RbRzK-JNdkSc pKrx3d']"
    )
    price_history = ', '.join(
        _price_history_with_dates(element.get_attribute("aria-label") or "", today)
        for element in chart_elements
    )

    return [
        departure_date, departure_time, departure_airport,
        arrival_time, arrival_airport, airline,
        layover, layover_time, flight_duration,
        overnight, aircraft, flight_number, cabin_class,
        price_history,
    ]


def _scrape_date(current_date):
    """Scrape every listed flight for *current_date* and append it to the CSV.

    Returns without writing anything when the date cannot be selected in the
    calendar. An exception raised while waiting for the result list is not
    caught here and propagates to the caller.
    """
    iso_date = current_date.strftime('%Y-%m-%d')
    display_date = current_date.strftime('%Y 年 %m 月 %d 日')
    print(f"正在抓取日期: {iso_date}")

    driver.get(_SEARCH_URL)

    # Open the departure date picker. Failure is only reported; the date
    # selection below decides whether this date is skipped.
    try:
        departure_date_picker = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'TP4Lpb'))
        )
        departure_date_picker.click()
        print("成功點擊出發日期選擇器")
    except Exception as e:
        print("無法找到出發日期選擇器", e)

    # Give the calendar time to finish rendering.
    time.sleep(3)

    if not _select_departure_date(iso_date, display_date):
        return

    # Confirm the selection with the "Done" button (best effort).
    try:
        done_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//div[@class="WXaAwc"]//div//button'))
        )
        done_button.click()
        print("成功點擊 'Done' 按鈕")
    except Exception as e:
        print("無法找到 'Done' 按鈕", e)

    time.sleep(3)

    # Wait for the result list to be present.
    flight_links = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.pIav2d"))
    )
    flight_count = len(flight_links)
    print(f"找到 {flight_count} 個航班")

    with open(_CSV_PATH, 'a', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)

        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if csv_file.tell() == 0:
            csv_writer.writerow(_CSV_HEADER)

        for index in range(flight_count):
            # Re-query the list on every pass: navigating to a detail view
            # and back invalidates previously fetched elements.
            flight_links = driver.find_elements(By.CSS_SELECTOR, "li.pIav2d")
            if index >= len(flight_links):
                print(f"索引 {index} 超出範圍，停止操作")
                break

            flight_links[index].click()
            time.sleep(5)  # wait for the detail view to load

            csv_writer.writerow(_scrape_flight_row())

            driver.back()
            time.sleep(2)  # wait for the result list to reload


def scrape_flights(start_date_str, end_date_str):
    """Scrape flight details for every date in an inclusive date range.

    For each date the search page is loaded, the departure date is chosen
    in the date picker, and every listed flight is opened in turn; one row
    per flight is appended to ``google_flights_data_nrt.csv``. The header
    row is written only when that file is empty.

    Args:
        start_date_str: First date to scrape, formatted ``YYYY-MM-DD``.
        end_date_str: Last date to scrape (inclusive), formatted ``YYYY-MM-DD``.

    Raises:
        ValueError: If either argument does not match ``YYYY-MM-DD``. The
            dates are parsed before the browser is touched, so the
            module-level ``driver`` stays usable in that case.

    Any exception raised while scraping (for example when the flight list
    never appears) propagates, but the browser is always closed first.
    """
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
    delta = timedelta(days=1)

    try:
        current_date = start_date
        while current_date <= end_date:
            _scrape_date(current_date)
            current_date += delta
    finally:
        # Close the browser even when scraping fails part-way through.
        driver.quit()

# Choose the first and last date to scrape (YYYY-MM-DD, both inclusive).
start_date_input = "2024-12-20"
end_date_input = "2024-12-31"

# Run the scraper; it closes the browser when it finishes.
scrape_flights(start_date_input, end_date_input)

正在抓取日期: 2024-12-27
成功點擊出發日期選擇器
成功選擇出發日期 2024 年 12 月 27 日
成功點擊 'Done' 按鈕
找到 22 個航班
