In [1]:
%pip install selenium beautifulsoup4 webdriver_manager requests lxml

Note: you may need to restart the kernel to use updated packages.


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def init_driver(headless=True):
    """Create a Chrome WebDriver configured to resemble a regular desktop browser.

    Args:
        headless: When True (the default) Chrome is started with
            ``--headless=new`` together with ``--disable-gpu``,
            ``--no-sandbox`` and ``--disable-dev-shm-usage``.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()``.
    """
    options = webdriver.ChromeOptions()

    if headless:
        options.add_argument("--headless=new")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

    # Flags intended to make the automated session look like an ordinary browser window.
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--start-maximized")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    # Report a stock desktop Chrome user agent.
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    options.add_argument(f"user-agent={user_agent}")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    try:
        # execute_script alone only patches the document that is loaded right now
        # (the initial blank page), so the override would be gone after the first
        # navigation. Registering the snippet through CDP makes Chrome evaluate it
        # at the start of every new document.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": hide_webdriver_js},
        )
        # Keep patching the current document as the original code did.
        driver.execute_script(hide_webdriver_js)
    except Exception:
        # Do not leave an orphaned browser process behind if the setup fails.
        driver.quit()
        raise

    return driver

In [3]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def scroll_down(driver: webdriver.Chrome) -> bool:
    """Jump to the bottom and back to the top of the page until its height stops changing.

    Each round sends END and then HOME to the ``<body>`` element, pausing one
    second after each key press, and then re-measures the document height.

    Args:
        driver: The Chrome driver whose current page is scrolled.

    Returns:
        True once two consecutive height measurements are equal.
    """
    # Largest of the body/html height metrics, evaluated in the page.
    page_height_js = """
    var body = document.body,
    html = document.documentElement;

    var height = Math.max( body.scrollHeight, body.offsetHeight, 
    html.clientHeight, html.scrollHeight, html.offsetHeight )
    
    return height;"""

    def measure_height():
        return driver.execute_script(page_height_js)

    page_body = driver.find_element(By.TAG_NAME, 'body')
    previous_height = measure_height()

    while True:
        for key in (Keys.END, Keys.HOME):
            page_body.send_keys(key)
            time.sleep(1)

        current_height = measure_height()
        if current_height == previous_height:
            # Nothing new was added by the last scroll round.
            return True
        previous_height = current_height

In [4]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def next_page(driver: webdriver.Chrome):
    """Click the pagination "next" button and wait for the following page.

    Waits up to 5 seconds for the button to become clickable, moves the
    pointer onto it, clicks, and then sleeps 5 seconds.

    Args:
        driver: The Chrome driver showing a results page.

    Returns:
        True when the click went through; False when any exception occurred
        (the exception is printed, not raised).
    """
    try:
        clickable_next = EC.element_to_be_clickable(
            (By.XPATH,'//button[@data-selenium="pagination-next-btn"]')
        )
        next_button = WebDriverWait(driver, 5).until(clickable_next)

        # Hover first, then click.
        hover = ActionChains(driver).move_to_element(next_button)
        hover.perform()
        next_button.click()

        # Fixed pause after the click before the caller reads the page again.
        time.sleep(5)
        return True
    except Exception as e:
        print(f"Error loading more results: {e}")
        return False

In [5]:
from bs4 import BeautifulSoup
from typing import Dict, Optional

def extract_name(hotel: BeautifulSoup) -> Dict[str, Optional[str]]:
    """Return ``{"name": ...}`` for a hotel card.

    The name is the stripped text of the first element carrying
    ``data-selenium="hotel-name"``; it is None when no such element exists.
    """
    name_tag = hotel.find(attrs={"data-selenium": "hotel-name"})
    if not name_tag:
        return {"name": None}
    return {"name": name_tag.get_text(strip=True)}


In [6]:
from bs4 import BeautifulSoup
from typing import Dict, Optional
import re

def extract_price(hotel: BeautifulSoup) -> Dict[str, Optional[int]]:
    """Return the discounted and original prices shown on a hotel card.

    ``discounted_price`` comes from the ``data-element-name="final-price"``
    element; ``original_price`` comes from the first element whose class matches
    ``PropertyCardPrice--(CrossOut|Original)``. In both cases commas are removed
    and the first run of digits is converted to int. When no original price can
    be read, it falls back to the discounted price (which may itself be None).
    """
    def first_integer(tag) -> Optional[int]:
        # First run of digits in the tag text once thousands separators are dropped.
        digits = re.search(r'(\d+)', tag.get_text(strip=True).replace(',', ''))
        return int(digits.group(1)) if digits else None

    final_price_tag = hotel.find(attrs={"data-element-name": "final-price"})
    discounted_price = first_integer(final_price_tag) if final_price_tag else None

    crossed_out_tag = hotel.find(class_=re.compile(r"PropertyCardPrice--(CrossOut|Original)"))
    original_price = first_integer(crossed_out_tag) if crossed_out_tag else None
    if original_price is None:
        original_price = discounted_price

    return {"discounted_price": discounted_price, "original_price": original_price}


In [7]:
from bs4 import BeautifulSoup
from typing import Dict, Optional
import re

def extract_rating(hotel: BeautifulSoup) -> Dict[str, Optional[float]]:
    """Return ``{"star_rating": ...}`` parsed from a "<n> stars out of ..." span.

    The rating is read from the first ``<span>`` whose string contains
    "stars out of". Whole-number ratings are returned as int (as before);
    fractional ratings such as "4.5 stars" are returned as float. The value is
    None when the span is missing or contains no number before "stars".
    """
    star_tag = hotel.find("span", string=lambda t: t and "stars out of" in t)
    if not star_tag:
        return {"star_rating": None}

    # Capture the whole decimal number. The previous pattern (\d+) matched only
    # the digits directly before "stars", so "4.5 stars" was read as 5 and
    # "4.0 stars" as 0.
    match = re.search(r'(\d+(?:\.\d+)?)\s*stars', star_tag.get_text(strip=True))
    if not match:
        return {"star_rating": None}

    rating = float(match.group(1))
    star_rating = int(rating) if rating.is_integer() else rating
    return {"star_rating": star_rating}


In [8]:
from bs4 import BeautifulSoup
from typing import Dict, Optional, Union
import re

def extract_review_info(hotel: BeautifulSoup) -> Dict[str, Optional[Union[float, str, int]]]:
    """Return review score, review title and review count for a hotel card.

    Everything is read from the ``data-element-name="property-card-review"``
    container, using the ``<p>`` elements whose class matches ``sc-jrAGrp``:

    * the first such ``<p>`` supplies ``review_score`` (first ``<span>``, parsed
      as float, None if not numeric) and ``review_title`` (second ``<span>``),
      provided it has at least two spans;
    * the first such ``<p>`` whose text contains "reviews" and a number supplies
      ``number_of_reviews``.

    Any value that cannot be found is None.

    NOTE(review): ``sc-jrAGrp`` looks like a generated CSS class name and may
    change between site builds — confirm it is still current.
    """
    review_score: Optional[float] = None
    review_title: Optional[str] = None
    number_of_reviews: Optional[int] = None

    review_container = hotel.find(attrs={"data-element-name": "property-card-review"})
    if review_container:
        paragraphs = review_container.find_all("p", class_=re.compile("sc-jrAGrp"))

        if paragraphs:
            spans = paragraphs[0].find_all("span")
            if len(spans) >= 2:
                try:
                    review_score = float(spans[0].get_text(strip=True))
                except ValueError:
                    review_score = None
                review_title = spans[1].get_text(strip=True)

        for paragraph in paragraphs:
            text = paragraph.get_text()
            if "reviews" not in text:
                continue
            # The match must start with a digit. The previous pattern [\d,]+
            # could match a lone comma (e.g. "Good, 12 reviews"), which made
            # int("") raise ValueError.
            num_match = re.search(r'(\d[\d,]*)', text)
            if num_match:
                number_of_reviews = int(num_match.group(1).replace(',', ''))
                break

    return {
        "review_score": review_score,
        "review_title": review_title,
        "number_of_reviews": number_of_reviews
    }


In [9]:
from bs4 import BeautifulSoup
from typing import Dict, Optional, Union
import re

def extract_neighborhood(hotel: BeautifulSoup) -> Dict[str, Optional[Union[str, float]]]:
    """Return the neighborhood name and distance from center for a hotel card.

    Both values come from the text of the ``data-selenium="area-city-text"``
    element. ``neighborhood`` is the part before the first comma (None when the
    element is missing or empty). ``distance_from_center`` is the first number
    followed by "km" or "mi", expressed in kilometres (miles are multiplied by
    1.60934); it is 0.0 when no such number is present.
    """
    area_tag = hotel.find(attrs={"data-selenium": "area-city-text"})
    raw_text = area_tag.get_text(strip=True) if area_tag else ""

    neighborhood: Optional[str] = None
    if raw_text:
        neighborhood = raw_text.split(',')[0].strip()

    distance = 0.0
    found = re.search(r'([\d\.]+)\s*(km|mi)', raw_text)
    if found:
        amount, unit = found.group(1), found.group(2)
        distance = float(amount)
        if unit == "mi":
            # Convert miles to kilometres.
            distance *= 1.60934

    return {"neighborhood": neighborhood, "distance_from_center": distance}


In [10]:
from bs4 import BeautifulSoup
from typing import Dict, Optional

def extract_booking_options(hotel: BeautifulSoup) -> Dict[str, Optional[bool]]:
    """Return cancellation, prepayment and breakfast flags for a hotel card.

    ``free_cancellation`` and ``prepayment_needed`` are derived from the
    ``data-is-free-cancellation`` / ``data-is-pay-at-hotel`` attributes of the
    first ``<a>`` in the card (a missing attribute counts as "false"); both are
    None when the card has no ``<a>``. ``breakfast_included`` is True when any
    ``data-element-name="pill-each-item"`` element mentions "Breakfast".
    """
    def flag_is_true(tag, attribute: str) -> bool:
        return tag.get(attribute, "false").lower() == "true"

    first_link = hotel.find("a")
    if first_link:
        free_cancellation: Optional[bool] = flag_is_true(first_link, "data-is-free-cancellation")
        # Prepayment is needed exactly when pay-at-hotel is not offered.
        prepayment_needed: Optional[bool] = not flag_is_true(first_link, "data-is-pay-at-hotel")
    else:
        free_cancellation = None
        prepayment_needed = None

    pills = hotel.find_all(attrs={"data-element-name": "pill-each-item"})
    breakfast_included: bool = any("Breakfast" in pill.get_text(strip=True) for pill in pills)

    return {"free_cancellation": free_cancellation, "prepayment_needed": prepayment_needed, "breakfast_included": breakfast_included}


In [11]:
def extract_hotel_info(hotel: BeautifulSoup) -> Dict[str, Optional[Union[str, int, float, bool]]]:
    """Run every field extractor on a hotel card and merge the results into one dict.

    Extractors run in the order name, price, rating, review info, neighborhood,
    booking options; if two extractors ever returned the same key, the later
    one would win.
    """
    extractors = (
        extract_name,
        extract_price,
        extract_rating,
        extract_review_info,
        extract_neighborhood,
        extract_booking_options,
    )

    hotel_info = {}
    for extractor in extractors:
        hotel_info.update(extractor(hotel))
    return hotel_info

In [12]:
from bs4 import BeautifulSoup
import time
from selenium import webdriver

def scrape_hotels(driver: webdriver.Chrome, max_hotels: int = 150) -> list:
    """Collect hotel cards from the current results page and the pages after it.

    The page is scrolled until the number of ``li[data-selenium="hotel-item"]``
    cards stops changing between two passes; the cards are then collected and
    the next page is requested. Collection stops once at least ``max_hotels``
    cards have been gathered or when no further page can be opened.

    Args:
        driver: Chrome driver already showing a search-results page.
        max_hotels: Target number of cards (default 150, the previous hard-coded
            value). A whole page is always added, so the result can hold more
            than ``max_hotels`` entries.

    Returns:
        One dict per collected card, as produced by ``extract_hotel_info``.
    """
    hotels = []
    last_hotels = 0

    while len(hotels) < max_hotels:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        hotels_in_page = soup.find_all('li', {'data-selenium': 'hotel-item'})

        if len(hotels_in_page) != last_hotels:
            # The card count changed since the previous pass: scroll again and
            # re-check before treating the page as fully rendered.
            scroll_down(driver)
            last_hotels = len(hotels_in_page)
            continue

        # Card count is stable: take this page and move on.
        hotels.extend(hotels_in_page)
        if not next_page(driver):
            # No further page could be opened. Previously the result was ignored,
            # so the same page was re-collected over and over (duplicates), and a
            # page with zero cards made the loop spin forever.
            break
        last_hotels = 0

    return list(map(extract_hotel_info, hotels))


In [13]:
from datetime import datetime,timedelta

def build_url(time_to_travel: int, length_of_stay: int) -> str:
    """Build the Agoda search URL for a stay starting ``time_to_travel`` days from today.

    Args:
        time_to_travel: Days between today and check-in.
        length_of_stay: Nights between check-in and check-out; also sent as ``los``.

    Returns:
        The search URL with ``checkIn``/``checkOut`` formatted as YYYY-MM-DD.
        All other query parameters are fixed values.
    """
    date_format = '%Y-%m-%d'
    arrival = datetime.today() + timedelta(days=time_to_travel)
    departure = arrival + timedelta(days=length_of_stay)
    check_in = arrival.strftime(date_format)
    check_out = departure.strftime(date_format)

    # Adjacent literals are concatenated into a single query string; only the
    # check-in/check-out dates and the length of stay vary.
    return (
        'https://www.agoda.com/search?city=318&locale=en-us'
        '&ckuid=380978a5-bb4e-4c74-8baa-cd2cb4eb261b&prid=0'
        '&gclid=CjwKCAiArKW-BhAzEiwAZhWsIOVy3SeO0urO2mElpzEs9PxEBpHeM4n0s8s-pAFk-jwUfQcEDggvCxoC0IoQAvD_BwE'
        '&currency=USD&correlationId=310664ad-8b23-4176-8483-6ce5da360773'
        '&analyticsSessionId=-305323959182569045&pageTypeId=103&realLanguageId=1&languageId=1'
        '&origin=IL&stateCode=TA&cid=1922880&tag=3cde7f80-a923-450c-a416-936443e59e6c'
        '&userId=380978a5-bb4e-4c74-8baa-cd2cb4eb261b&whitelabelid=1&loginLvl=0&storefrontId=3'
        '&currencyId=7&currencyCode=USD&htmlLanguage=en-us&cultureInfoName=en-us'
        '&machineName=am-pc-4i-acm-web-user-d8bb4bfd-44gwt&trafficGroupId=5&trafficSubGroupId=122'
        '&aid=82361&useFullPageLogin=true&cttp=4&isRealUser=false&mode=production'
        '&browserFamily=Chrome&cdnDomain=agoda.net'
        f'&checkIn={check_in}&checkOut={check_out}'
        '&rooms=1&adults=2&children=0&priceCur=USD'
        f'&los={length_of_stay}'
        '&textToSearch=New+York+%28NY%29&travellerType=1&familyMode=off'
        '&ds=ZCR2R6MELCVEiFmP&productType=-1'
    )

In [15]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from datetime import datetime
import os
import math
import signal

# Date stamp (YYYY-MM-DD) used in the names of the CSV files written by this run.
SNAPSHOT_DATE = datetime.today().strftime('%Y-%m-%d')

# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to a single core so the division below cannot raise TypeError.
number_of_cores = os.cpu_count() or 1
# One worker thread per five cores, rounded up (so always at least one).
max_workers = math.ceil(number_of_cores / 5)


def scrape_hotels_thread(TTT, LOS):
    """Scrape one (TTT, LOS) combination with its own WebDriver and cache it as CSV.

    If ``agoda/<SNAPSHOT_DATE>_TTT_<TTT>_LOS_<LOS>.csv`` already exists it is
    loaded and returned without starting a browser. Otherwise a fresh headless
    driver is started, the search page is scraped, the three bookkeeping columns
    are added, and the frame is written to that path.

    Args:
        TTT: Time to travel — days from today until check-in.
        LOS: Length of stay in nights.

    Returns:
        The scraped (or cached) DataFrame. On any scraping error the error is
        printed and an empty DataFrame is returned, so one failing combination
        does not abort the others.
    """
    print(f'scraping TTT={TTT} / 30, LOS={LOS} / 5')

    output_path = f'agoda/{SNAPSHOT_DATE}_TTT_{TTT}_LOS_{LOS}.csv'
    if os.path.exists(output_path):
        return pd.read_csv(output_path)

    url = build_url(TTT, LOS)
    driver = None
    try:
        # Driver start-up and navigation happen inside the try block so that a
        # failure here is reported like any other scraping error and the browser
        # is still shut down in ``finally``.
        driver = init_driver(headless=True)
        driver.get(url)
        time.sleep(2)

        hotels = scrape_hotels(driver)
        hotels_df = pd.DataFrame(hotels)
        hotels_df['time_to_travel'] = TTT
        hotels_df['length_of_stay'] = LOS
        hotels_df['snapshot_date'] = SNAPSHOT_DATE

        # Make sure the cache directory exists; otherwise to_csv fails and the
        # freshly scraped data is thrown away.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        hotels_df.to_csv(output_path, index=False)

        return hotels_df

    except Exception as e:
        print(f"Error scraping TTT={TTT}, LOS={LOS}: {e}")
        return pd.DataFrame()  # Return empty DataFrame in case of failure
    finally:
        if driver is not None:
            driver.quit()

# Every (TTT, LOS) pair to scrape: TTT 1..30 crossed with LOS 1..5.
all_combinations = [
    (ttt, los)
    for ttt in range(1, 31)
    for los in range(1, 6)
]

# Fan the combinations out over a thread pool. Each scrape_hotels_thread call
# starts (and quits) its own driver; results come back in submission order.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(executor.map(lambda combo: scrape_hotels_thread(*combo), all_combinations))

# Stack the per-combination frames, drop exact duplicate rows, and write the snapshot.
df = pd.concat(results, ignore_index=True).drop_duplicates()
df.to_csv(f'agoda_snapshot_{SNAPSHOT_DATE}.csv', index=False)

scraping TTT=1 / 30, LOS=1 / 5
scraping TTT=1 / 30, LOS=2 / 5
scraping TTT=1 / 30, LOS=3 / 5
scraping TTT=1 / 30, LOS=4 / 5
scraping TTT=1 / 30, LOS=5 / 5
scraping TTT=2 / 30, LOS=1 / 5
scraping TTT=2 / 30, LOS=2 / 5
scraping TTT=2 / 30, LOS=3 / 5
scraping TTT=2 / 30, LOS=4 / 5
scraping TTT=2 / 30, LOS=5 / 5
scraping TTT=3 / 30, LOS=1 / 5
scraping TTT=3 / 30, LOS=2 / 5
scraping TTT=3 / 30, LOS=3 / 5
scraping TTT=3 / 30, LOS=4 / 5
scraping TTT=3 / 30, LOS=5 / 5
scraping TTT=4 / 30, LOS=1 / 5
scraping TTT=4 / 30, LOS=2 / 5
scraping TTT=4 / 30, LOS=3 / 5
scraping TTT=4 / 30, LOS=4 / 5
scraping TTT=4 / 30, LOS=5 / 5
scraping TTT=5 / 30, LOS=1 / 5
scraping TTT=5 / 30, LOS=2 / 5
scraping TTT=5 / 30, LOS=3 / 5
scraping TTT=5 / 30, LOS=4 / 5
scraping TTT=5 / 30, LOS=5 / 5
scraping TTT=6 / 30, LOS=1 / 5
scraping TTT=6 / 30, LOS=2 / 5
scraping TTT=6 / 30, LOS=3 / 5
scraping TTT=6 / 30, LOS=4 / 5
scraping TTT=6 / 30, LOS=5 / 5
scraping TTT=7 / 30, LOS=1 / 5
scraping TTT=7 / 30, LOS=2 / 5
scraping