In [None]:
# %pip installs into the environment of the running kernel (a bare !pip may target another Python).
# pandas is included because the scraping cell at the end of the notebook imports it.
%pip install selenium beautifulsoup4 webdriver_manager requests lxml pandas

### Build URL

In [None]:
from datetime import datetime,timedelta

def build_url(time_to_travel: int, length_of_stay: int) -> str:
    """Build the Expedia hotel-search URL for a New York stay.

    Args:
        time_to_travel: Number of days from today until check-in.
        length_of_stay: Number of days between check-in and check-out.

    Returns:
        The search URL with ``startDate``/``endDate`` filled in (``YYYY-MM-DD``)
        and one ``&star=`` parameter appended for each value 5, 10, ..., 50.
    """
    date_format = '%Y-%m-%d'

    arrival = datetime.today() + timedelta(days=time_to_travel)
    departure = arrival + timedelta(days=length_of_stay)
    start_date = arrival.strftime(date_format)
    end_date = departure.strftime(date_format)

    # One "&star=N" query fragment per value in steps of five.
    star_filters = ''
    for star_value in range(5, 51, 5):
        star_filters += f'&star={star_value}'

    return (
        'https://www.expedia.com/Hotel-Search?adults=2&children=0&currency=USD'
        '&destination=New%20York%20%28and%20vicinity%29%2C%20New%20York%2C%20United%20States%20of%20America'
        '&isInvalidatedDate=false&latLong=40.75668%2C-73.98647&mapBounds=&pwaDialog='
        '&regionId=178293&semdtl=&siteid=1&sort=RECOMMENDED'
        f'&startDate={start_date}&endDate={end_date}'
        '&theme=&useRewards=false&userIntent='
        f'{star_filters}'
    )

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def init_driver(headless=True):
    """Create a Chrome WebDriver configured to look like a regular browser.

    Args:
        headless: When True, run Chrome in the new headless mode with the
            GPU, sandbox and /dev/shm options disabled.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()

    if headless:
        options.add_argument("--headless=new")  # Ensures modern headless mode
        options.add_argument("--disable-gpu")  # Fixes rendering issues
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

    # Make the browser appear more human-like
    options.add_argument("--window-size=1920,1080")  # Standard screen size
    options.add_argument("--start-maximized")  # Maximize on launch
    options.add_argument("--disable-blink-features=AutomationControlled")  # Prevent detection
    options.add_experimental_option("excludeSwitches", ["enable-automation"])  # Prevent 'bot' flag
    options.add_experimental_option("useAutomationExtension", False)

    # Change User-Agent to a normal browser
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    options.add_argument(f"user-agent={user_agent}")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        # Hide navigator.webdriver on every page the driver loads. A plain
        # execute_script() only patches the document that is open right now
        # (the initial blank page), so the override would be gone after the
        # first driver.get(); registering the script through CDP re-applies it
        # to each new document.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leave a browser process behind if the setup step fails.
        driver.quit()
        raise

    return driver

### Load more hotels

In [None]:
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def load_more(driver: webdriver.Chrome) -> bool:
    """Scroll the results page and click the 'Show more' button.

    Args:
        driver: WebDriver currently showing the hotel search results.

    Returns:
        True if the button was found and clicked; False if anything in the
        scroll/wait/click sequence raised (the error is printed).
    """
    # Jump to the top and then to the bottom of the page via the keyboard.
    page_body = driver.find_element(By.TAG_NAME, 'body')
    page_body.send_keys(Keys.HOME)
    time.sleep(2)
    page_body.send_keys(Keys.END)

    show_more_locator = (By.XPATH, "//button[contains(text(), 'Show more')]")

    try:
        # Scroll further down, then give the page a moment to render.
        driver.execute_script("window.scrollBy(0, document.body.scrollHeight);")
        time.sleep(1)

        # Wait up to 5 seconds for the button to become clickable.
        wait = WebDriverWait(driver, 5)
        show_more_button = wait.until(EC.element_to_be_clickable(show_more_locator))

        # Move to the button first so it is in view when clicked.
        ActionChains(driver).move_to_element(show_more_button).perform()
        show_more_button.click()
    except Exception as e:
        print(f"Error loading more results: {e}")
        return False

    return True

## Extract Hotels info

#### Extractors

In [None]:
from bs4 import BeautifulSoup
import re
from typing import Dict, Optional, Union

def _parse_price(text: str) -> Optional[Union[int, float]]:
    """Return the first numeric amount found in ``text``, or None if there is none.

    Thousands separators are dropped ("$1,234" -> 1234). An amount with a
    decimal point is returned as float, otherwise as int. Only the first
    number is used so that trailing text such as "for 2 nights" cannot be
    fused into the price.
    """
    match = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', text)
    if match is None:
        return None
    digits = match.group(0).replace(',', '')
    return float(digits) if '.' in digits else int(digits)


def extract_price(hotel: BeautifulSoup) -> Dict[str, Optional[Union[int, float]]]:
    """Extract the current and the struck-through price of a hotel card.

    Args:
        hotel: Parsed markup of a single hotel card.

    Returns:
        A dict with keys ``discounted_price`` and ``original_price``.
        ``discounted_price`` comes from the first ``div`` whose class contains
        "uitk-type-500" or "ausitk-type-500"; ``original_price`` comes from the
        first ``<del>`` element and falls back to ``discounted_price`` when no
        original price could be read. Values are None when nothing parseable
        is found.
    """
    discounted_price: Optional[Union[int, float]] = None
    original_price: Optional[Union[int, float]] = None

    # Current price: div carrying one of the two "type-500" classes.
    price_elem = hotel.find("div", class_=lambda x: x and ("uitk-type-500" in x or "ausitk-type-500" in x))
    if price_elem:
        discounted_price = _parse_price(price_elem.get_text(strip=True))

    # Original price: <del> element, if present.
    del_elem = hotel.find("del")
    if del_elem:
        original_price = _parse_price(del_elem.get_text(strip=True))

    # If original price is missing, default it to the discounted price.
    if original_price is None and discounted_price is not None:
        original_price = discounted_price

    return {
        "discounted_price": discounted_price,
        "original_price": original_price
    }


In [None]:
from typing import Dict, Optional, Union

def extract_review_info(hotel: BeautifulSoup) -> Dict[str, Optional[Union[float, int, str]]]:
    """Extract the guest-review score, its label and the review count.

    Args:
        hotel: Parsed markup of a single hotel card.

    Returns:
        A dict with keys ``review_score`` (float), ``review_title`` (str) and
        ``number_of_reviews`` (int). Each value is None when the matching
        element is missing or its text cannot be converted.
    """
    info: Dict[str, Optional[Union[float, int, str]]] = {
        "review_score": None,
        "review_title": None,
        "number_of_reviews": None,
    }

    # Score: first span whose class contains "uitk-badge-base-text",
    # reduced to digits and dots before conversion.
    badge = hotel.find("span", class_=lambda classes: classes and "uitk-badge-base-text" in classes)
    if badge:
        score_text = re.sub(r'[^\d.]', '', badge.get_text(strip=True))
        try:
            info["review_score"] = float(score_text)
        except ValueError:
            pass

    # Title: first span whose string matches one of the known descriptors.
    title_span = hotel.find("span", string=re.compile(r"(Excellent|Very good|Good)", re.IGNORECASE))
    if title_span:
        info["review_title"] = title_span.get_text(strip=True)

    # Count: first span mentioning "reviews" (e.g. "1,415 reviews"), digits only.
    count_span = hotel.find("span", string=re.compile(r"reviews", re.IGNORECASE))
    if count_span:
        count_digits = re.sub(r'\D', '', count_span.get_text(strip=True))
        try:
            info["number_of_reviews"] = int(count_digits)
        except ValueError:
            pass

    return info

In [None]:
from typing import Optional
import re
from bs4 import BeautifulSoup

def extract_rating(hotel: BeautifulSoup) -> Dict[str, Optional[float]]:
    """Extract the star rating of a hotel card.

    Looks for the first ``div`` whose class contains "uitk-rating", then for a
    ``span`` with class "is-visually-hidden" inside it, and takes the first
    number in that span's text.

    Args:
        hotel: Parsed markup of a single hotel card.

    Returns:
        ``{"star_rating": <float>}``, or ``{"star_rating": None}`` when any of
        the lookups comes up empty.
    """
    result: Dict[str, Optional[float]] = {"star_rating": None}

    rating_container = hotel.find("div", class_=lambda classes: classes and "uitk-rating" in classes)
    if not rating_container:
        return result

    hidden_label = rating_container.find("span", class_="is-visually-hidden")
    if not hidden_label:
        return result

    number = re.search(r"(\d+(\.\d+)?)", hidden_label.get_text(strip=True))
    if number:
        try:
            result["star_rating"] = float(number.group(1))
        except ValueError:
            pass

    return result


In [None]:
from typing import Dict, Optional
from bs4 import BeautifulSoup

def extract_neighborhood(hotel: BeautifulSoup) -> Dict[str, Optional[str]]:
    """Extract the neighborhood text of a hotel card.

    Args:
        hotel: Parsed markup of a single hotel card.

    Returns:
        ``{"neighborhood": <text>}`` taken from the first ``div`` whose class
        contains "truncate-lines-2", or ``{"neighborhood": None}`` if there is
        no such element.
    """
    def _is_truncated_block(css_class):
        return css_class and "truncate-lines-2" in css_class

    location_div = hotel.find("div", class_=_is_truncated_block)
    if location_div:
        return {"neighborhood": location_div.get_text(strip=True)}
    return {"neighborhood": None}


In [None]:
from typing import Dict, Optional
import re
from bs4 import BeautifulSoup

def extract_booking_options(hotel: BeautifulSoup) -> Dict[str, Optional[bool]]:
    """Detect breakfast, cancellation and prepayment terms on a hotel card.

    Args:
        hotel: Parsed markup of a single hotel card.

    Returns:
        A dict with:
        - ``breakfast_included``: True if a ``div``/``span`` has text containing
          both "breakfast" and "included" (case-insensitive).
        - ``free_cancellation``: True if some string matches "fully refundable".
        - ``prepayment_needed``: False if "reserve now, pay later" is found,
          True if "prepayment required"/"prepayment needed" is found,
          otherwise None.
    """
    def _offers_breakfast(tag) -> bool:
        if tag.name not in ['div', 'span']:
            return False
        text = tag.get_text(strip=True).lower()
        return "breakfast" in text and "included" in text

    refundable_pattern = re.compile(r"fully refundable", re.IGNORECASE)
    pay_later_pattern = re.compile(r"reserve now, pay later", re.IGNORECASE)
    prepay_pattern = re.compile(r"prepayment (required|needed)", re.IGNORECASE)

    has_breakfast = bool(hotel.find(_offers_breakfast))
    is_refundable = bool(hotel.find(string=refundable_pattern))

    # "Pay later" wins over an explicit prepayment notice; neither -> unknown.
    needs_prepayment: Optional[bool]
    if hotel.find(string=pay_later_pattern):
        needs_prepayment = False
    elif hotel.find(string=prepay_pattern):
        needs_prepayment = True
    else:
        needs_prepayment = None

    return {
        "breakfast_included": has_breakfast,
        "free_cancellation": is_refundable,
        "prepayment_needed": needs_prepayment,
    }


In [None]:
from typing import Dict, Optional
from bs4 import BeautifulSoup

def extract_name(hotel: BeautifulSoup) -> Dict[str, Optional[str]]:
    """Extract the hotel name from a hotel card.

    Args:
        hotel: Parsed markup of a single hotel card.

    Returns:
        ``{"name": <text>}`` taken from the first ``h3`` whose class contains
        "uitk-heading", or ``{"name": None}`` if there is no such heading.
    """
    def _is_heading(css_class):
        return css_class and "uitk-heading" in css_class

    heading = hotel.find("h3", class_=_is_heading)
    if heading:
        return {"name": heading.get_text(strip=True)}
    return {"name": None}


#### Aggregator

In [None]:
def extract_hotel_info(hotel: BeautifulSoup) -> Dict[str, Optional[Union[str, int, float, bool]]]:
    """Run every extractor on a hotel card and merge the results.

    Args:
        hotel: Parsed markup of a single hotel card.

    Returns:
        One flat dict containing the keys of all extractors, in the order the
        extractors are applied. If two extractors return the same key, the
        later one wins.
    """
    merged: Dict[str, Optional[Union[str, int, float, bool]]] = {}
    for extract in (
        extract_name,
        extract_price,
        extract_rating,
        extract_review_info,
        extract_neighborhood,
        extract_booking_options,
    ):
        merged.update(extract(hotel))
    return merged

# Scrape hotels info

In [None]:
from bs4 import BeautifulSoup
import time
from selenium import webdriver

def scrape_hotels(driver: webdriver.Chrome, min_hotels: int = 100, max_stalled_rounds: int = 3) -> list:
    """Load hotel cards from the current results page and extract their info.

    Repeatedly parses the page and asks for more results until at least
    ``min_hotels`` cards are present. Stops early when the number of cards has
    not grown for ``max_stalled_rounds`` consecutive rounds, so a search with
    fewer available results (or a missing 'Show more' button) cannot loop
    forever.

    Args:
        driver: WebDriver already navigated to the search results page.
        min_hotels: Number of hotel cards to aim for before stopping.
        max_stalled_rounds: Consecutive rounds without new cards after which
            scraping gives up and returns what it has.

    Returns:
        A list with one info dict per hotel card found (see
        ``extract_hotel_info``); may be shorter than ``min_hotels``.
    """
    hotels = []
    previous_count = -1
    stalled_rounds = 0

    while True:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        hotels = soup.find_all('div', {'data-stid': 'lodging-card-responsive'})
        print(len(hotels))

        if len(hotels) >= min_hotels:
            break

        # Track whether the last load attempt produced any new cards.
        if len(hotels) > previous_count:
            stalled_rounds = 0
        else:
            stalled_rounds += 1
            if stalled_rounds >= max_stalled_rounds:
                print(f"Stopping at {len(hotels)} hotels: no new results after {max_stalled_rounds} attempts")
                break
        previous_count = len(hotels)

        time.sleep(2)
        load_more(driver=driver)
        time.sleep(2)

    return [extract_hotel_info(hotel) for hotel in hotels]


In [None]:
# Preview the generated search URL: check-in one day from today, checkout three days after that.
build_url(time_to_travel=1, length_of_stay=3)

In [None]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from datetime import datetime
import os
import math

# Date stamp (YYYY-MM-DD) attached to every scraped row and used in the output file name.
SNAPSHOT_DATE = datetime.today().strftime('%Y-%m-%d')

# os.cpu_count() returns None when the core count cannot be determined;
# fall back to a single core so the worker computation below cannot fail.
number_of_cores = os.cpu_count() or 1
# Use a quarter of the cores (rounded up), i.e. at least one worker.
max_workers = math.ceil(number_of_cores/4)


def scrape_hotels_thread(TTT, LOS):
    """Scrape one (time-to-travel, length-of-stay) combination.

    Each call starts its own headless WebDriver, loads the search URL for the
    given combination, scrapes the hotel cards and always quits the driver
    afterwards.

    Args:
        TTT: Days from today until check-in.
        LOS: Length of stay in days.

    Returns:
        A DataFrame with one row per scraped hotel plus the columns
        ``time_to_travel``, ``length_of_stay`` and ``snapshot_date``; an empty
        DataFrame if anything fails (the error is printed).
    """
    print(f'scraping TTT={TTT} / 30, LOS={LOS} / 5')
    # Bind the name before the try block: if building the URL or starting the
    # driver raises, the finally clause must still be able to inspect it.
    driver = None
    try:
        url = build_url(TTT, LOS)

        driver = init_driver(headless=True)
        driver.get(url)
        time.sleep(2)

        hotels = scrape_hotels(driver)
        hotels_df = pd.DataFrame(hotels)
        hotels_df['time_to_travel'] = TTT
        hotels_df['length_of_stay'] = LOS
        hotels_df['snapshot_date'] = SNAPSHOT_DATE

        return hotels_df

    except Exception as e:
        print(f"Error scraping TTT={TTT}, LOS={LOS}: {e}")
        return pd.DataFrame()  # Return empty DataFrame in case of failure
    finally:
        if driver is not None:
            driver.quit()

# Every (time-to-travel, length-of-stay) pair: TTT 1..30 crossed with LOS 1..5.
all_combinations = [
    (days_ahead, nights)
    for days_ahead in range(1, 31)
    for nights in range(1, 6)
]

# Scrape the combinations on a thread pool; every task starts and quits its
# own WebDriver. Results come back in the order of all_combinations.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(executor.map(scrape_hotels_thread, *zip(*all_combinations)))
    # Stack the per-combination frames and write a single CSV for this snapshot.
    df = pd.concat(results, ignore_index=True)
    df.to_csv(f'expedia_snapshot_{SNAPSHOT_DATE}.csv', index=False)