In [11]:
# Core imports
import pandas as pd
import numpy as np
import requests
import time
import random
import re
from datetime import datetime, timedelta

# Selenium imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import (
    TimeoutException, 
    NoSuchElementException,
    StaleElementReferenceException
)

# For automatic chromedriver management
from webdriver_manager.chrome import ChromeDriverManager

# For parsing HTML
from bs4 import BeautifulSoup

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ All imports successful!")

‚úÖ All imports successful!


In [12]:
# Study design for Group 1: the event under study and the two cities compared.
EVENT_NAME = "Las Fallas"
TREATMENT_CITY = "Valencia"
CONTROL_CITY = "Alicante"

# Check-in dates. The treatment check-in is the event week; the control
# check-ins fall two and one weeks before it, then one and two weeks after.
TREATMENT_CHECKIN = datetime(2026, 3, 14)
CONTROL_CHECKINS = [
    TREATMENT_CHECKIN + timedelta(weeks=week_offset)
    for week_offset in (-2, -1, 1, 2)
]

# Search parameters held fixed for every Booking.com query.
NIGHTS, ADULTS, ROOMS = 7, 2, 1
CURRENCY = "EUR"

In [13]:
print("üìã GROUP 1 SCRAPING PLAN")
print("="*50)
print(f"Event: {EVENT_NAME}")
print(f"Treatment City: {TREATMENT_CITY}")
print(f"Control City: {CONTROL_CITY}")
print(f"\nDates to scrape:")
print(f"  Treatment period: {TREATMENT_CHECKIN.strftime('%Y-%m-%d')} (EVENT WEEK)")
for date in CONTROL_CHECKINS:
    print(f"  Control period:   {date.strftime('%Y-%m-%d')}")
print(f"\nSearch parameters: {NIGHTS} nights, {ADULTS} adults, {ROOMS} room")
print("="*50)

üìã GROUP 1 SCRAPING PLAN
Event: Las Fallas
Treatment City: Valencia
Control City: Alicante

Dates to scrape:
  Treatment period: 2026-03-14 (EVENT WEEK)
  Control period:   2026-02-28
  Control period:   2026-03-07
  Control period:   2026-03-21
  Control period:   2026-03-28

Search parameters: 7 nights, 2 adults, 1 room


In [14]:
# ‚úÖ MUST INCLUDE: Function to generate Booking.com URLs

def generate_booking_url(city, checkin_date, nights=7, adults=2, rooms=1):
    """
    Generate a Booking.com search URL with specific parameters.

    The query string is built with ``urllib.parse.urlencode`` so that city
    names containing spaces, accented letters or reserved characters such as
    ``&`` and ``=`` are percent-encoded instead of corrupting the URL. For
    plain names such as "Valencia" the result is identical to simple string
    concatenation.

    Args:
        city (str): City name (e.g., "Barcelona", "Palma de Mallorca")
        checkin_date (datetime): Check-in date
        nights (int): Number of nights to stay
        adults (int): Number of adults
        rooms (int): Number of rooms

    Returns:
        str: Full Booking.com search URL
    """
    # Imported locally so the function works without touching the import cell.
    from urllib.parse import quote, urlencode

    # Check-out is derived from the check-in date and the length of stay.
    checkout_date = checkin_date + timedelta(days=nights)

    # Insertion order is preserved, so parameters appear in this order.
    query = {
        "ss": city,
        "checkin": checkin_date.strftime("%Y-%m-%d"),
        "checkout": checkout_date.strftime("%Y-%m-%d"),
        "group_adults": adults,
        "no_rooms": rooms,
        "group_children": 0,
    }

    base_url = "https://www.booking.com/searchresults.html"
    # quote_via=quote encodes spaces as %20 (urlencode's default would use '+').
    return base_url + "?" + urlencode(query, quote_via=quote)

# Smoke test: build the search URL for a Valencia check-in and display it.
test_url = generate_booking_url(city="Valencia", checkin_date=datetime(2026, 3, 14))
print(test_url)

https://www.booking.com/searchresults.html?ss=Valencia&checkin=2026-03-14&checkout=2026-03-21&group_adults=2&no_rooms=1&group_children=0


Booking.com URLs encode search parameters:

- ss=Barcelona → Search string (city name)
- checkin=2026-06-02 → Check-in date
- checkout=2026-06-09 → Check-out date (check-in date plus the number of nights)
- group_adults=2 → Number of adults

The Hook: URLs are like function calls to websites! The ? starts parameters, & separates them.

URL encoding: for cities with spaces or accented letters (e.g., "San Sebastián"), use urllib.parse.quote():

```python
from urllib.parse import quote

city_encoded = quote("San Sebastián")  # → "San%20Sebasti%C3%A1n"
```

In [15]:
def create_driver(headless=False, page_load_timeout=30, implicit_wait=10):
    """
    Create a configured Chrome WebDriver with basic anti-detection settings.

    Args:
        headless (bool): Run Chrome without a visible window. In headless
            mode the window size is fixed at 1920x1080; otherwise the window
            is maximized after start-up.
        page_load_timeout (int | float): Seconds passed to
            ``driver.set_page_load_timeout``.
        implicit_wait (int | float): Seconds passed to
            ``driver.implicitly_wait``. NOTE(review): the Selenium docs advise
            against mixing implicit and explicit waits (other cells use
            ``WebDriverWait``); pass 0 here to rely on explicit waits only.

    Returns:
        selenium.webdriver.Chrome: The ready-to-use driver. The caller is
        responsible for calling ``driver.quit()``.

    Raises:
        Exception: Whatever driver creation or setup raises. If setup fails
        after the browser has started, the browser is shut down first so no
        Chrome process is left behind.
    """
    options = Options()

    if headless:
        options.add_argument("--headless=new")
        options.add_argument("--window-size=1920,1080")  # Set size in headless mode

    # General Chrome settings
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--incognito")

    # Anti-detection: hide the automation switches and present a regular
    # desktop Chrome user-agent string.
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # From here on a browser process exists; if any setup step fails, quit it
    # before propagating the error instead of leaking the process.
    try:
        # Only maximize if not headless (headless uses the fixed window size)
        if not headless:
            driver.maximize_window()

        # Timeouts
        driver.set_page_load_timeout(page_load_timeout)
        driver.implicitly_wait(implicit_wait)

        # Extra stealth: make navigator.webdriver read as undefined on every
        # new document, before the page's own scripts run.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        """
        })
    except Exception:
        driver.quit()
        raise

    return driver

print("‚úÖ Driver is set up!")

‚úÖ Driver is set up!


1. headless:
   - headless=False as "training mode" (you can see what's happening)
   - headless=True as "production mode" (faster, deployed)
2. disable-blink-features=AutomationControlled
   - The Problem:
   - When Selenium controls Chrome, it sets a JavaScript property:
   - `navigator.webdriver === true  // "I'm a bot!"` --> Websites like Booking.com check this and can block you!
   - The Solution:
   - `options.add_argument("--disable-blink-features=AutomationControlled")` --> This tells Chrome not to advertise that it is automated. The script injected with `execute_cdp_cmd` in `create_driver` then overrides the property, so now:
   - `navigator.webdriver === undefined  // "I'm a normal browser!"`
   - The Hook: Imagine wearing a badge that says "I'M A ROBOT" everywhere. This removes the badge!
3. Custom User-Agent:
   - The Problem:
   - In headless mode, Chrome can send a user-agent like:
```
Mozilla/5.0 ... HeadlessChrome/120.0.0.0 Safari/537.36
                ↑ "HeadlessChrome" = BOT FLAG!
```
   - The solution:
  
```
options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
```
   - This makes you look like a normal Windows user with regular Chrome.
   - The Hook: It's like changing your caller ID from "SPAM LIKELY" to "John Smith"!

In [16]:
def handle_popups(driver, timeout=10, signin_timeout=None):
    """
    Handle cookie popup and sign-in modal that Booking.com shows.

    Both steps are best-effort: a missing popup or an unexpected error is
    reported with a printed message and never raised to the caller.

    Args:
        driver: Selenium WebDriver instance
        timeout (int): Max seconds to wait for the cookie banner
        signin_timeout (int | None): Max seconds to wait for the sign-in
            modal. Defaults to ``min(timeout, 5)`` so a short ``timeout``
            also shortens the sign-in wait.

    Returns:
        dict: ``{'cookies': bool, 'signin': bool}`` - True for each popup
        that was found and clicked away.

    Note:
        If the driver has an implicit wait configured, each element lookup
        can block for that long, so the real waiting time may exceed the
        timeouts given here.
    """
    status = {'cookies': False, 'signin': False}

    # ============================================================
    # 1. Handle Cookie Consent
    # ============================================================
    try:
        cookie_button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        cookie_button.click()
        time.sleep(1)  # give the banner a moment to disappear
        print("   ‚úÖ Cookies accepted")
        status['cookies'] = True
    except TimeoutException:
        print("   ‚ÑπÔ∏è  No cookie popup detected")
    except Exception as e:
        print(f"   ‚ö†Ô∏è  Cookie handling error: {e}")

    # ============================================================
    # 2. Handle Sign-in Modal (Genius popup)
    # ============================================================
    if signin_timeout is None:
        signin_timeout = min(timeout, 5)

    # Known variants of the modal's close button.
    signin_selectors = [
        "button[aria-label='Dismiss sign in information.']",
        "button[aria-label='Dismiss sign-in info.']",
        "button.bui-modal__close",
        "button[data-testid='genius-onboarding-modal-close-button']"
    ]
    # A CSS selector list matches any of the variants in a single query, so
    # all variants share one wait instead of being tried one after another.
    any_close_button = ", ".join(signin_selectors)

    def _visible_close_button(drv):
        """Return the first displayed and enabled close button, else False."""
        for candidate in drv.find_elements(By.CSS_SELECTOR, any_close_button):
            if candidate.is_displayed() and candidate.is_enabled():
                return candidate
        return False

    try:
        close_button = WebDriverWait(
            driver,
            signin_timeout,
            # The modal may re-render while being polled; retry instead of failing.
            ignored_exceptions=(StaleElementReferenceException,),
        ).until(_visible_close_button)
        close_button.click()
        time.sleep(1)  # give the modal a moment to close
        print("   ‚úÖ Sign-in popup dismissed")
        status['signin'] = True
    except TimeoutException:
        print("   ‚ÑπÔ∏è  No sign-in popup detected")
    except Exception as e:
        print(f"   ‚ö†Ô∏è  Sign-in handling error: {e}")

    return status

Key Concept: WebDriverWait vs time.sleep

- ❌ BAD: A fixed sleep is always either too short or too long. What if the popup takes 7 seconds to appear?
```
time.sleep(5)  # Too short? Script fails!
time.sleep(10) # Wastes 5 seconds if popup appears in 2s
```
- ✅ GOOD: Waits only as long as needed (max 10s)
```
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "button"))
)
```
The Hook: WebDriverWait is like a smart alarm clock - it goes off when the thing happens OR after 10 seconds, whichever comes first!

Why Multiple Selectors?
```
signin_selectors = [
    "button[aria-label='Dismiss sign in information.']",  # Current version
    "button[aria-label='Dismiss sign-in info.']",          # Variant
    "button.bui-modal__close",                             # Generic modal close
]
```
Booking.com might show different modal versions:

- Different languages (English, Spanish)
- A/B testing (they show different versions to different users)
- Updates to their codebase

Having fallbacks makes your scraper robust!

In [17]:
def test_basic_scraping():
    """
    Test that we can access Booking.com and handle all popups.

    Opens a visible browser on the treatment-city search page, dismisses the
    popups, saves a screenshot to 'test_booking_page.png' and reports how many
    hotel cards were found, plus the name and price of the first one.

    The success banner is printed only when at least one hotel card was
    found. ``find_elements`` returns an empty list rather than raising when
    nothing matches, so the empty case is detected by checking the list.

    The function waits for Enter on stdin before closing the browser so the
    page can be inspected manually; the browser is always closed afterwards.

    Returns:
        None
    """
    print("üß™ Testing basic setup...\n")

    # create_driver already maximizes the window when not headless.
    driver = create_driver(headless=False)

    try:
        url = generate_booking_url(TREATMENT_CITY, TREATMENT_CHECKIN, NIGHTS, ADULTS, ROOMS)
        print(f"üîó Generated URL:\n{url}\n")

        print("üåê Loading Booking.com...")
        driver.get(url)

        # Handle popups (cookies + sign-in)
        print("üç™ Handling popups...")
        handle_popups(driver)

        # Give the results a moment to finish rendering
        time.sleep(3)

        # Take a screenshot
        driver.save_screenshot("test_booking_page.png")
        print("\nüì∏ Screenshot saved as 'test_booking_page.png'")

        # Try to find hotel cards
        print("\nüè® Looking for hotel cards...")
        hotel_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='property-card']")

        if hotel_cards:
            print(f"   ‚úÖ Found {len(hotel_cards)} hotels!")

            first_card = hotel_cards[0]
            try:
                name_elem = first_card.find_element(By.CSS_SELECTOR, "div[data-testid='title']")
                print(f"   First hotel: {name_elem.text}")
            except Exception:
                # Best-effort: a missing title should not abort the test.
                print("   ‚ö†Ô∏è  Could not extract hotel details")
            else:
                # The price is only looked up once the name was readable.
                try:
                    price_elem = first_card.find_element(By.CSS_SELECTOR, "span[data-testid='price-and-discounted-price']")
                    print(f"   First hotel price: {price_elem.text}")
                except Exception:
                    print("   ‚ö†Ô∏è  Could not extract price")
        else:
            print("   ‚ùå No hotel cards found!")
            print("   Possible issues:")
            print("      1. Page still loading")
            print("      2. Booking.com changed HTML structure")
            print("      3. Bot detection / CAPTCHA")

        print("\n" + "="*60)
        if hotel_cards:
            print("‚úÖ TEST SUCCESSFUL!")
            print("   If you see hotels in the browser and screenshot,")
            print("   you're ready to run the full scraper!")
        else:
            print("‚ùå TEST FAILED: no hotel cards were found.")
            print("   Check the browser and screenshot before running the full scraper.")
        print("="*60)

        # Keep browser open for manual inspection
        input("\n‚è∏Ô∏è  Press Enter to close the browser...")

    finally:
        driver.quit()
        print("\nüîí Browser closed. Test complete!")

# Run the test
test_basic_scraping()

üß™ Testing basic setup...

üîó Generated URL:
https://www.booking.com/searchresults.html?ss=Valencia&checkin=2026-03-14&checkout=2026-03-21&group_adults=2&no_rooms=1&group_children=0

üåê Loading Booking.com...
üç™ Handling popups...
   ‚úÖ Cookies accepted
   ‚úÖ Sign-in popup dismissed

üì∏ Screenshot saved as 'test_booking_page.png'

üè® Looking for hotel cards...
   ‚úÖ Found 25 hotels!
   First hotel: Apartamento R√≠o Turia Jirafa
   First hotel price: ‚Ç¨ 1,531

‚úÖ TEST SUCCESSFUL!
   If you see hotels in the browser and screenshot,
   you're ready to run the full scraper!

üîí Browser closed. Test complete!


In [18]:
def _parse_price(price_text):
    """
    Convert a displayed price such as '€ 1,531' into a float.

    A '.' or ',' followed by exactly three digits is treated as a thousands
    separator and dropped, so '1,531' and '1.531' both give 1531.0. Any comma
    left after that is treated as a decimal comma ('1.531,50' -> 1531.5).

    Returns None when the text contains no digits. Raises ValueError when the
    remaining text is not a valid number.
    """
    digits = re.sub(r'[^\d.,]', '', price_text)
    # Remove grouping separators (followed by exactly three digits).
    digits = re.sub(r'[.,](?=\d{3}(?:\D|$))', '', digits)
    # Whatever comma remains is a decimal comma.
    digits = digits.replace(',', '.')
    if not digits:
        return None
    return float(digits)


def scrape_hotels_from_search(driver, city, checkin_date, treat_city, treat_period):
    """
    Scrape hotel data from a Booking.com search results page.

    Loads the search page for ``city`` and ``checkin_date`` (using the
    notebook-level NIGHTS, ADULTS and ROOMS), dismisses popups, scrolls half
    way down the page and reads every property card currently present.

    Args:
        driver: Selenium WebDriver instance
        city (str): City to search for
        checkin_date (datetime): Check-in date of the search
        treat_city: Value stored unchanged under 'treatCity' in every record
        treat_period: Value stored unchanged under 'treatPeriod' in every record

    Returns:
        list[dict]: One record per hotel card with a readable name, holding
        the keys 'city', 'hotel', 'date', 'price', 'treatCity',
        'treatPeriod', 'hotel_url' and 'text' (always None here). 'price' is
        None when no price was found or it could not be parsed. An empty
        list is returned, and a screenshot named
        'error_<city>_<YYYYMMDD>.png' is saved, when no cards are found.
    """
    url = generate_booking_url(city, checkin_date, NIGHTS, ADULTS, ROOMS)
    print(f"\nüîç Scraping {city} for check-in {checkin_date.strftime('%Y-%m-%d')}")

    driver.get(url)

    # Handle popups (cookies + sign-in)
    handle_popups(driver, timeout=5)

    # Wait a randomized interval for results to load
    wait_time = random.uniform(3, 6)
    print(f"   ‚è±Ô∏è  Waiting {wait_time:.1f}s for page to load...")
    time.sleep(wait_time)

    # Scroll to trigger lazy-loading
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(2)

    # find_elements returns an empty list (it does not raise) when nothing
    # matches, so "no results" is detected by checking the list itself.
    hotel_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='property-card']")
    print(f"   üìä Found {len(hotel_cards)} hotel cards")

    if not hotel_cards:
        print("   ‚ö†Ô∏è  WARNING: No hotels found!")
        driver.save_screenshot(f"error_{city}_{checkin_date.strftime('%Y%m%d')}.png")
        return []

    hotels_data = []

    # Extract data from each hotel card
    for idx, card in enumerate(hotel_cards):
        try:
            # Extract hotel name; a card without a name is skipped entirely
            try:
                name_elem = card.find_element(By.CSS_SELECTOR, "div[data-testid='title']")
                hotel_name = name_elem.text.strip()
            except NoSuchElementException:
                print(f"   ‚ö†Ô∏è  Hotel {idx+1}: Could not find name")
                continue

            # Extract price; a missing or unparseable price is kept as None
            price = None
            try:
                price_elem = card.find_element(By.CSS_SELECTOR, "span[data-testid='price-and-discounted-price']")
                price_text = price_elem.text
                price = _parse_price(price_text)
            except NoSuchElementException:
                pass
            except ValueError:
                print(f"   ‚ö†Ô∏è  Hotel {idx+1}: Could not parse price '{price_text}'")

            # Get hotel URL from the first link inside the card
            try:
                link_elem = card.find_element(By.TAG_NAME, "a")
                hotel_url = link_elem.get_attribute("href")
            except NoSuchElementException:
                print(f"   ‚ö†Ô∏è  Hotel {idx+1}: Could not find URL")
                hotel_url = None

            # Store the data
            hotels_data.append({
                'city': city,
                'hotel': hotel_name,
                'date': checkin_date,
                'price': price,
                'treatCity': treat_city,
                'treatPeriod': treat_period,
                'hotel_url': hotel_url,
                'text': None
            })

            if (idx + 1) % 10 == 0:
                print(f"   ... processed {idx + 1}/{len(hotel_cards)} hotels")

        except Exception as e:
            # Best-effort: one broken card must not abort the whole page.
            print(f"   ‚ö†Ô∏è  Hotel {idx+1}: Unexpected error - {e}")
            continue

    prices_found = sum(1 for h in hotels_data if h['price'] is not None)
    print(f"   ‚úÖ Extracted {len(hotels_data)} hotels ({prices_found} with prices)")

    return hotels_data

In [19]:
def scrape_hotel_description(driver, hotel_url, hotel_name):
    """
    Open a hotel's page and return its description text.

    Args:
        driver: Selenium WebDriver instance
        hotel_url (str | None): Address of the hotel page
        hotel_name (str): Hotel name, used only in the printed messages

    Returns:
        str: The stripped text of the first candidate element holding more
        than 50 characters. An empty string is returned when ``hotel_url``
        is falsy, when no candidate qualifies, or when loading the page
        raises (a message is printed in the last two cases).
    """
    # Nothing to visit without a URL.
    if not hotel_url:
        return ""

    # Candidate locations of the description, tried in this order.
    candidate_selectors = (
        "div[data-testid='property-description']",
        "div#property_description_content",
        "div.hp_desc_main_content",
        "p[data-testid='property-description-text']",
    )

    try:
        driver.get(hotel_url)
        time.sleep(random.uniform(2, 4))

        # The hotel page can show the same popups as the search page.
        handle_popups(driver, timeout=3)

        for css in candidate_selectors:
            try:
                text = driver.find_element(By.CSS_SELECTOR, css).text.strip()
            except NoSuchElementException:
                continue

            # Short snippets are ignored; keep looking for a longer text.
            if len(text) > 50:
                return text

        print(f"      ‚ö†Ô∏è  No description found for {hotel_name}")
        return ""

    except TimeoutException:
        print(f"      ‚ö†Ô∏è  Timeout loading {hotel_name}")
        return ""
    except Exception as e:
        print(f"      ‚ö†Ô∏è  Error for {hotel_name}: {e}")
        return ""