# Scraping Booking for Primavera Sound

In [135]:
# Core imports
import pandas as pd
import numpy as np
import requests
import time
import random
import re
from datetime import datetime, timedelta

# Selenium imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import (
    TimeoutException, 
    NoSuchElementException,
    StaleElementReferenceException
)

# For automatic chromedriver management
from webdriver_manager.chrome import ChromeDriverManager

# For parsing HTML
from bs4 import BeautifulSoup

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# import datetime
from datetime import datetime

import pandas as pd

## Open Booking.com and dismiss the start-up pop-ups

In [2]:
def _click_if_present(driver, locator, timeout):
    """Click the element at ``locator`` if it becomes clickable in time.

    Returns True when the element was clicked, and False when it never became
    clickable within ``timeout`` seconds (i.e. the pop-up did not show up).
    """
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable(locator)
        )
    except TimeoutException:
        return False
    element.click()
    return True


def open_website(website_url, headless_bool=False, popup_timeout=10):
    """Start Chrome, load ``website_url`` and dismiss the start-up pop-ups.

    Args:
        website_url: URL to open.
        headless_bool: When True, Chrome is started with ``--headless=new``;
            otherwise a visible browser window is used.
        popup_timeout: Maximum number of seconds to wait for each pop-up
            (cookie banner, sign-in prompt) before assuming it will not
            appear.

    Returns:
        The Chrome WebDriver, with the page loaded in a 1400x900 window.

    Raises:
        Exception: Any error raised while loading the page or handling the
            pop-ups is re-raised after the browser has been closed, so a
            failed start does not leave an orphaned Chrome process.
    """
    options = Options()
    if headless_bool:
        options.add_argument("--headless=new")
    for flag in ("--no-sandbox", "--disable-dev-shm-usage", "--incognito"):
        options.add_argument(flag)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    print("✅ Browser opened!")

    try:
        driver.get(website_url)
        print("✅ Navigated to Booking.com search results")
        print(f"Page title: {driver.title}")

        driver.set_window_size(1400, 900)

        # The cookie banner is not guaranteed to be shown (or to be rendered
        # immediately), so it is accepted on a best-effort basis.
        cookie_button = (By.ID, "onetrust-accept-btn-handler")
        if _click_if_present(driver, cookie_button, popup_timeout):
            try:
                # Let the banner go away before touching anything else, so it
                # cannot intercept the next click.
                WebDriverWait(driver, popup_timeout).until(
                    EC.invisibility_of_element_located(cookie_button)
                )
            except TimeoutException:
                # Banner still visible: carry on, the next step may still work.
                pass

        # The sign-in prompt is optional as well; its absence is not an error.
        _click_if_present(
            driver,
            (By.CSS_SELECTOR, "button[aria-label='Dismiss sign-in info.']"),
            popup_timeout,
        )
    except Exception:
        driver.quit()
        raise

    return driver


## Search the chosen city and dates

In [13]:
def search(driver, city, start_date, end_date,
           suggestion_xy=(250, 420), show_cursor_coords=False):
    """Fill in the Booking.com search form and submit it.

    Args:
        driver: WebDriver positioned on the page containing the search form.
        city: Text typed into the destination field.
        start_date: Check-in date as ``YYYY-MM-DD``.
        end_date: Check-out date as ``YYYY-MM-DD``; must be after
            ``start_date``.
        suggestion_xy: Viewport ``(x, y)`` coordinates that are clicked to
            pick the destination suggestion. The default presumably matches
            the 1400x900 window set up by ``open_website`` -- verify if the
            window size or page layout changes.
        show_cursor_coords: When True, inject a small overlay that displays
            the mouse coordinates (a debugging aid for choosing
            ``suggestion_xy``). Off by default.

    Returns:
        The same ``driver``, after the search has been submitted.

    Raises:
        ValueError: If a date is not in ``YYYY-MM-DD`` format or ``end_date``
            is not after ``start_date``.
        NoSuchElementException: If nothing is found at ``suggestion_xy`` or no
            visible "Search" label can be clicked.
        TimeoutException: If the calendar controls or date cells do not become
            clickable within 10 seconds.
    """
    # Validate the dates before touching the browser so bad input fails fast.
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    if end <= start:
        raise ValueError("end_date must be after start_date")

    wait = WebDriverWait(driver, 10)

    # type in selected city
    city_input = driver.find_element(By.NAME, "ss")
    city_input.send_keys(city)
    time.sleep(2)

    if show_cursor_coords:
        js = """
    document.addEventListener("mousemove", e => {
        const box = document.getElementById("__coords__") || (() => {
            const d = document.createElement("div");
            d.id = "__coords__";
            d.style.cssText = `
                position:fixed;top:0;left:0;z-index:999999;
                background:black;color:lime;
                font:12px monospace;padding:4px;
                pointer-events:none;
            `;
            document.body.appendChild(d);
            return d;
        })();

        box.textContent = `x:${e.clientX} y:${e.clientY}`;
    });
    """
        driver.execute_script(js)

    # select the city: click whatever element sits at the given coordinates
    x, y = suggestion_xy
    clicked = driver.execute_script(
        "const el = document.elementFromPoint(arguments[0], arguments[1]);"
        "if (el) { el.click(); }"
        "return el !== null;",
        x,
        y,
    )
    if not clicked:
        raise NoSuchElementException(
            f"No element found at viewport coordinates ({x}, {y})"
        )
    time.sleep(2)

    # scroll to necessary month
    next_month = (By.CSS_SELECTOR, "button[aria-label='Next month']")
    # Waiting once up front also confirms that the calendar is open.
    wait.until(EC.element_to_be_clickable(next_month))

    today = datetime.today()
    months = (start.year - today.year) * 12 + (start.month - today.month)

    for _ in range(months):
        time.sleep(1)
        # Look the button up again on every step: a reference cached before
        # the calendar changes month may have gone stale.
        wait.until(EC.element_to_be_clickable(next_month)).click()

    # select dates
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, f"span[data-date='{start_date}']"))
    ).click()

    time.sleep(2)

    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, f"span[data-date='{end_date}']"))
    ).click()

    # click search: let the browser pre-select the candidate spans instead of
    # reading the text of every <span> on the page over WebDriver.
    candidates = driver.find_elements(
        By.XPATH,
        "//span[translate(normalize-space(.), 'SEARCH', 'search') = 'search']",
    )
    search_label = next(
        (span for span in candidates if span.text.strip().lower() == "search"),
        None,
    )
    if search_label is None:
        raise NoSuchElementException('No visible "Search" label found on the page')
    search_label.click()

    print("✅ Search Complete")
    return driver

## Get the full list of listings

In [None]:
def _optional_text(card, css_selector):
    """Return the text of the first ``css_selector`` match inside ``card``.

    Returns None when the element is missing or the card has gone stale.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, css_selector).text
    except (NoSuchElementException, StaleElementReferenceException):
        return None


def collect_all_hotels(driver, max_stagnation=20, scroll_step=900, scroll_pause=6):
    """Scroll through the search results and collect every property card.

    The page is scrolled step by step, the "Load more results" button is
    clicked whenever it can be found, and each property card is recorded once,
    keyed by its link. Scrolling stops after ``max_stagnation`` consecutive
    passes that add no new hotel.

    Args:
        driver: WebDriver showing the search results page.
        max_stagnation: Number of consecutive passes without new hotels after
            which collection stops.
        scroll_step: Pixels scrolled per pass.
        scroll_pause: Seconds to wait after each scroll.

    Returns:
        A ``(driver, df)`` tuple, where ``df`` is a DataFrame with the columns
        ``link``, ``hotel`` and ``price`` (one row per unique link). The
        columns are present even when no hotel was found.
    """
    # Imported locally: the base Selenium exception is only needed here.
    from selenium.common.exceptions import WebDriverException

    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(2)

    hotel_data = {}  # keyed by link so each property is stored only once
    last_count = 0
    stagnation = 0

    print("Starting scrolling ...")

    while True:
        # Scroll
        driver.execute_script(f"window.scrollBy(0, {scroll_step});")
        time.sleep(scroll_pause)

        # Click "Load more" (best effort: the button is usually absent, and a
        # failed click just means we try again on the next pass).
        try:
            btn = driver.find_element(By.XPATH, "//span[text()='Load more results']")
            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
            time.sleep(3)
            btn.click()
            time.sleep(10)
        except WebDriverException:
            pass

        # Collect hotels
        # NOTE: every pass re-reads all cards currently on the page.
        cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='property-card']")
        for card in cards:
            try:
                link = card.find_element(
                    By.CSS_SELECTOR,
                    "a[data-testid='property-card-desktop-single-image']",
                ).get_attribute("href")
            except (NoSuchElementException, StaleElementReferenceException):
                continue

            # Check for duplicates first to avoid extra lookups on known cards.
            if not link or link in hotel_data:
                continue

            hotel_data[link] = {
                'link': link,
                'hotel': _optional_text(card, "div[data-testid='title']"),
                'price': _optional_text(
                    card, "span[data-testid='price-and-discounted-price']"
                ),
            }

        current = len(hotel_data)
        print(f"Unique hotels: {current}")

        # Stop condition (data-based)
        if current == last_count:
            stagnation += 1
            print(f"No growth ({stagnation}/{max_stagnation})")
        else:
            stagnation = 0

        last_count = current

        if stagnation >= max_stagnation:
            print("No more hotels loading — confirmed.")
            break

    print(f"FINAL HOTEL COUNT: {len(hotel_data)}")

    # Explicit columns keep the frame usable even when nothing was collected.
    df = pd.DataFrame(list(hotel_data.values()), columns=['link', 'hotel', 'price'])

    return driver, df

## Extract the description of a listing

In [127]:
def extract_description(driver, link, timeout=10):
    """Open a property page and return its description text.

    Args:
        driver: WebDriver used to load the page.
        link: URL of the property page.
        timeout: Maximum number of seconds to wait for the description
            element to appear.

    Returns:
        The text of the ``p[data-testid='property-description']`` element, or
        None when it could not be read (the error is printed).
    """
    # Imported locally: the base Selenium exception is only needed here.
    from selenium.common.exceptions import WebDriverException

    # Navigate to the link
    driver.get(link)

    # Maximize the window to make sure it's visible
    driver.maximize_window()

    # Bring window to front (best effort; a failure here is not fatal)
    try:
        driver.switch_to.window(driver.current_window_handle)
    except WebDriverException:
        pass

    # Verify we're on the right page
    print(f"Navigating to: {driver.title}")

    # Wait for the description instead of sleeping for a fixed time, so fast
    # pages are not delayed and slow pages get up to ``timeout`` seconds.
    locator = (By.CSS_SELECTOR, "p[data-testid='property-description']")
    try:
        description = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(locator),
            message=f"description element not found within {timeout}s",
        ).text
    except WebDriverException as e:
        print(f"Error extracting description: {e}")
        return None

    print("Description extracted\n")
    return description

## Run the process flow

In [143]:
# Entry point of the scrape: English-language Booking.com with prices in EUR.
booking_url = "https://www.booking.com/?lang=en-us&selected_currency=EUR"

# Launch the (visible) browser and clear the start-up pop-ups.
driver = open_website(booking_url)

# Search Barcelona for the festival week.
driver = search(
    driver,
    "Barcelona",
    "2026-05-19",
    "2026-05-26",
)

✅ Browser opened!
✅ Navigated to Booking.com search results
Page title: Booking.com | Official site | The best hotels, flights, car rentals & accommodations
✅ Search Complete


In [144]:
driver, hotels_df = collect_all_hotels(driver)

# Save the raw results to CSV before any clean-up
hotels_df.to_csv('booking_hotels.csv', index=False)

# Strip the currency prefix and thousands separators (plain text, not regex)
hotels_df["price"] = hotels_df["price"].str.replace("€ ", "", regex=False)
hotels_df["price"] = hotels_df["price"].str.replace(",", "", regex=False)

# Number of descriptions to fetch, capped at the number of rows available
n = 3
total = min(n, len(hotels_df))

# Create 'text' column
hotels_df['text'] = None

for i in range(total):
    link = hotels_df.loc[i, 'link']
    title = hotels_df.loc[i, 'hotel']

    # Report progress against the real total, not the requested ``n``
    print(f"Processing {i+1}/{total}: {title} ")

    # Extract description
    description = extract_description(driver, link)

    # Store in DataFrame
    hotels_df.at[i, 'text'] = description

    # Pause a random 3-8 seconds between page loads
    time.sleep(random.randint(3, 8))

print(f"\n Completed extracting descriptions for {total} hotels")

Starting scrolling ...
Unique hotels: 25
Unique hotels: 25
No growth (1/20)
Unique hotels: 25
No growth (2/20)
Unique hotels: 25
No growth (3/20)
Unique hotels: 25
No growth (4/20)
Unique hotels: 25
No growth (5/20)
Unique hotels: 75
Unique hotels: 75
No growth (1/20)
Unique hotels: 75
No growth (2/20)
Unique hotels: 75
No growth (3/20)
Unique hotels: 75
No growth (4/20)
Unique hotels: 75
No growth (5/20)
Unique hotels: 75
No growth (6/20)
Unique hotels: 75
No growth (7/20)
Unique hotels: 150
Unique hotels: 200
Unique hotels: 250
Unique hotels: 300
Unique hotels: 350
Unique hotels: 400
Unique hotels: 450
Unique hotels: 491
Unique hotels: 533
Unique hotels: 578
Unique hotels: 622
Unique hotels: 666
Unique hotels: 702
Unique hotels: 748
Unique hotels: 789
Unique hotels: 833
Unique hotels: 879
Unique hotels: 926
Unique hotels: 968
Unique hotels: 1010
Unique hotels: 1058
Unique hotels: 1104
Unique hotels: 1152
Unique hotels: 1197
Unique hotels: 1246
Unique hotels: 1293
Unique hotels: 1338


In [None]:
hotels_df['city'] = "Barcelona"
# ``datetime`` is the class here (``from datetime import datetime``), so
# ``datetime.date()`` has no instance to work on and raises TypeError.
# Presumably the intent is to stamp each row with today's (scrape) date.
hotels_df['date'] = datetime.today().date()
# Display results
hotels_df

# Save to CSV
# hotels_df.to_csv('hotels_with_descriptions.csv', index=False)