Hotel Scraping project

In [136]:
!pip install beautifulsoup4
!pip install selenium
!pip install requests
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


In [4]:
from bs4 import BeautifulSoup, ResultSet
from datetime import datetime,timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
import time
from typing import *

In [5]:
# Chrome options for the scraping session.
options = Options()
# Open the browser window at 1920x1080.
options.add_argument("--window-size=1920,1080") 

# Start the Chrome session; this driver is passed to the scraping helpers in later cells.
driver = webdriver.Chrome(options=options)

### Scraping logic: 
Scroll to the end of the page until the `Load more results` button appears, click it, and scroll again. Once the page has loaded 100 or more hotels, take the hotel cards from the page and extract the data from them.

In [6]:
def click_load_more(driver: webdriver.Chrome) -> bool:
    """Scroll to the bottom of the page and click the "load more results" button.

    Waits up to 10 seconds for a button whose text contains
    "load more results" to become clickable.

    Returns:
        True if the button was clicked; False if it was not found or did not
        become clickable in time.
    """
    button_locator = (
        By.XPATH,
        "//button[contains(translate(., 'LOAD MORE RESULTS', 'load more results'), 'load more results')]",
    )
    try:
        # Jump to the end of the document before waiting for the button.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(button_locator)
        ).click()
    except (NoSuchElementException, TimeoutException):
        return False
    return True

In [7]:
def build_url(time_to_travel: int, length_of_stay: int) -> str:
    """Build the Booking.com search-results URL for a stay starting in the future.

    Args:
        time_to_travel: Days from today until check-in.
        length_of_stay: Days between check-in and check-out.

    Returns:
        The search URL with `checkin` and `checkout` formatted as YYYY-MM-DD;
        every other query parameter is fixed in the template below.
    """
    checkin_date = datetime.today() + timedelta(days=time_to_travel)
    checkout_date = checkin_date + timedelta(days=length_of_stay)
    checkin_date_str, checkout_date_str = (
        day.strftime('%Y-%m-%d') for day in (checkin_date, checkout_date)
    )
    return f'https://www.booking.com/searchresults.en-gb.html?ss=New+York&ssne=New+York&ssne_untouched=New+York&lang=en-gb&dest_id=20088325&dest_type=city&checkin={checkin_date_str}&checkout={checkout_date_str}&group_adults=2&no_rooms=1&group_children=0&selected_currency=USD'


In [8]:
def close_genius_modal(driver: webdriver.Chrome) -> bool:
    """Click the close button of the modal overlay, if it is on the page.

    The button is located by a fixed, position-based XPath under the
    `b2searchresultsPage` element.

    Returns:
        True if the button was found and clicked; False if it was not found.
        Only NoSuchElementException and TimeoutException are handled here;
        any other error raised while clicking propagates to the caller.
    """
    close_button_xpath = '//*[@id="b2searchresultsPage"]/div[21]/div/div/div/div[1]/div[1]/div/button'
    try:
        driver.find_element(By.XPATH, close_button_xpath).click()
    except (NoSuchElementException, TimeoutException):
        return False
    return True

In [9]:
def load_more(driver: webdriver.Chrome) -> bool:
    """Scroll the page via the keyboard and click the "Load more results" button.

    Sends HOME, pauses half a second, then sends END to the page body before
    looking for the button.

    Returns:
        True if the button was found and clicked; False if locating or
        clicking it raised any exception.
    """
    page_body = driver.find_element(By.TAG_NAME, 'body')
    page_body.send_keys(Keys.HOME)
    time.sleep(0.5)
    page_body.send_keys(Keys.END)

    try:
        driver.find_element(
            By.XPATH, "//button[span[contains(text(), 'Load more results')]]"
        ).click()
    except Exception:
        # Best effort: a missing or unclickable button is reported as False.
        return False
    return True

In [10]:
import bs4

def get_hotels(driver: webdriver.Chrome, url: str, min_hotels: int = 100,
               max_stalled_rounds: int = 5) -> List[bs4.element.Tag]:
    """Open a search-results page and load property cards until enough are present.

    Each round dismisses the modal overlay (if any), triggers "Load more
    results", and re-parses the page for `property-card` elements.

    Args:
        driver: Browser session used to load and scroll the page.
        url: Search-results URL to open.
        min_hotels: Stop once at least this many property cards are on the page.
        max_stalled_rounds: Give up after this many consecutive rounds in which
            the number of cards did not grow. Without this limit the loop would
            never end for a search that has fewer than `min_hotels` results.

    Returns:
        The property-card elements found on the last parsed page. This may hold
        fewer than `min_hotels` entries if loading stalled.
    """
    driver.get(url)
    time.sleep(3)  # let the initial results render

    hotels = []
    stalled_rounds = 0
    while len(hotels) < min_hotels and stalled_rounds < max_stalled_rounds:
        time.sleep(2)
        close_genius_modal(driver)
        load_more(driver)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        found = soup.find_all('div', {'data-testid': "property-card"})
        # Count rounds without progress so an exhausted result list ends the loop.
        stalled_rounds = 0 if len(found) > len(hotels) else stalled_rounds + 1
        hotels = found
    return hotels

In [11]:
import re

def kms_from_centre(text: str) -> Optional[float]:
    """Parse a "<number> km from centre" label into kilometres.

    Distances given in metres ("350 m from centre") are converted to
    kilometres.

    Args:
        text: Distance label text.

    Returns:
        The distance in kilometres, or None when the text contains no
        recognisable distance (instead of raising AttributeError).
    """
    match = re.search(r'(\d+(?:\.\d+)?)\s*(km|m)\s+from\s+centre', text)
    if match is None:
        return None
    value = float(match.group(1))
    # Normalise metres to kilometres so the column has a single unit.
    return value if match.group(2) == 'km' else value / 1000

def extract_hotel_info(hotel: bs4.element.Tag) -> Dict[str, Any]:
    """Extract the fields of interest from one property card.

    Elements are located by their `data-testid` attributes. A field whose
    element is missing from the card is set to None (or False for the
    boolean flags) instead of raising, so one incomplete card does not abort
    the whole scrape.

    Args:
        hotel: A `property-card` element.

    Returns:
        A dict with the keys: name, price, review_score, review_title,
        number_of_reviews, star_rating, kms_from_centre, location_score,
        room_type, bed_type, breakfast_included, free_cancellation,
        prepayment_needed.
    """
    hotel_info = {}

    # basic info
    title_div = hotel.find('div', {'data-testid': 'title'})
    price_span = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
    hotel_info['name'] = title_div.text if title_div is not None else None
    hotel_info['price'] = price_span.text if price_span is not None else None

    # review parameters: default to None, then fill in whatever the card provides
    hotel_info['review_score'] = None
    hotel_info['review_title'] = None
    hotel_info['number_of_reviews'] = None
    rating_div = hotel.find('div', {'data-testid': 'review-score'})
    review_divs = rating_div.find_all('div') if rating_div is not None else []
    if review_divs:
        # The first div holds several text pieces; the score is the second one.
        score_parts = review_divs[0].get_text(separator=':)', strip=True).split(':)')
        if len(score_parts) > 1:
            hotel_info['review_score'] = score_parts[1]
    if len(review_divs) > 3:
        hotel_info['review_title'] = review_divs[3].get_text(strip=True)
    if len(review_divs) > 4:
        # The review count is the first whitespace-separated token.
        hotel_info['number_of_reviews'] = review_divs[4].get_text(separator=' ', strip=True).split(' ')[0]

    # star rating: one <svg> per star
    star_rating_div = hotel.find('div', {'data-testid': 'rating-stars'})
    if star_rating_div is not None:
        hotel_info['star_rating'] = len(star_rating_div.find_all('svg'))
    else:
        hotel_info['star_rating'] = None

    # distance from centre
    distance_span = hotel.find('span', {'data-testid': 'distance'})
    if distance_span is not None:
        hotel_info['kms_from_centre'] = kms_from_centre(distance_span.get_text(strip=True))
    else:
        hotel_info['kms_from_centre'] = None

    # location score, read from the link's aria-label ("Scored <x.y> ...")
    location_link = hotel.find('a', {'data-testid': 'secondary-review-score-link'})
    location_score_match = None
    if location_link is not None:
        # .get avoids a KeyError when the link carries no aria-label.
        location_score_match = re.search(r'Scored\s(\d+\.\d+)', location_link.get('aria-label', ''))
    hotel_info['location_score'] = location_score_match.group(1) if location_score_match else None

    # room type and bed type: each `and` chain yields None at the first missing element
    recommended_units_div = hotel.find('div', {'data-testid': 'recommended-units'})
    hotel_info['room_type'] = (recommended_units_div and
                               recommended_units_div.h4 and
                               recommended_units_div.h4.get_text(strip=True))
    hotel_info['bed_type'] = (recommended_units_div and
                              recommended_units_div.ul and
                              recommended_units_div.ul.li and
                              recommended_units_div.ul.li.div and
                              recommended_units_div.ul.li.div.div and
                              recommended_units_div.ul.li.div.div.get_text(strip=True))

    # breakfast
    gallery_ribbon = hotel.find('div', {'data-testid': 'gallery-ribbon'})
    hotel_info['breakfast_included'] = (
        gallery_ribbon is not None
        and 'Breakfast included' in gallery_ribbon.get_text(strip=True)
    )

    # free cancellation: flagged by the presence of the cancellation-policy icon
    hotel_info['free_cancellation'] = hotel.find('div', {'data-testid': 'cancellation-policy-icon'}) is not None
    # prepayment: treated as needed unless the prepayment-policy icon is present
    hotel_info['prepayment_needed'] = hotel.find('div', {'data-testid': 'prepayment-policy-icon'}) is None

    return hotel_info

In [12]:
import pandas as pd 

def get_hotels_dataframe(hotels: List[bs4.element.Tag]) -> pd.DataFrame:
    """Turn a list of property cards into a DataFrame with one row per card.

    Each card is converted with `extract_hotel_info`; the resulting dict keys
    become the columns.
    """
    records = list(map(extract_hotel_info, hotels))
    return pd.DataFrame(records)

In [15]:
# Date stamped on every row and used in the output file name.
SNAPSHOT_DATE = datetime.today().strftime('%Y-%m-%d')
MAX_TTT = 30  # largest "time to travel": days from today until check-in
MAX_LOS = 5   # largest length of stay: days between check-in and check-out

# Collect one frame per search and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on every
# iteration, and starting from an empty frame triggers a deprecation warning
# on recent pandas versions.
snapshot_frames = []
for TTT in range(1, MAX_TTT + 1):
    for LOS in range(1, MAX_LOS + 1):
        print(f'{TTT} / {MAX_TTT}, {LOS} / {MAX_LOS}')
        url = build_url(TTT, LOS)
        hotels = get_hotels(driver, url)
        hotels_df = get_hotels_dataframe(hotels)
        # Tag every row with the search parameters that produced it.
        hotels_df['time_to_travel'] = TTT
        hotels_df['length_of_stay'] = LOS
        hotels_df['snapshot_date'] = SNAPSHOT_DATE
        snapshot_frames.append(hotels_df)

df = pd.concat(snapshot_frames, ignore_index=True)
df.to_csv(f'booking_snapshot_{SNAPSHOT_DATE}.csv', index=False)

1 / 30, 1 / 5


KeyboardInterrupt: 