GROUP 8: Viktoriia Yuzkiv, Sebastien Boxho, Arianna Michelangelo

This notebook contains the pipeline that scrapes Booking.com listing data and saves it to a CSV file.


Cities:
- Barcelona
- Marseille
- Valencia
- Porto

Dates:
- MWC Barcelona: 26 - 29 February 2024 (selected period: 23 Feb - 3 Mar)
- Second period: 8 - 17 March 2024


## Import libraries

In [1]:
import time
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

## Open and scrape Booking

In [2]:
# Booking.com landing page (en-gb locale) that every scraping session opens first
link = 'https://www.booking.com/index.en-gb.html'

# Location of the geckodriver executable - edit to match your machine before running
geko_path = './geckodriver'

# Directory of an existing Firefox profile (optional) - edit to your own profile if you use one
profile_path = '/Users/viktoriia/Library/Application Support/Firefox/Profiles/k7kr4dw0.Viktoriia'

In [4]:
def start_up(link, geko_path, profile_path):
    '''
    Launch Firefox through geckodriver and open ``link`` in a maximized window.

    Parameters
    ----------
    link : str
        URL to load once the browser is running.
    geko_path : str
        Path to the geckodriver executable.
    profile_path : str or falsy value
        Directory of an existing Firefox profile to reuse. When falsy, a fresh
        profile is created and the cookie-consent button is clicked if present.

    Returns
    -------
    The running ``webdriver.Firefox`` instance (the caller must ``quit()`` it).

    Raises
    ------
    Any exception raised while loading the page is re-raised after the
    browser has been closed, so a failed start-up does not leave a stray
    Firefox process behind.
    '''
    # webdriver.FirefoxOptions is the same class as Options, so one object serves both branches
    options = Options()
    if profile_path:
        options.add_argument(f'--profile={profile_path}')
    else:
        options.profile = webdriver.FirefoxProfile()

    browser = webdriver.Firefox(service=Service(geko_path), options=options)

    try:
        # Website address here
        browser.get(link)
        time.sleep(2)  # Adjust sleep time as needed

        # Maximize the browser window for full screen
        browser.maximize_window()

        if not profile_path:
            # Click on "Accept cookies". The banner is not guaranteed to be
            # shown, so its absence must not abort the whole start-up.
            try:
                browser.find_element(by='xpath', value='//button[@id="onetrust-accept-btn-handler"]').click()
            except NoSuchElementException:
                pass
    except Exception:
        browser.quit()
        raise

    return browser

In [5]:
def check_and_click(browser, path, type, timeout=3, poll_interval=1):
    '''
    Try to click an element, retrying while it is not yet present in the page.

    Parameters
    ----------
    browser : selenium webdriver (anything exposing ``find_element(by, value)``)
    path : str
        Locator expression for the element.
    type : str
        Locator kind: ``"xpath"`` or ``"css"``.
    timeout : float, default 3
        Seconds after which the function stops retrying.
    poll_interval : float, default 1
        Seconds to wait between two attempts.

    Returns
    -------
    ``"Clicked!"`` if the element was found and clicked,
    ``None`` if it was not found before the timeout (or ``type`` is unsupported),
    ``False`` if any other error occurred (the error is printed).
    '''
    locator_by = {"xpath": "xpath", "css": "css selector"}.get(type)
    if locator_by is None:
        # Unsupported locator kind: no attempt can ever succeed, so report
        # "not found" right away instead of idling until the timeout.
        return None

    start_time = time.time()  # Record the start time
    while True:
        try:
            browser.find_element(locator_by, path).click()
            return "Clicked!"  # Element found and clicked successfully
        except NoSuchElementException:
            pass  # Not in the page yet - retry after a short pause
        except Exception as e:
            print(f"An error occurred: {e}")
            return False  # Other unexpected errors

        time.sleep(poll_interval)
        if time.time() - start_time >= timeout:
            return None
            
            
            
def check_obscures(browser, xpath, type):
    '''
    Try to click an element once and report whether the click went through.

    Important: the element IS clicked when the function returns True.

    Parameters
    ----------
    browser : selenium webdriver (anything exposing ``find_element(by, value)``)
    xpath : str
        Locator expression; despite the name it is interpreted according to ``type``.
    type : str
        One of ``"xpath"``, ``"id"``, ``"css"``, ``"class"``, ``"link"``.

    Returns
    -------
    True if the element was clicked.
    False if the click was intercepted by another element, the element was
    missing or stale (the exception is printed), or ``type`` is not supported.
    '''
    strategies = {
        "xpath": "xpath",
        "id": "id",
        "css": "css selector",
        "class": "class name",
        "link": "link text",
    }
    by = strategies.get(type)
    if by is None:
        # Nothing would be clicked for an unknown locator kind, so do not
        # report success for it.
        print(f"Unsupported locator type: {type!r}")
        return False

    try:
        browser.find_element(by, xpath).click()
    except (ElementClickInterceptedException, NoSuchElementException, StaleElementReferenceException) as e:
        print(e)
        return False
    return True

In [6]:
def select_place(browser, place):
    '''
    Type the destination ``place`` into the search field of the page.

    The field is located by its element id (":re:"). Returns nothing.
    '''
    browser.find_element(by='xpath', value='//*[@id=":re:"]').send_keys(place)
    # Fixed 2-second pause before returning (presumably lets the page react to the typed text)
    time.sleep(2)
    
    
def select_dates(browser, start_date, end_date, month='enero 2024', max_months=24):
    '''
    Open the date picker, move to the requested month and click the stay dates.

    Parameters
    ----------
    browser : selenium webdriver
    start_date, end_date : str
        Values compared with each calendar cell's ``data-date`` attribute
        (e.g. ``'2024-03-08'``).
    month : str
        Header text of the month to display. It is compared for exact equality
        with the calendar header, so it must be written in the language the
        page is shown in (e.g. ``'March 2024'`` on the English site).
    max_months : int, default 24
        Maximum number of clicks on the "next month" button before giving up.

    Raises
    ------
    ValueError
        If the header never equals ``month`` within ``max_months`` clicks
        (typically a wrong language or a misspelt month).
    '''
    # Click on the dates filter
    css = 'button.ebbedaf8ac:nth-child(2) > span:nth-child(1)'
    browser.find_element('css selector', css).click()
    time.sleep(2)

    # Open dates filter on the month of interest. The number of clicks is
    # bounded so a header that never matches cannot keep the loop spinning.
    header_xpath = '//h3[@class="e1eebb6a1e ee7ec6b631"]'
    next_xpath = '//button[@class="a83ed08757 c21c56c305 f38b6daa18 d691166b09 f671049264 deab83296e f4552b6561 dc72a8413c f073249358"]'
    clicks = 0
    while browser.find_element('xpath', header_xpath).text != month:
        if clicks >= max_months:
            raise ValueError(
                f"Month {month!r} was not reached after {max_months} clicks on the calendar"
            )
        browser.find_element('xpath', next_xpath).click()
        clicks += 1

    # Get the list of all dates
    path = '//div[@id="calendar-searchboxdatepicker"]//table[@class="eb03f3f27f"]//tbody//td[@class="b80d5adb18"]//span[@class="cf06f772fa"]'
    dates = browser.find_elements('xpath', path)
    time.sleep(2)
    for date in dates:
        # Read the attribute once, before any click can alter the calendar
        day = date.get_attribute("data-date")
        if day == start_date:
            date.click()
        if day == end_date:
            date.click()
            break

In [7]:
def get_number_pages(browser):
    '''
    Return the total number of result pages.

    The count is read from the text of the last pagination button. When the
    page has no pagination buttons at all (the results fit on a single page)
    1 is returned instead of failing with an IndexError.
    '''
    page_buttons = browser.find_elements('xpath', '//button[@class="a83ed08757 a2028338ea"]')
    if not page_buttons:
        return 1
    return int(page_buttons[-1].text)


def get_hotels(browser):
    '''
    Collect the listings shown on the current results page.

    Returns a list with one dict per listing card, holding the keys
    'name', 'price', 'description_short', 'rating' and 'url'.
    'description_short' and 'rating' are NaN when the card does not contain
    the corresponding element. Each listing name is printed as it is read.
    '''
    def optional_text(card, xpath):
        # Text of the matching element inside the card, or NaN when it is absent
        try:
            return card.find_element('xpath', xpath).text
        except NoSuchElementException:
            return np.nan

    listings = []

    for card in browser.find_elements('xpath', '//div[@class="c066246e13"]'):
        # Name, price and link are looked up without a fallback
        name = card.find_element('xpath', './/div[@class="f6431b446c a15b38c233"]').text
        price = card.find_element('xpath', './/span[@class="f6431b446c fbfd7c1165 e84eb96b1f"]').text
        url = card.find_element('xpath', './/a[@class="a78ca197d0"]').get_attribute('href')

        short_description = optional_text(card, './/div[@class="abf093bdfe d323a31618"]')
        rating = optional_text(card, './/div[@class="a3b8729ab1 d86cee9b25"]')

        listings.append({
            'name': name,
            'price': price,
            'description_short': short_description,
            'rating': rating,
            'url': url,
        })

        print(name)

    return listings


In [8]:
# Change place here or add other places
places = ['Barcelona']   # ['Barcelona', 'Marseille']

# Change dates here
dates = [{'start_date': '2024-03-08', 'end_date': '2024-03-17', 'month': 'March 2024'}]

# Column order of the final table
columns = ['place', 'start_date', 'end_date', 'name', 'price', 'description_short', 'rating', 'url']

# One DataFrame per scraped page. They are concatenated once at the end; if a
# run fails midway, the pages collected so far are still available in `frames`.
frames = []

for date in dates:
    start_date = date['start_date']
    end_date = date['end_date']
    month = date['month']

    for place in places:

        # Open home page

        # If you have a Firefox profile with ad-blocker, please add your profile_path here.
        # Otherwise, no changes in the code are needed, but the Google pop-up will have to be closed manually.
        browser = start_up(link, geko_path, profile_path=False)

        try:
            time.sleep(5)

            # Check for the pop up on the first page and close it if it appears
            check_and_click(browser, '.f4552b6561', 'css')

            # Choose the place
            select_place(browser, place)

            # Choose the dates
            select_dates(browser, start_date, end_date, month)

            # Click on the search button
            my_xpath = '/html/body/div[3]/div[2]/div/form/div[1]/div[4]/button/span'
            check_and_click(browser, my_xpath, type='xpath')

            time.sleep(3)

            # Click on the another pop up
            check_and_click(browser, 'div.cc1b961f14:nth-child(1) > button:nth-child(1)', 'css')

            # Get the number of pages
            pages = get_number_pages(browser)

            for p in range(pages):

                df_temp = pd.DataFrame(get_hotels(browser))
                df_temp['place'] = place
                df_temp['start_date'] = start_date
                df_temp['end_date'] = end_date
                frames.append(df_temp)

                # Move to the next page - except after the last one, where
                # there is no further page to open.
                if p < pages - 1:
                    time.sleep(2)
                    css = 'div.b16a89683f:nth-child(3) > button:nth-child(1)'
                    browser.find_element('css selector', css).click()
                    time.sleep(2)
        finally:
            # Always close Firefox, otherwise every place/date combination
            # leaves a browser process running.
            browser.quit()

if frames:
    df = pd.concat(frames, ignore_index=True).reindex(columns=columns)
else:
    df = pd.DataFrame(columns=columns)
             

Duquesa Suites Barcelona
Sonder Casa Luz
Very quiet and centric area to enjoy Barcelona
TWO Hotel Barcelona by Axel 4* Sup- Adults Only
Sallés Hotel Pere IV
The Hoxton, Poblenou
Vincci Bit
The Streets Apartments Rambla
Travelodge Barcelona Poblenou
Hotel America Barcelona
Four Points by Sheraton Barcelona Diagonal
Hotel & Spa Villa Olimpica Suites
Vincci Maritimo
NH Sants Barcelona
Catalonia Sagrada Familia
Barcelonaforrent Urban Town Suites
Hotel Barcelona Center
Hostel New York
Zenit Barcelona
Hilton Barcelona
Hostal Boqueria
RAMBLAS HOTEL powered by Vincci Hoteles
Catalonia Atenas
NH Barcelona Diagonal Center
Acta CITY47
Hotel Lloret Ramblas
Hotel Concordia Barcelona
Leonardo Hotel Barcelona Gran Via
Occidental Atenea Mar - Adults Only
ITC Barcelona By Soho Boutique
Hesperia Barcelona Presidente
Leonardo Royal Hotel Barcelona Fira
Acacia Suite
Gran Hotel Havana 4Sup by Escampa Hotels
Nice apartment BCN two rooms wifi
Radisson Blu 1882 Hotel, Barcelona Sagrada Familia
NH Barcelona Le

Alba Hotel
ABAL554 - Azotea exclusiva del centro de la ciudad
Cosmo Apartments Passeig de Gràcia
Estudio para parejas
Aspasios Garden Apartments
Aparthotel Mariano Cubi Barcelona
BUHO Boutique Rooms
Apartments Sata Olimpic Village Area
Suitur apartamento floridablanca barcelona
Espais Blaus Apartments
Rooms Arago
Hostal Bcn 46
Inside Barcelona Apartments Salva
El Avenida Palace
Apartaments-Hotel Hispanos 7 Suiza
H10 Port Vell 4* Sup
Midtown Apartments
Aspasios Las Ramblas Apartments
Sonder Paseo de Gracia
BBarcelona Clot Apartments
Hotel Ciutadella Barcelona
Rufus Port Forum
La Mare by The Streets
AinB Sagrada Familia Apartments
Alos Apartments Paseo de Gracia-Diagonal
Fontanella By BCN URBAN Rooms
Cosmo Apartments Sants
Hotel Suizo
Plaza Goya Rooms
Hostemplo Sagrada Familia
The One Barcelona GL
Jardinets De Gràcia by The 5ve Soul
Renaissance Barcelona Hotel
Praktik Èssens
Duquesa Suites Barcelona
ibis Styles Barcelona City Bogatell
B&B Boutique Ca La Maria
ABaC Restaurant Hotel Barcel

Fisa Rentals Les Corts Apartments
AB Poble Sec Apartment
Eixample 7
The Barcelona Centric APT
Hostal Mont Thabor
Enjoybcn Gaudi Apartments
Modern and bright apartment in Eixample 1-2
The Vintage Rentals
Apartment close to Diagonal Beach
W Barcelona
Barcelona4Seasons II
GuestReady - Charming simplicity near Casa Milà
homeinnbcn
Spectacular Five Senses Mallorca Apartment
limehome Barcelona Rector Triadó
Vale Beach
Ibis Barcelona Centro (Sagrada Familia)
Valencia 455
Apartamento soleado en Barcelona city
Cozy Studio with Amazing Terrace near Plaza España
Enjoybcn Patio de Gracia
Piso Zona Plaza España, junto a Montjuic,
Sonder Casa Luz
Barcelona - Sagrada Familia
MR Patio
Stay Barcelona Gracia Apartments
2 bedroom with balcony in Eixample 1-1
Wonderful 2-bd apart with balcony Plaza Cataluña
Parc Güell 2 1
Sweet Inn - Verdi
L'alzina Apartment
Céntrico apartamento
Luxury 1BR with Patio near the Beach
Girona 108
Stay Barcelona Sant Martí
Lodging Apartments Barceloneta Beach Studio 32
Casa fa

AB Paral·lel Apartments
You Stylish Sagrada Familia Apartments
Click&Flat Seneca Suites
Penthouse Pza España - 4th Floor
Port Forum Apartments by Olala Homes
Crysoyle Apartment Barcelona Next to Camp Nou
Cozy Apartament in the center
Cozy near the Barcelona Camp Nou Stadium
NEW Renovated COZY APPARTMENT CENTRIC
AB Sant Antoni Studio
GIR1A · GIR1A -Modernist apartment next to Paseo de Gracia
Wonderful boat for families and friends
Boat to sleep in Barcelona
Urban Flats Vallirana
The Corner Apartments by Aspasios
Apartamentos Navas 2
Aspasios Market Balconies
Stay U-nique Apartments Caminal
Stay U-nique Apartments Tapioles
Classbedroom Born Apartments
Sunny Apartments Barcelona
Stay U-nique Apartments Travessera II
Sweet Inn - Carrer Bailen
Stay U-nique Apartments Rambla Catalunya III
Apartamento Via Augusta 153
AB Nou de la Rambla
Hotel España Ramblas
Homenfun Barcelona Sagrada Familia Aragón
Pere Serafi 2 bedroom apartment
Eric Vökel Boutique Apartments - Sagrada Familia Suites
Click&F

In [12]:
# Preview the first 10 rows of the scraped listings table
df.head(10)

Unnamed: 0,place,start_date,end_date,name,price,description_short,rating,url
0,Barcelona,2024-03-08,2024-03-17,Duquesa Suites Barcelona,"€ 2,003",,8.8,https://www.booking.com/hotel/es/duquesa-suite...
1,Barcelona,2024-03-08,2024-03-17,Sonder Casa Luz,"€ 1,674",,8.4,https://www.booking.com/hotel/es/casa-luz-barc...
2,Barcelona,2024-03-08,2024-03-17,Very quiet and centric area to enjoy Barcelona,€ 383,,,https://www.booking.com/hotel/es/very-quiet-an...
3,Barcelona,2024-03-08,2024-03-17,TWO Hotel Barcelona by Axel 4* Sup- Adults Only,"€ 1,361",,8.4,https://www.booking.com/hotel/es/two-barcelona...
4,Barcelona,2024-03-08,2024-03-17,Sallés Hotel Pere IV,"€ 1,492",,8.1,https://www.booking.com/hotel/es/pereiv.en-gb....
5,Barcelona,2024-03-08,2024-03-17,"The Hoxton, Poblenou","€ 1,601",,8.7,https://www.booking.com/hotel/es/the-hoxton-po...
6,Barcelona,2024-03-08,2024-03-17,Vincci Bit,"€ 1,649",,8.4,https://www.booking.com/hotel/es/vincci-bit.en...
7,Barcelona,2024-03-08,2024-03-17,The Streets Apartments Rambla,"€ 1,834",Entire studio • 1 bathroom • 1 kitchen • 40m²,9.2,https://www.booking.com/hotel/es/the-streets-a...
8,Barcelona,2024-03-08,2024-03-17,Travelodge Barcelona Poblenou,€ 974,,7.3,https://www.booking.com/hotel/es/travelodge-ba...
9,Barcelona,2024-03-08,2024-03-17,Hotel America Barcelona,"€ 1,397",,8.6,https://www.booking.com/hotel/es/america-barce...


In [13]:
# HTTP headers sent with every request made by scrape_description: a browser-like User-Agent string
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}


# Function to scrape the descriptions using Beautiful Soup
def scrape_description(url, retries=2, backoff=1.0):
    """
    Download a listing page and return the text of its description paragraph.

    Parameters
    ----------
    url : str
        Address of the listing page.
    retries : int, default 2
        Extra attempts made when the request times out or the connection
        fails. Other request errors (e.g. HTTP error statuses) are not retried.
    backoff : float, default 1.0
        Base pause in seconds between attempts; the pause grows linearly
        with the attempt number.

    Returns
    -------
    The description text with surrounding whitespace stripped, or None when
    the page could not be fetched or holds no description paragraph (a
    message is printed in both cases).
    """
    transient_errors = (requests.exceptions.Timeout, requests.exceptions.ConnectionError)
    attempts = max(retries, 0) + 1

    for attempt in range(attempts):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            break
        except transient_errors as e:
            # A slow or dropped connection is often temporary: wait, then retry
            if attempt < attempts - 1:
                time.sleep(backoff * (attempt + 1))
                continue
            print(f"Error processing {url}: {e}")
            return None
        except requests.exceptions.RequestException as e:
            print(f"Error processing {url}: {e}")
            return None

    soup = BeautifulSoup(response.text, 'html.parser')
    description_tag = soup.find('p', class_='a53cbfa6de b3efd73f69')

    if description_tag:
        return description_tag.get_text(strip=True)

    print(f"Description tag not found on the page: {url}")
    return None

    

# Number of worker threads downloading listing pages at the same time
# (adjust this based on the processing power of your computer)
num_threads = 16

descriptions = []
i = 0

# Fetch the descriptions concurrently; executor.map yields the results in the
# same order as the URLs, so they line up with the rows of df
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    for description in executor.map(scrape_description, df['url']):
        i += 1
        descriptions.append(description)
        # Report progress every 50 links
        if i % 50 == 0:
            print(f"Scraped {i} links")

# Store the results in the 'description' column of df
df['description'] = descriptions

# Final count once every worker has finished
print(f"Scraped {len(descriptions)} links")
print("\nDone!\n")

Scraped 50 links
Scraped 100 links
Scraped 150 links
Scraped 200 links
Scraped 250 links
Scraped 300 links
Scraped 350 links
Error processing https://www.booking.com/hotel/es/apartments-in-barcelona-sagrada-familia.en-gb.html?label=gen173nr-1FCAEoggI46AdICVgEaEaIAQGYAQm4AQfIAQ_YAQHoAQH4AQKIAgGoAgO4AoWluq0GwAIB0gIkMTk3MmE3NzEtNDAxOS00NWUwLWE5ZjEtYWYxNWU2YTMzMTg52AIF4AIB&aid=304142&ucfs=1&arphpl=1&checkin=2024-03-08&checkout=2024-03-17&dest_id=-372490&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=18&hapos=368&sr_order=popularity&srpvid=66eb714cd43f012d&srepoch=1705939689&all_sr_blocks=41093403_91474447_2_0_0&highlighted_blocks=41093403_91474447_2_0_0&matching_block_id=41093403_91474447_2_0_0&sr_pri_blocks=41093403_91474447_2_0_0__191300&from_sustainable_property_sr=1&from=searchresults#hotelTmpl: HTTPSConnectionPool(host='www.booking.com', port=443): Read timed out.
Scraped 400 links
Scraped 450 links
Scraped 500 links
Scraped 550 links
Scrape

In [14]:
# Preview the table again now that the 'description' column has been added
df.head()

Unnamed: 0,place,start_date,end_date,name,price,description_short,rating,url,description
0,Barcelona,2024-03-08,2024-03-17,Duquesa Suites Barcelona,"€ 2,003",,8.8,https://www.booking.com/hotel/es/duquesa-suite...,"Set the centre of Barcelona, 400 metres from P..."
1,Barcelona,2024-03-08,2024-03-17,Sonder Casa Luz,"€ 1,674",,8.4,https://www.booking.com/hotel/es/casa-luz-barc...,"Set in Barcelona, Sonder Casa Luz offers a ter..."
2,Barcelona,2024-03-08,2024-03-17,Very quiet and centric area to enjoy Barcelona,€ 383,,,https://www.booking.com/hotel/es/very-quiet-an...,Very quiet and centric area to enjoy Barcelona...
3,Barcelona,2024-03-08,2024-03-17,TWO Hotel Barcelona by Axel 4* Sup- Adults Only,"€ 1,361",,8.4,https://www.booking.com/hotel/es/two-barcelona...,Offering a rooftop pool with a large sun terra...
4,Barcelona,2024-03-08,2024-03-17,Sallés Hotel Pere IV,"€ 1,492",,8.1,https://www.booking.com/hotel/es/pereiv.en-gb....,"Pere IV is situated in the Poble Nou District,..."


In [None]:
# Save to csv
#df.to_csv('./data/test_data.csv')

## Load data

In [27]:
# First period: one CSV per city, using the first CSV column as the index
Barcelona_period1, Marseille_period1, Porto_period1, Valencia_period1 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('barcelona_p1.csv', 'marseille_p1.csv', 'porto_p1.csv', 'valencia_p1.csv')
)

# Second period: same cities, same layout
Barcelona_period2, Marseille_period2, Porto_period2, Valencia_period2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('barcelona_p2.csv', 'marseille_p2.csv', 'porto_p2.csv', 'valencia_p2.csv')
)


In [31]:
# Preview the first rows of the Barcelona first-period data loaded from barcelona_p1.csv
Barcelona_period1.head()

Unnamed: 0,place,start_date,end_date,name,price,description_short,rating,url,description
0,Barcelona,2024-02-23,2024-03-03,Duquesa Suites Barcelona,"€ 3,491",,8.8,https://www.booking.com/hotel/es/duquesa-suite...,"Set the centre of Barcelona, 400 metres from P..."
1,Barcelona,2024-02-23,2024-03-03,Sonder Casa Luz,"€ 4,457",,8.4,https://www.booking.com/hotel/es/casa-luz-barc...,"Set in Barcelona, Sonder Casa Luz offers a ter..."
2,Barcelona,2024-02-23,2024-03-03,Valencia 2,"€ 1,395",,,https://www.booking.com/hotel/es/valencia-2.en...,"Located in Barcelona, 1.2 km from Passeig de G..."
3,Barcelona,2024-02-23,2024-03-03,Fuster Apartments by Aspasios,"€ 2,448",Entire apartment • 2 bedrooms • 1 living room ...,9.2,https://www.booking.com/hotel/es/fuster-apartm...,Fuster Apartments are just 150 metres from Dia...
4,Barcelona,2024-02-23,2024-03-03,BarcelonaForRent The Central Place,"€ 5,350",Entire apartment • 1 bedroom • 1 living room •...,8.5,https://www.booking.com/hotel/es/barcelonaforr...,"Offering views of Casa Batlló, BarcelonaForRen..."


In [32]:
# Clean the price column

def clean_price(prices):
    """
    Convert a price column such as '€ 3,491' into integers (3491).

    Parameters
    ----------
    prices : pandas.Series
        Either the raw text prices or an already numeric column.

    Returns
    -------
    pandas.Series of integers.

    A column that is already numeric is only cast to int, so the cell can be
    re-run without failing on the ``.str`` accessor.
    """
    if pd.api.types.is_numeric_dtype(prices):
        return prices.astype(int)

    return (
        prices.str.replace('€', '', regex=False)   # drop the currency symbol
              .str.replace(',', '', regex=False)   # drop the thousands separator
              .str.strip()
              .astype(int)
    )


Barcelona_period1['price'] = clean_price(Barcelona_period1['price'])

In [33]:
# Preview the Barcelona first-period rows after the price column clean-up
Barcelona_period1.head()

Unnamed: 0,place,start_date,end_date,name,price,description_short,rating,url,description
0,Barcelona,2024-02-23,2024-03-03,Duquesa Suites Barcelona,3491,,8.8,https://www.booking.com/hotel/es/duquesa-suite...,"Set the centre of Barcelona, 400 metres from P..."
1,Barcelona,2024-02-23,2024-03-03,Sonder Casa Luz,4457,,8.4,https://www.booking.com/hotel/es/casa-luz-barc...,"Set in Barcelona, Sonder Casa Luz offers a ter..."
2,Barcelona,2024-02-23,2024-03-03,Valencia 2,1395,,,https://www.booking.com/hotel/es/valencia-2.en...,"Located in Barcelona, 1.2 km from Passeig de G..."
3,Barcelona,2024-02-23,2024-03-03,Fuster Apartments by Aspasios,2448,Entire apartment • 2 bedrooms • 1 living room ...,9.2,https://www.booking.com/hotel/es/fuster-apartm...,Fuster Apartments are just 150 metres from Dia...
4,Barcelona,2024-02-23,2024-03-03,BarcelonaForRent The Central Place,5350,Entire apartment • 1 bedroom • 1 living room •...,8.5,https://www.booking.com/hotel/es/barcelonaforr...,"Offering views of Casa Batlló, BarcelonaForRen..."
