In [1]:
import datetime
import os
import pdb
import re
import time
from threading import Lock, Thread
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup

This helper uses a regular expression to extract the hotel's distance from the city center, in kilometers. It is used by the `get_hotel_data` function below.

In [2]:
def get_distance_from_center(txt):
    """Parse a distance label and return the distance in kilometers.

    When the label contains the marker "מ'" (presumably the Hebrew
    abbreviation for meters - confirm against the site), the first integer in
    the text is read and divided by 1000. Otherwise the first decimal number
    is returned as-is; if the text holds no decimal number, the first integer
    is returned instead.

    Args:
        txt: Text of the distance element of a hotel card.

    Returns:
        float: The distance in kilometers.

    Raises:
        IndexError: If ``txt`` contains no digits at all.
    """
    if "מ'" in txt:
        # https://stackoverflow.com/questions/4289331/how-to-extract-numbers-from-a-string-in-python
        return float(re.findall(r"\d+", txt)[0]) / 1000

    # Prefer a decimal value ("1.5"), but fall back to a plain integer so that a
    # stray period elsewhere in the label (e.g. a trailing full stop) no longer
    # selects the decimal-only pattern and fails with IndexError.
    decimals = re.findall(r"\d+\.\d+", txt)
    if decimals:
        return float(decimals[0])
    return float(re.findall(r"\d+", txt)[0])

Here we extract a single hotel's data, handling edge cases (such as missing fields) as needed.

In [3]:
def get_hotel_data(hotel):
    """Extract the scraped fields of one hotel card.

    Each field is located by a ``data-testid`` attribute or by a CSS class
    name. The class names look machine-generated, so they may change whenever
    the site is redeployed - verify them if fields start coming back empty.

    Args:
        hotel: An element exposing ``find`` / ``find_all`` (the caller passes
            the elements returned by ``soup.select``).

    Returns:
        tuple: ``(hotel_name, preferred_hotel, stars_count,
        distance_from_center, available_rooms, hotel_price, hotel_rating,
        number_of_reviews, new_hotel)`` where

        * ``preferred_hotel`` / ``new_hotel`` are 1 when the matching badge
          element exists, else 0;
        * ``hotel_price`` is -1 when the price element is missing;
        * ``available_rooms`` is 1 when the availability text contains 'אחד',
          0 when the text exists but there is no price, otherwise the first
          number in the text; when the element is missing it is -1 if there is
          no price and 10 otherwise (NOTE(review): 10 looks like a placeholder
          for "no scarcity notice shown" - confirm);
        * ``hotel_rating`` and ``number_of_reviews`` are 0 when their element
          is missing.

    Raises:
        AttributeError: If the name or distance element is missing.
        IndexError: If a located element contains no digits where a number is
            expected.
    """
    hotel_name = hotel.find("div", {"class": "fcab3ed991"}).text

    # The badge is either present or not; no exception handling is needed.
    preferred_badge = hotel.find("span", {"data-testid": "preferred-badge"})
    preferred_hotel = 1 if preferred_badge is not None else 0

    # One matching <span> is rendered per star.
    stars_count = len(hotel.find_all("span", {"class": "b6dc9a9e69 adc357e4f1 fe621d6382"}))

    ## source : https://stackoverflow.com/questions/4703390/how-to-extract-a-floating-number-from-a-string ##
    distance_from_center = get_distance_from_center(hotel.find("span", {"data-testid": "distance"}).text)

    price_node = hotel.find("span", {"data-testid": "price-and-discounted-price"})
    hotel_price = -1 if price_node is None else _first_int(price_node.text)

    rooms_node = hotel.find("div", {"class": "cb1f9edcd4"})
    if rooms_node is None:
        available_rooms = -1 if hotel_price == -1 else 10
    elif 'אחד' in rooms_node.text:
        available_rooms = 1
    elif hotel_price == -1:
        available_rooms = 0
    else:
        available_rooms = int(re.findall(r"\d+", rooms_node.text)[0])

    rating_node = hotel.find("div", {"class": "b5cd09854e d10a6220b4"})
    if rating_node is None:
        hotel_rating = 0
    elif rating_node.text == '10':
        hotel_rating = 10
    else:
        # https://stackoverflow.com/questions/4289331/how-to-extract-numbers-from-a-string-in-python
        hotel_rating = _first_float(rating_node.text)

    new_hotel = 1 if hotel.find("span", {"class": "e2f34d59b1"}) is not None else 0

    reviews_node = hotel.find("div", {"class": "d8eab2cf7f c90c0a70d3 db63693c62"})
    number_of_reviews = 0 if reviews_node is None else _first_int(reviews_node.text)

    return hotel_name,preferred_hotel,stars_count,distance_from_center,available_rooms,hotel_price,hotel_rating,number_of_reviews,new_hotel


def _first_int(text):
    """Return the first integer in ``text``, preferring comma-grouped numbers.

    A comma-grouped number ("1,234", "1,234,567") is returned with its
    separators removed; if none is present, the first plain run of digits is
    used. Raises IndexError when ``text`` contains no digits.
    """
    grouped = re.findall(r"\d+(?:,\d+)+", text)
    if grouped:
        return int(grouped[0].replace(",", ""))
    return int(re.findall(r"\d+", text)[0])


def _first_float(text):
    """Return the first decimal number in ``text``, or the first integer if there is none.

    Raises IndexError when ``text`` contains no digits.
    """
    decimals = re.findall(r"\d+\.\d+", text)
    if decimals:
        return float(decimals[0])
    return float(re.findall(r"\d+", text)[0])

This function collects the data from a single results page by calling `get_hotel_data` for each hotel on that page.

In [4]:
# One hotel is stored as one entry in each of ten parallel module-level lists.
# Pages are scraped from many threads at once, so the ten appends that make up a
# row must happen as a unit; otherwise rows from different threads interleave
# and the columns of the final table no longer line up.
_RESULTS_LOCK = Lock()


def get_data_from_page(current_url,offset,checkin_date,header):
    """Download one results page and append every hotel on it to the result lists.

    Args:
        current_url: Full URL of the page, including its ``&offset=`` parameter.
        offset: Offset of this page; not used here, kept for the existing
            call signature.
        checkin_date: Check-in date string stored alongside every hotel of
            this page.
        header: HTTP headers passed to ``requests.get``.

    Returns:
        None. Results are appended to the module-level ``*_list`` accumulators
        (``hotel_name_list``, ``hotel_price_list``, ... and ``check_in_list``),
        which must exist before this function is called.
    """
    # https://stackoverflow.com/questions/27652543/how-to-use-python-requests-to-fake-a-browser-visit-a-k-a-and-generate-user-agent ##
    response = requests.get(current_url,headers=header).content
    soup = BeautifulSoup(response,'html.parser')
    hotels = soup.select('div[data-testid="property-card"]')

    for hotel in hotels:
        # Parse outside the lock; only the shared-list update is serialized.
        (hotel_name, preferred_hotel, stars_count, distance_from_center,
         available_rooms, hotel_price, hotel_rating, number_of_reviews,
         new_hotel) = get_hotel_data(hotel)

        with _RESULTS_LOCK:
            hotel_name_list.append(hotel_name)
            available_rooms_list.append(available_rooms)
            hotel_price_list.append(hotel_price)
            hotel_rating_list.append(hotel_rating)
            preferred_hotel_list.append(preferred_hotel)
            number_of_reviews_list.append(number_of_reviews)
            distance_from_center_list.append(distance_from_center)
            stars_count_list.append(stars_count)
            new_hotel_list.append(new_hotel)
            check_in_list.append(checkin_date)

    return

This function determines how many result pages exist for the given check-in date, and then fetches every page in its own thread.

In [5]:
def get_data_by_date(current_url,checkin_date):
    """Scrape every results page for one check-in date, one thread per page.

    The first page is requested once to read the total number of hotels. The
    page count is derived from that (25 hotels per page), and
    ``get_data_from_page`` is then run in a separate thread for each page
    offset. The call returns after all page threads have finished.

    Args:
        current_url: Search URL for the date, without an ``&offset=`` parameter.
        checkin_date: Check-in date string, forwarded to ``get_data_from_page``.

    Returns:
        None. The page threads store their results via ``get_data_from_page``.

    Raises:
        AttributeError: If the hotel-count element is not found in the response.
        IndexError: If the hotel-count element contains no digits.
    """
    page_size = 25

    ## https://stackoverflow.com/questions/27652543/how-to-use-python-requests-to-fake-a-browser-visit-a-k-a-and-generate-user-agent ##
    # NOTE(review): the cookie below is a hard-coded browser session captured at
    # some point in the past. It should be supplied from configuration or the
    # environment instead of living in the notebook.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        # Written as two adjacent literals (joined by the parser) so that the
        # very long value forms a valid single-line string.
        'cookie': (
            '_pxhd=xbsf0AilS3%2FsN5aktJGHeB3e%2FApPn7gC0UZkzeBS77zSVfAOb-Cit-TPVhFdZsISdxeUleLAaDOLfV%2Feb4X9JA%3D%3D%3AmlkePGOdumzsURV0mqjXpGCmoAIhCmK6R1Cog5AAxrNdumJgBe7wgHqcjMS4Gq3HqkeXBLuxcPYuXqsAONpN-twu23o-KuXFSS6nRDDDbXk%3D; cors_js=1; _scid=2ad41565-bf13-4c03-bf92-64bccd96906a; _pxvid=5074dd7b-9adb-11ec-b46f-49675271437a; _pin_unauth=dWlkPVl6RXhaakV5WlRFdFlqSTVOeTAwTmpGaUxUaGpZVFF0WVdOaFpXUmxZekEwTkRCaw; FPID=FPID2.2.kPraKQ2KIBfEtDySAacKMvu1TSssQ24deUQKGybapCk%3D.1646302728; aff_ga=GA1.2.281928406.1648753197; _ga_A12345=GS1.1.1648805210.7.1.1648805210.0; _px_f394gi7Fvmc43dfg_user_id=ODlmZDdlOTAtZTczNC0xMWVjLTkxZjAtNmQ2OWI5MzMyNTQ1; bkng_sso_ses=eyJib29raW5nX2dsb2JhbCI6W3siaCI6InVzU2FTZzBJcUJIZ1kyQzhZRlp6OVptUEpkMGs4aC9kSklGVzZYazloSEEifV19; OptanonConsent=implicitConsentCountry=nonGDPR&implicitConsentDate=1667212467797; bkng_sso_session=e30; bkng_expired_hint=eyJib29raW5nX2dsb2JhbCI6W3sibG9naW5faGludCI6MzE0Njg4OTk2Mn1dfQ; xp=02UmFuZG9tSVYkc2RlIyh9YbxZGyl9Y5%2BPQFPglPqiINJId29VBxjPybcU00c7WPkeYqmKOuslTlk%3D; _pxhd=AVXMM806WAhQpBFogWFfIBrcxz6ZsyFy7mLr1Ty9FaFmPbljgxDESUfHUJqpm-L8vFZLU9pjCNsJtgMoosNifw%253D%253D%253Ais1OJldP84dI57KjI36A72Lqphp97JG%252FcUJa569TlfsbFs9wCRfRIxC4TMjYfyNscTISGYh3VpqIARrFjxu0XcdXaE1VDqxM6JiGm7VO5Xc%253D; BJS=-; _gid=GA1.2.249689016.1676489102; pxcts=721c4bc9-ad66-11ed-a523-764973546669; _gcl_au=1.1.1244434480.1676489104; bkng_prue=1; _schn=_axn3lw; _sctr=1|1676412000000; px_init=0; 11_srd=%7B%22features%22%3A%5B%7B%22id%22%3A9%7D%5D%2C%22score%22%3A3%2C%22detected%22%3Afalse%7D; _uetsid=72a86bc0ad6611eda6ece9e65fc29494; _uetvid=e1cc3ae0b8c211eb86fb4b3fd084dab8; _ga=GA1.1.1361045788.1646302728; _ga_FPD6YLJCJ7=GS1.1.1676506269.55.1.1676507713.0.0.0; _derived_epik=dj0yJnU9c2FuSWo3b2wwX2FkQ0xpUXMxR25haVdKMEJVVkdndFQmbj1DbmJrQk03emNvQ2I0Mlo5OW9vZzd3Jm09MSZ0PUFBQUFBR1B0ZWtFJnJtPTEmcnQ9QUFBQUFHUHRla0Umc3A9NQ; '
            'bkng=11UmFuZG9tSVYkc2RlIyh9Yaa29%2F3xUOLbmlZgMctCRAZJFbtYv%2FHNakS7Yb%2BKYGT2cCFsZgqr1VeRZRUwFGG0ujoZK8cwZaxR7fF1mvZ4a9tcy9taSL%2FDcJtuNNCtEulAI%2BPFmhtJkf%2F5K96%2BwgPBxr3hidOHtvcGGBzPeNuKuU0hrFHBJpvBPKXIwbEKpf2pg8jmiJz8AgtcIgWEKksLVyPh0nmBy3BPJfnL3IVmuiODRC6gT4Szdw%3D%3D; lastSeen=0; _px3=26f6642d4846754c3daa5d89a922d8c5f268f7308692a4b4fc027732800461ec:06yQNYpR0HHRWEtnQi1KW8uTLcvvnX1gs8tNF4hnNqH2yTEsInf5DN0PmV9jbcoBSif1f10/tfDNg7BHvcqMPA==:1000:9chl59KgPAoKXdKwUjdXuxHk6ZdsRrJKghF+r3PhoCrBtRJMEQonBUO5DSYk92AZWj8aU7oImNVgwVC5AWObdEAlAfEQb+DxfADr0NrEDEhNXkFQ1sQkVKsZ4Sg8fXR/MMvUaji8iRElwYRlR7O5qIp7uWfQKmYF1EYjLPvFW+eVEeVKxzsuLJiMQTDpTdcGqdKPE6bkHLoNUXuoepMmjA==; _pxde=e9020222818cc6c6ebb62d6593522b9474b92ec9afeba335643c782e6bce756f:eyJ0aW1lc3RhbXAiOjE2NzY1NDI4MTczMzAsImZfa2IiOjAsImlwY19pZCI6W119'
        ),
        'accept-language': 'he-IL,he;q=0.9,en-US;q=0.8,en;q=0.7',
    }

    # A single request with the browser-like headers is enough to read the
    # total hotel count from the first page.
    first_page_url = current_url + "&offset=" + str(0)
    response = requests.get(first_page_url, headers=header)
    soup = BeautifulSoup(response.content,'html.parser')

    # Get the number of hotels to know how many pages we have in the current day.
    # A thousands-grouped count ("1,234") is read in full; anything else falls
    # back to the first run of digits.
    count_text = soup.find('div', {'class': 'efdb2b543b'}).text
    number_of_hotels = int(re.findall(r"\d{1,3}(?:,\d{3})+|\d+", count_text)[0].replace(",", ""))
    number_of_pages = (number_of_hotels + page_size - 1) // page_size  # ceiling division

    # https://www.pythontutorial.net/python-concurrency/python-threading/
    # One thread per page offset: 0, 25, 50, ...
    threads = [
        Thread(
            target=get_data_from_page,
            args=(current_url + "&offset=" + str(page_offset), page_offset, checkin_date, header),
        )
        for page_offset in range(0, number_of_pages * page_size, page_size)
    ]

    for thread in threads:
        thread.start()

    # Wait for every page of this date before returning.
    for thread in threads:
        thread.join()

    return

This function builds the list of check-in dates for the requested number of days (starting from today), and then collects the data for each date in its own thread.

In [6]:
def crawling(days):
    """Scrape ``days`` consecutive check-in dates starting today and return the results.

    For every date a one-night search URL (check-out is the following day) is
    built and handed to ``get_data_by_date`` in its own thread. Thread starts
    are spaced 0.1 s apart. After all threads finish, the module-level
    ``*_list`` accumulators are combined into a DataFrame.

    Args:
        days: Number of check-in dates to scrape, beginning with today.

    Returns:
        pandas.DataFrame: One row per hotel and check-in date, with exact
        duplicate rows removed, sorted by ``check_in`` and re-indexed from 0.
    """
    ## https://stackoverflow.com/questions/32490629/getting-todays-date-in-yyyy-mm-dd-in-python
    first_day = datetime.date.today()
    last_day = first_day + datetime.timedelta(days = (days - 1))

    ## Collecting all of the url's from first day until the destination day
    # https://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
    url_list = []
    for single_date in pd.date_range(first_day, last_day):
        checkin_date = single_date.strftime("%Y-%m-%d")
        checkout_date =  str(single_date.date() +  datetime.timedelta(days=1))
        url = 'https://www.booking.com/searchresults.he.html?label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4ArnTtZ8GwAIB0gIkNTRlM2Y0YzUtODA4Ny00MDUyLWFiN2ItNGJmOGQ5YTg2NDJm2AIF4AIB&sid=f5a19a8a686a469519274b1b081f63b3&aid=304142&ss=%D7%90%D7%99%D7%9C%D7%AA%2C+%D7%99%D7%A9%D7%A8%D7%90%D7%9C&efdco=1&lang=he&src=index&checkin='+checkin_date+'&checkout='+checkout_date+'&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&nflt=ht_id%3D204&property_type=hotel'
        url_list.append([url,checkin_date])

    # https://www.pythontutorial.net/python-concurrency/python-threading/
    threads = [
        Thread(target=get_data_by_date, args=(url, checkin_date))
        for url, checkin_date in url_list
    ]

    start_time = datetime.datetime.now()
    for thread_number, thread in enumerate(threads, start=1):
        print("Starting thread number "+ str(thread_number) )
        thread.start()
        time.sleep(0.1)  # stagger the start of the per-date threads

    for thread_number, thread in enumerate(threads, start=1):
        thread.join()
        print("Finish thread number  "+ str(thread_number))

    end_time = datetime.datetime.now()
    # total_seconds() also counts whole days, which `.seconds` silently drops.
    total_time = int((end_time - start_time).total_seconds())

    df = pd.DataFrame({
            "check_in" : check_in_list,
            "hotel_name" : hotel_name_list,
            "available_rooms" : available_rooms_list,
            "hotel_price" : hotel_price_list,
            "hotel_rating" : hotel_rating_list,
            "preferred_hotel" : preferred_hotel_list,
            "number_of_reviews" : number_of_reviews_list,
            "distance_from_center" : distance_from_center_list,
            "stars_count" : stars_count_list,
            "new_hotel" : new_hotel_list,
            })

    # De-duplicate, then sort and re-index in a single chain so the sorted
    # frame is the one that is returned. The stable sort keeps the scrape
    # order within each check-in date.
    df2 = (
        df.drop_duplicates()
          .sort_values('check_in', kind='mergesort')
          .reset_index(drop=True)
    )

    print(f'Crawling has finished. Total time for crawling {days} days: ' + str(total_time) + "seconds.")

    return df2

In [7]:
if __name__ == "__main__":

    # Module-level accumulators filled by the scraping threads: one entry per
    # scraped hotel in each list, combined into a DataFrame by crawling().
    check_in_list = []
    hotel_name_list = []
    available_rooms_list = []
    hotel_price_list = []
    hotel_rating_list = []
    preferred_hotel_list = []
    number_of_reviews_list = []
    distance_from_center_list = []
    stars_count_list = []
    new_hotel_list = []

    # Crawl for the needed amount of days starting from today, and save all of the information in our empty lists above.
    df = crawling(365)

    # Make sure the target folder exists so a long crawl is not lost to a
    # missing-directory error at the very last step.
    output_path = './CsvFolder/Hotels365Day.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path)

Starting thread number 1
Starting thread number 2
Starting thread number 3
Starting thread number 4
Starting thread number 5
Starting thread number 6
Starting thread number 7
Starting thread number 8
Starting thread number 9
Starting thread number 10
Starting thread number 11
Starting thread number 12
Starting thread number 13
Starting thread number 14
Starting thread number 15
Starting thread number 16
Starting thread number 17
Starting thread number 18
Starting thread number 19
Starting thread number 20
Starting thread number 21
Starting thread number 22
Starting thread number 23
Starting thread number 24
Starting thread number 25
Starting thread number 26
Starting thread number 27
Starting thread number 28
Starting thread number 29
Starting thread number 30
Starting thread number 31
Starting thread number 32
Starting thread number 33
Starting thread number 34
Starting thread number 35
Starting thread number 36
Starting thread number 37
Starting thread number 38
Starting thread numbe