In [54]:
import json
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

def scrape_booking_hotels(url, hotels_with_pool, hotels_with_restaurant, hotels_with_parking, hotels_with_wheelchair, hotels_with_view):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    hotels = []

    hotel_cards = soup.find_all("div", class_="c82435a4b8 a178069f51 a6ae3c2b40 a18aeea94d d794b7a0f7 f53e278e95 c6710787a4")
    for card in hotel_cards:
        hotel_name = card.find("div", class_="f6431b446c a15b38c233").text.strip()
        hotel_location = card.find("span", class_="aee5343fdb def9bc142a").text.strip()
        
        # hotel score
        hotel_score_div = card.find('div', class_='a3b8729ab1')
        hotel_score = hotel_score_div.text.strip() if hotel_score_div else None 
        
        # hotel reviews
        hotel_reviews_div = card.find('div', class_='abf093bdfe f45d8e4c32 d935416c47')
        hotel_reviews = hotel_reviews_div.text.strip() if hotel_reviews_div else None     
        
        # Handle hotel price
        hotel_price = card.find('span', class_='f6431b446c fbfd7c1165 e84eb96b1f').text.strip() if card.find('span', class_='f6431b446c fbfd7c1165 e84eb96b1f').text.strip() else None
        
        beds_info = card.find('h4', class_='abf093bdfe e8f7c070a7').text.strip() if card.find('div', class_='abf093bdfe') else None
        rating_div = card.find("div", {"data-testid": "rating-stars", "aria-hidden": "true"})
        rating_span_count = len(rating_div.find_all("span", class_="fcd9eec8fb d31eda6efc c25361c37f")) if rating_div else None

        # Check if the hotel is in the list of hotels with facilities
        pool = 'yes' if hotel_name in hotels_with_pool else 'no'
        restaurant = 'yes' if hotel_name in hotels_with_restaurant else 'no'
        parking = 'yes' if hotel_name in hotels_with_parking else 'no'
        wheelchair = 'yes' if hotel_name in hotels_with_wheelchair else 'no'
        view = 'yes' if hotel_name in hotels_with_view else 'no'


        hotels.append({"Hotel Name": hotel_name, "Address": hotel_location, "Score": hotel_score, "Number of reviews": hotel_reviews, "Price": hotel_price, "Room type": beds_info, "Star rating": rating_span_count, "Room view": view, "Swimming pool": pool, "Restaurant": restaurant, "Parking": parking, "Wheelchair accessibility": wheelchair})

    return hotels

def scrape_and_save_hotels(search_url, hotels_with_pool, hotels_with_restaurant, hotels_with_parking, hotels_with_wheelchair, hotels_with_view):
    all_hotels = []

    try:
        for i in range(1, 28):
            hotels_on_page = scrape_booking_hotels(search_url + "&offset=" + str(i * 25), hotels_with_pool, hotels_with_restaurant, hotels_with_parking, hotels_with_wheelchair, hotels_with_view)
            all_hotels.extend(hotels_on_page)
    except KeyboardInterrupt:
        print("Execution interrupted by user.")

    return all_hotels

def save_to_csv(data, file_path):
    df = pd.DataFrame(data)
    df.to_csv(file_path, index=False)
    print("Data saved to:", file_path)

if __name__ == "__main__":
    # Load hotels with swimming pool from the external Excel file
    hotels_with_pool_df = pd.read_excel('hotels_with_pool.xlsx')
    hotels_with_pool = hotels_with_pool_df['names'].tolist()

    # Load hotels with Restaurant from the external Excel file
    hotels_with_restaurant_df = pd.read_excel('hotels_with_restaurant.xlsx')
    hotels_with_restaurant = hotels_with_restaurant_df['names'].tolist()
 
    # Load hotels with parking from the external Excel file
    hotels_with_parking_df = pd.read_excel('hotels_with_parking.xlsx')
    hotels_with_parking = hotels_with_parking_df['names'].tolist()
    
    # Load hotels with wheel chair from the external Excel file
    hotels_with_wheelchair_df = pd.read_excel('hotels_with_wheelchair.xlsx')
    hotels_with_wheelchair = hotels_with_wheelchair_df['names'].tolist()  
    
    # Load hotels with view from the external Excel file
    hotels_with_view_df = pd.read_excel('hotels_with_view.xlsx')
    hotels_with_view = hotels_with_view_df['names'].tolist()   
    
    search_url = 'https://www.booking.com/searchresults.en-gb.html?label=en-sa-booking-desktop-UmcGSHJHH5d7CxRJ6pHCYgS652796015661%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atikwd-65526620%3Alp1012088%3Ali%3Adec%3Adm&sid=d7c87de715469e394879757ca2dd2e2d&aid=2311236&ss=Saudi+Arabia&ssne=Saudi+Arabia&ssne_untouched=Saudi+Arabia&efdco=1&lang=en-gb&src=searchresults&dest_id=186&dest_type=country&checkin=2024-06-03&checkout=2024-06-08&ltfd=1%3A5%3A5-2024_6-2024_7-2024%3A1%3A&group_adults=2&no_rooms=1&group_children=0&nflt=ht_id%3D204'
    
    hotels_data = scrape_and_save_hotels(search_url, hotels_with_pool, hotels_with_restaurant, hotels_with_parking, hotels_with_wheelchair, hotels_with_view)

    csv_file_path = "Saudi_Arabia_hotels.csv"
    save_to_csv(hotels_data, csv_file_path)
    

Data saved to: Saudi_Arabia_hotels.csv


In [55]:
    # To check if hotel is relevant with pools
    KSA_hotels = pd.read_csv('Saudi_Arabia_hotels.csv')
    pool_hotel = pd.read_excel('hotels_with_pool.xlsx')

    hotels = ['Hotel Name', 'Score', 'Star rating']
    
    pool_hotel.rename(columns={'names': 'Hotel Name'}, inplace=True)

    pool_attributes_hotel_ratings = KSA_hotels[hotels]
    hotel_pool = pd.merge(pool_attributes_hotel_ratings, pool_hotel, on='Hotel Name', how='inner')

    hotel_pool = hotel_pool.drop_duplicates()
    hotel_pool.to_csv('hotel_pool.csv', index=False)
    print(hotel_pool)

                                             Hotel Name          Score  \
0                                         Novotel Jazan  8.7Scored 8.7   
208                                      Braira Al Ahsa  8.9Scored 8.9   
418                                         Doolv Hotel  8.1Scored 8.1   
642                            Central Park Hotel Bisha  7.8Scored 7.8   
850                                  Holiday Villa Hail  8.1Scored 8.1   
...                                                 ...            ...   
1933           InterContinental Al Khobar, an IHG Hotel  8.0Scored 8.0   
1934                                      Emerald Hotel  8.2Scored 8.2   
1935                        Spectrums Al Salamah Jeddah  8.1Scored 8.1   
1937                                         Ibis Yanbu  7.3Scored 7.3   
1938  Riyadh Diplomatic Quarter - Marriott Executive...  8.2Scored 8.2   

      Star rating  
0             5.0  
208           4.0  
418           3.0  
642           3.0  
850        

In [56]:
    # To check if hotel is relevant with restaurants
    KSA_hotels = pd.read_csv('Saudi_Arabia_hotels.csv')
    restaurant_hotel = pd.read_excel('hotels_with_restaurant.xlsx')

    hotels = ['Hotel Name', 'Score', 'Star rating']
    
    restaurant_hotel.rename(columns={'names': 'Hotel Name'}, inplace=True)

    restaurant_hotel_ratings = KSA_hotels[hotels]
    restaurant_hotel = pd.merge(restaurant_hotel_ratings, restaurant_hotel, on='Hotel Name', how='inner')

    restaurant_hotel = restaurant_hotel.drop_duplicates()
    restaurant_hotel.to_csv('restaurant_hotel.csv', index=False)
    print(restaurant_hotel)


                                             Hotel Name          Score  \
0                                         Novotel Jazan  8.7Scored 8.7   
192                                      Braira Al Ahsa  8.9Scored 8.9   
357                            Caravan by Habitas AlUla  9.2Scored 9.2   
537                                    Sendan Residence  8.0Scored 8.0   
663                            Central Park Hotel Bisha  7.8Scored 7.8   
...                                                 ...            ...   
2048                        Spectrums Al Salamah Jeddah  8.1Scored 8.1   
2050                                   Manazil Al Dhayf  7.9Scored 7.9   
2051                          Warwick Riyadh Al Wezarat  8.8Scored 8.8   
2052                                         Ibis Yanbu  7.3Scored 7.3   
2053  Riyadh Diplomatic Quarter - Marriott Executive...  8.2Scored 8.2   

      Star rating  
0             5.0  
192           4.0  
357           5.0  
537           NaN  
663        

In [57]:
    # To check if hotel is relevant with parking
    KSA_hotels = pd.read_csv('Saudi_Arabia_hotels.csv')
    parking_hotel = pd.read_excel('hotels_with_parking.xlsx')

    hotels = ['Hotel Name', 'Score', 'Star rating']
    
    parking_hotel.rename(columns={'names': 'Hotel Name'}, inplace=True)

    parking_hotel_ratings = KSA_hotels[hotels]
    parking_hotel = pd.merge(parking_hotel_ratings, parking_hotel, on='Hotel Name', how='inner')

    parking_hotel = parking_hotel.drop_duplicates()
    parking_hotel.to_csv('parking_hotel.csv', index=False)
    print(parking_hotel)

                                             Hotel Name          Score  \
0                      فندق ارجان بارك Arjan Park Hotel  9.1Scored 9.1   
120                                       Novotel Jazan  8.7Scored 8.7   
280          Mirada Gold Jizan - Resort & Private Pools  8.4Scored 8.4   
440                     فندق حصن الأبلق - Alablaq Hotel  8.4Scored 8.4   
575                              Lily Hotel Suite Hofuf  8.4Scored 8.4   
...                                                 ...            ...   
3109  Riyadh Diplomatic Quarter - Marriott Executive...  8.2Scored 8.2   
3111                        Sama Al Qasr Al Muhammadiah  7.6Scored 7.6   
3113                        فيفيندا أحلى مكان خميس مشيط  6.7Scored 6.7   
3115                    فندق ميرا بارك -Mera Park Hotel  6.9Scored 6.9   
3117                     Hyatt Jeddah Continental Hotel  7.9Scored 7.9   

      Star rating  
0             NaN  
120           5.0  
280           5.0  
440           NaN  
575        

In [58]:
    # To check if hotel is relevant with wheelchair
    KSA_hotels = pd.read_csv('Saudi_Arabia_hotels.csv')
    wheelchair_hotel = pd.read_excel('hotels_with_wheelchair.xlsx')

    hotels = ['Hotel Name', 'Score', 'Star rating']
    
    wheelchair_hotel.rename(columns={'names': 'Hotel Name'}, inplace=True)

    wheelchair_hotel_ratings = KSA_hotels[hotels]
    wheelchair_hotel = pd.merge(wheelchair_hotel_ratings, wheelchair_hotel, on='Hotel Name', how='inner')

    wheelchair_hotel = wheelchair_hotel.drop_duplicates()
    wheelchair_hotel.to_csv('wheelchair_hotel.csv', index=False)
    print(wheelchair_hotel)

                                             Hotel Name          Score  \
0                      فندق ارجان بارك Arjan Park Hotel  9.1Scored 9.1   
210                                       Novotel Jazan  8.7Scored 8.7   
450          Mirada Gold Jizan - Resort & Private Pools  8.4Scored 8.4   
706                              Lily Hotel Suite Hofuf  8.4Scored 8.4   
838                                         Doolv Hotel  8.1Scored 8.1   
1078                           Central Park Hotel Bisha  7.8Scored 7.8   
1302                                 Holiday Villa Hail  8.1Scored 8.1   
1484                                           WA Hotel  8.4Scored 8.4   
1666                            ibis Jeddah City Center  8.4Scored 8.4   
1922  La Cordia Hotel Apartment لاكورديا للشقق الفندقية  8.6Scored 8.6   
1935        Park Inn by Radisson Jubail Industrial City  8.3Scored 8.3   
2061             فندق ضيوف المقام للغرف والشقق المفروشة  8.1Scored 8.1   
2187         Ramada Hotel & Suites by 

In [59]:
    # To check if hotel is relevant with view
    KSA_hotels = pd.read_csv('Saudi_Arabia_hotels.csv')
    view_hotel = pd.read_excel('hotels_with_view.xlsx')

    hotels = ['Hotel Name', 'Score', 'Star rating']
    
    view_hotel.rename(columns={'names': 'Hotel Name'}, inplace=True)

    view_hotel_ratings = KSA_hotels[hotels]
    view_hotel = pd.merge(view_hotel_ratings, view_hotel, on='Hotel Name', how='inner')

    view_hotel = view_hotel.drop_duplicates()
    view_hotel.to_csv('view_hotel.csv', index=False)
    print(view_hotel)

                                  Hotel Name          Score  Star rating
0           فندق ارجان بارك Arjan Park Hotel  9.1Scored 9.1          NaN
180                            Novotel Jazan  8.7Scored 8.7          5.0
388                           Braira Al Ahsa  8.9Scored 8.9          4.0
568                 Caravan by Habitas AlUla  9.2Scored 9.2          5.0
748                 Central Park Hotel Bisha  7.8Scored 7.8          3.0
...                                      ...            ...          ...
2102                Lavent Park Hotel Suites  5.5Scored 5.5          3.0
2104                          Orans Suites 4  7.8Scored 7.8          NaN
2106  Msharef almoden hotel فندق مشارف المدن  7.9Scored 7.9          NaN
2108                              Kadi Hotel  7.7Scored 7.7          3.0
2109                JW Marriott Hotel Riyadh  7.8Scored 7.8          5.0

[91 rows x 3 columns]


In [60]:
# We noticed that the scores and star rating are not correlated to each other. Star ratings have many missing values. 
# Scores are the ratings given by the users themselves, therefore we chose to keep the scores and remove ratings

column_to_drop = 'Star rating' 
KSA_hotels.drop(columns=[column_to_drop], inplace=True)

KSA_hotels.to_csv('mod_KSA_hotels.csv', index=False)

In [61]:
# Preprocessing 'Score', 'Number of reviews', 'Price' column into float and integer

KSA_hotels = pd.read_csv('mod_KSA_hotels.csv')

# Preprocessing 'Score' column into float 
KSA_hotels['Score'] = KSA_hotels['Score'].str.slice(9).astype(float)

# Preprocessing 'Number of reviews' column into int
KSA_hotels['Number of reviews'] = KSA_hotels['Number of reviews'].str.slice(stop=-7).str.replace(',','').fillna(0).astype(int)

# Preprocessing 'Price' column into int
KSA_hotels['Price'] = KSA_hotels['Price'].str.slice(3).str.replace(',','').fillna(0).astype(int)
print(KSA_hotels)


                                      Hotel Name          Address  Score  \
0               فندق ارجان بارك Arjan Park Hotel  Wadi Al Dawasir    9.1   
1                                  Novotel Jazan            Jazan    8.7   
2     Mirada Gold Jizan - Resort & Private Pools            Jazan    8.4   
3                فندق حصن الأبلق - Alablaq Hotel           Taymāʼ    8.4   
4                         Lily Hotel Suite Hofuf         Al Hofuf    8.4   
..                                           ...              ...    ...   
670                        Aseel Hotel Apartment            Jazan    7.6   
671  Park Inn by Radisson Jubail Industrial City        Al Jubail    8.3   
672       فندق ضيوف المقام للغرف والشقق المفروشة           Makkah    8.1   
673                      Enala Hotel - Al Khobar        Al Khobar    8.8   
674                            Our Habitas AlUla            AlUla    9.3   

     Number of reviews  Price                          Room type Room view  \
0        

In [62]:
# Preprocessing 'Room view', 'Swimming pool', 'Restaurant', 'Parking', 'Wheelchair accessibility' by encoding them into 0s and 1s

KSA_hotels['Room view'] = KSA_hotels['Room view'].replace({'yes': 1, 'no': 0})
KSA_hotels['Swimming pool'] = KSA_hotels['Swimming pool'].replace({'yes': 1, 'no': 0})
KSA_hotels['Restaurant'] = KSA_hotels['Restaurant'].replace({'yes': 1, 'no': 0})
KSA_hotels['Parking'] = KSA_hotels['Parking'].replace({'yes': 1, 'no': 0})
KSA_hotels['Wheelchair accessibility'] = KSA_hotels['Wheelchair accessibility'].replace({'yes': 1, 'no': 0})

print(KSA_hotels)

                                      Hotel Name          Address  Score  \
0               فندق ارجان بارك Arjan Park Hotel  Wadi Al Dawasir    9.1   
1                                  Novotel Jazan            Jazan    8.7   
2     Mirada Gold Jizan - Resort & Private Pools            Jazan    8.4   
3                فندق حصن الأبلق - Alablaq Hotel           Taymāʼ    8.4   
4                         Lily Hotel Suite Hofuf         Al Hofuf    8.4   
..                                           ...              ...    ...   
670                        Aseel Hotel Apartment            Jazan    7.6   
671  Park Inn by Radisson Jubail Industrial City        Al Jubail    8.3   
672       فندق ضيوف المقام للغرف والشقق المفروشة           Makkah    8.1   
673                      Enala Hotel - Al Khobar        Al Khobar    8.8   
674                            Our Habitas AlUla            AlUla    9.3   

     Number of reviews  Price                          Room type  Room view  \
0       