In [2]:
from bs4 import BeautifulSoup
import requests

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.select import Select

import time
import datetime
import numpy as np

import pandas as pd


In [3]:
# Build the Chrome options and start the shared Selenium driver.
# NOTE(review): "detach" is a ChromeDriver experimental option; presumably it is
# set so the browser window stays open after the driver session ends - confirm.
options = Options()
options.add_experimental_option("detach", True)
# `driver` is a module-level global: open_url() and get_hotels_list() read it directly.
# The chromedriver binary is resolved through webdriver_manager's install().
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


In [34]:
# Column schema shared by the accumulated results frame and the per-search
# scratch frame; the order here is the column order of the exported data.
HOTEL_COLUMNS = [
    'Snapshot',
    'Index',
    'Hotel Name',
    'TTT',
    'LOS',
    'Grade',
    'Num of Reviews',
    'Curr Price',
    'Original Price',
    'Percentage of discount',
    'Distance from center',
    'Type of room',
    'Location grade',
    'Is refundable',
    'Late payment',
    'Breakfast included',
    'Option Member',
]

# Empty frame that accumulates every scraped row across all searches.
hotels_df = pd.DataFrame(columns=HOTEL_COLUMNS)

# Independent empty copy used as the scratch frame for a single search.
temp_df = hotels_df.copy()
 

In [37]:
def _extract_hotel_row(hotel, index, snapshot, ttt, los):
    """Build one complete output row (column name -> value) from a hotel card.

    Raises AttributeError when the card has no <h4> name element; the caller
    treats that as "not a usable hotel card" and skips it.
    """
    # Looked up first so a card without a name fails before any other work.
    hotel_name = hotel.find('h4').text

    grade_element = hotel.find('span', class_='uitk-spacing uitk-spacing-padding-inlineend-half uitk-layout-flex-item')
    reviews_elements = hotel.find_all('span', class_="uitk-text uitk-type-300 uitk-text-default-theme")

    # Two hidden price elements -> [original, current]; one -> current only.
    prices_list = hotel.find_all('div', class_="uitk-text uitk-type-300 uitk-text-default-theme is-visually-hidden")
    if len(prices_list) > 1:
        original_price, curr_price = prices_list[0].text, prices_list[1].text
    elif len(prices_list) == 1:
        original_price, curr_price = np.nan, prices_list[0].text
    else:
        # No price shown on the card: keep the row, leave both prices missing.
        original_price = curr_price = np.nan

    # The same container holds both the refund and the pay-later texts, so it
    # is looked up once; a missing container means neither flag is set.
    policy_container = hotel.find('div', class_="uitk-layout-flex uitk-layout-flex-flex-direction-column uitk-layout-flex-gap-three")
    policy_spans = policy_container.find_all('span') if policy_container is not None else []
    is_refundable = len(policy_spans) > 0 and 'refundable' in policy_spans[0].text
    late_payment = len(policy_spans) > 0 and 'later' in policy_spans[-1].text

    included_element = hotel.find('div', class_="uitk-text truncate uitk-type-200 uitk-text-default-theme")
    member_btn = hotel.find('a', class_ = 'uitk-button uitk-button-small uitk-button-has-text uitk-button-as-link uitk-layout-flex-item-align-self-flex-end uitk-layout-flex-item uitk-button-primary-alt')

    return {
        'Snapshot': snapshot,
        'Index': index,
        'Hotel Name': hotel_name,
        'TTT': ttt,
        'LOS': los,
        'Grade': grade_element.text if grade_element is not None else np.nan,
        'Num of Reviews': reviews_elements[-1].text if len(reviews_elements) > 0 else np.nan,
        'Curr Price': curr_price,
        'Original Price': original_price,
        # The next four columns are placeholders: nothing is scraped for them.
        'Percentage of discount': 0,
        'Distance from center': 0,
        'Type of room': 0,
        'Location grade': 0,
        'Is refundable': is_refundable,
        'Late payment': late_payment,
        'Breakfast included': included_element.text if included_element is not None else np.nan,
        'Option Member': bool(member_btn),
    }


def hotels_to_df(hotels_list,df,start_date, end_date, max_hotels=120):
    """Write one row per hotel card into ``df`` (in place) and return ``df``.

    Parameters
    ----------
    hotels_list : list
        Hotel card elements exposing ``find`` / ``find_all`` (as returned by
        ``get_hotels_list``).
    df : pandas.DataFrame
        Frame to fill. Row label ``i`` receives the card at position ``i``.
    start_date, end_date : datetime.date
        Check-in and check-out dates of the search. ``TTT`` is the number of
        days from today to ``start_date``; ``LOS`` is the stay length in days.
    max_hotels : int or None, default 120
        Upper bound on the number of cards processed; ``None`` processes all.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Each row is assembled completely before anything is written, so a card
    that fails to parse is reported and skipped without leaving a partially
    filled row behind. Searches that return fewer than ``max_hotels`` cards
    no longer create placeholder rows for the missing positions.
    """
    # Computed once so every row of this search shares the same snapshot.
    snapshot = pd.Timestamp.today()
    ttt = (start_date - datetime.date.today()).days
    los = (end_date - start_date).days

    for i, hotel in enumerate(hotels_list[:max_hotels]):
        try:
            row = _extract_hotel_row(hotel, i, snapshot, ttt, los)
        except Exception as err:
            # Best-effort scrape: report the card and carry on with the rest.
            print('error at hotel number: ' , i, '-', repr(err))
            continue

        for column, value in row.items():
            df.at[i, column] = value

    return df
    
    

In [5]:
def open_url(url, scroll_count=3, pause_seconds=5):
    """Load ``url`` in the shared driver and expand the result list.

    After loading the page, this repeats ``scroll_count`` times: scroll to the
    bottom, click the "show more results" button if it is present, then wait
    ``pause_seconds`` seconds.

    Parameters
    ----------
    url : str
        Search-results page to open.
    scroll_count : int, default 3
        Number of scroll/click rounds.
    pause_seconds : float, default 5
        Sleep after each round.

    Notes
    -----
    The button is looked up with ``find_elements``, which returns an empty
    list when nothing matches. ``find_element`` raises instead, which made the
    "Button not found" branch unreachable and aborted the whole search when
    the button was absent.
    """
    driver.get(url)
    for _ in range(scroll_count):
        driver.execute_script("window.scrollTo(1,5000000)")
        show_more_buttons = driver.find_elements("xpath","//button[@data-stid = 'show-more-results']")
        if show_more_buttons:
            # JavaScript click on the first match.
            driver.execute_script("arguments[0].click();", show_more_buttons[0])
        else:
            print("Button not found")
        time.sleep(pause_seconds)

In [6]:
def get_hotels_list():
    """Return the hotel card <div> elements found in the driver's current page.

    The page source held by the shared driver is parsed with BeautifulSoup's
    'html.parser', and every <div> carrying the card class string is returned.
    """
    card_class = 'uitk-card uitk-card-roundcorner-all uitk-card-has-primary-theme'
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup.find_all('div', class_=card_class)

In [7]:
def get_data_from_dates(start_date , end_date, temp_df, hotels_df):
    """Scrape one search (check-in/check-out pair) and append it to ``hotels_df``.

    Builds the search URL for the given dates, opens it, collects the hotel
    cards, fills ``temp_df`` from them and returns a new frame made of
    ``hotels_df`` followed by those rows, re-indexed from 0.
    """
    url_template = (
        "https://euro.expedia.net/Hotel-Search?adults=2"
        "&d1={start_date}&d2={end_date}"
        "&destination=New%20York%20%28and%20vicinity%29%2C%20New%20York%2C%20United%20States%20of%20America"
        "&endDate={end_date}"
        "&latLong=40.75668%2C-73.98647"
        "&regionId=178293&rooms=1&semdtl=&sort=RECOMMENDED"
        "&startDate={start_date}"
        "&theme=&useRewards=false&userIntent="
    )
    search_url = url_template.format(start_date=start_date, end_date=end_date)

    # The page must be loaded and expanded before the cards are read from it.
    open_url(search_url)
    scraped_df = hotels_to_df(get_hotels_list(), temp_df, start_date, end_date)

    return pd.concat([hotels_df, scraped_df], ignore_index=True, axis=0)
    

In [None]:
# Search grid: check-in 1..MAX_TTT_DAYS days from today, stays of
# 1..MAX_LOS_NIGHTS nights for each check-in date.
MAX_TTT_DAYS = 30
MAX_LOS_NIGHTS = 5

for ttt_days in range(1, MAX_TTT_DAYS + 1):
    start_date = datetime.date.today() + datetime.timedelta(days = ttt_days)
    for los_nights in range(1, MAX_LOS_NIGHTS + 1):
        end_date = start_date + datetime.timedelta(days = los_nights)
        try:
            hotels_df = get_data_from_dates(start_date, end_date, temp_df, hotels_df)
        except Exception as err:
            # Best-effort: one failed search must not stop the remaining ones.
            # `Exception` (not a bare except) so a kernel interrupt still stops
            # the loop instead of only skipping the current date pair.
            print("error at dates: {} - {} ({!r})".format(start_date , end_date, err))
        finally:
            # Always empty the scratch frame, even after a failure, so leftover
            # rows cannot leak into the next search.
            temp_df.drop(temp_df.index, inplace=True)



In [None]:
# Export the accumulated results as UTF-8 CSV with a header row and without the index.
# NOTE(review): the destination is a hard-coded absolute Windows path, so this cell
# only works on that machine - consider a configurable, relative output path.
hotels_df.to_csv(r'C:\Users\yarin\Desktop\hotels.csv',header=True, index=False, encoding='utf-8')


In [None]:
# Keep an independent copy of the scraped results under a separate name.
last_output_df = hotels_df.copy(deep=True)