In [82]:
from bs4 import BeautifulSoup
import requests
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.select import Select

import time
import datetime
import numpy as np

import pandas as pd


In [53]:
# Chrome is started with ChromeDriver's "detach" option enabled so the browser
# window is not torn down together with the driver process.
options = Options()
options.add_experimental_option("detach", True)

# webdriver_manager downloads (or reuses) a chromedriver binary and returns its path.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)


[WDM] - Downloading: 100%|██████████| 6.78M/6.78M [00:01<00:00, 4.62MB/s]


In [55]:
# Column schema shared by the accumulated results frame and the per-search
# scratch frame.
hotel_columns = [
    'Snapshot',
    'Index',
    'Hotel Name',
    'TTT',
    'LOS',
    'Grade',
    'Num of Reviews',
    'Curr Price',
    'Original Price',
    'Percentage of discount',
    'Distance from center',
    'Type of room',
    'Location grade',
    'Is refundable',
    'Late payment',
    'Breakfast included',
    'Option Member',
]

# Empty frame that accumulates the rows of every search.
hotels_df = pd.DataFrame(columns=hotel_columns)

# Independent empty copy used as a scratch frame for a single search.
temp_df = hotels_df.copy()
 

In [56]:
def _parse_hotel_card(card, position, start_date, end_date):
    """Extract one result card into a ``{column name: value}`` dict.

    Optional elements (grade, reviews, prices, policy texts, breakfast label,
    member button) fall back to ``np.nan`` / ``False`` when they are absent.
    A card without an ``h4`` title raises ``AttributeError`` so the caller can
    skip it instead of storing a half-filled row.
    """
    row = {
        'Snapshot': pd.Timestamp.today(),
        'Index': position,
        'Hotel Name': card.find('h4').text,
        # TTT: days from today until check-in; LOS: nights between the two dates.
        'TTT': (start_date - datetime.date.today()).days,
        'LOS': (end_date - start_date).days,
    }

    grade_element = card.find('span', class_='uitk-spacing uitk-spacing-padding-inlineend-half uitk-layout-flex-item')
    row['Grade'] = grade_element.text if grade_element is not None else np.nan

    review_elements = card.find_all('span', class_="uitk-text uitk-type-300 uitk-text-default-theme")
    row['Num of Reviews'] = review_elements[-1].text if len(review_elements) > 0 else np.nan

    # With two hidden price texts the first is the original price and the
    # second the current one; with a single text there is no original price.
    prices_list = card.find_all('div', class_="uitk-text uitk-type-300 uitk-text-default-theme is-visually-hidden")
    if len(prices_list) > 1:
        row['Curr Price'] = prices_list[1].text
        row['Original Price'] = prices_list[0].text
    elif len(prices_list) == 1:
        row['Curr Price'] = prices_list[0].text
        row['Original Price'] = np.nan
    else:
        # No price text at all: record missing values instead of raising IndexError.
        row['Curr Price'] = np.nan
        row['Original Price'] = np.nan

    # Placeholder columns, not scraped by this function.
    row['Percentage of discount'] = 0
    row['Distance from center'] = 0
    row['Type of room'] = 0
    row['Location grade'] = 0

    # Both policy flags are read from the same container, so look it up once
    # and tolerate its absence.
    policy_container = card.find('div', class_="uitk-layout-flex uitk-layout-flex-flex-direction-column uitk-layout-flex-gap-three")
    policy_spans = policy_container.find_all('span') if policy_container is not None else []
    row['Is refundable'] = len(policy_spans) > 0 and 'refundable' in policy_spans[0].text
    row['Late payment'] = len(policy_spans) > 0 and 'later' in policy_spans[-1].text

    included_element = card.find('div', class_="uitk-text truncate uitk-type-200 uitk-text-default-theme")
    row['Breakfast included'] = included_element.text if included_element is not None else np.nan

    member_btn = card.find('a', class_='uitk-button uitk-button-small uitk-button-has-text uitk-button-as-link uitk-layout-flex-item-align-self-flex-end uitk-layout-flex-item uitk-button-primary-alt')
    row['Option Member'] = member_btn is not None

    return row


def hotels_to_df(hotels_list, df, start_date, end_date, max_hotels=120):
    """Fill ``df`` in place with one row per hotel card and return it.

    Parameters
    ----------
    hotels_list : sequence
        Parsed result cards exposing ``find`` / ``find_all``.
    df : pandas.DataFrame
        Frame to fill; row label ``i`` receives the data of ``hotels_list[i]``.
    start_date, end_date : datetime.date
        Check-in and check-out dates of the search.
    max_hotels : int, default 120
        Upper bound on the number of cards read from ``hotels_list``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    A card is parsed completely before anything is written, so a card that
    fails to parse is reported and skipped rather than leaving a partial row.
    Lists shorter than ``max_hotels`` are handled without errors.
    """
    for i in range(min(len(hotels_list), max_hotels)):
        try:
            row = _parse_hotel_card(hotels_list[i], i, start_date, end_date)
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C still stops the run.
            print('error at hotel number: ', i, '-', repr(exc))
            continue

        for column, value in row.items():
            df.at[i, column] = value

    return df
    
    

In [58]:
def open_url(url, scroll_number=4):
    """Load ``url`` in the module-level ``driver`` and expand the result list.

    Performs ``scroll_number - 1`` rounds; each round scrolls to the bottom of
    the page, clicks the "show more results" button when it is present, and
    then waits five seconds.

    Parameters
    ----------
    url : str
        Address of the search results page.
    scroll_number : int, default 4
        Exclusive upper bound of the round counter, which starts at 1.
    """
    driver.get(url)
    for _ in range(1, scroll_number):
        driver.execute_script("window.scrollTo(1,5000000)")
        # find_elements returns an empty list when nothing matches, whereas
        # find_element raises NoSuchElementException. The list form keeps the
        # "Button not found" branch reachable, so a missing button no longer
        # aborts the whole search.
        show_more_buttons = driver.find_elements("xpath", "//button[@data-stid = 'show-more-results']")
        if show_more_buttons:
            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", show_more_buttons[0])
        else:
            print("Button not found")
        time.sleep(5)

In [59]:
def get_hotels_list():
    """Parse the page currently loaded in ``driver`` and return its result cards.

    Returns the ``div`` elements whose class attribute matches the card class
    string below, as found by BeautifulSoup's ``html.parser``.
    """
    card_class = 'uitk-card uitk-card-roundcorner-all uitk-card-has-primary-theme'
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup.find_all('div', class_=card_class)

In [60]:
def get_data_from_dates(start_date , end_date, temp_df, hotels_df):
    """Scrape one search and return ``hotels_df`` with the new rows appended.

    Builds the search URL for the given check-in / check-out dates, opens it,
    collects the result cards, writes them into ``temp_df`` through
    ``hotels_to_df`` and returns the concatenation of ``hotels_df`` and those
    rows with a fresh 0..n-1 index. ``hotels_df`` itself is not modified.
    """
    url_template = "https://euro.expedia.net/Hotel-Search?adults=2&d1={start_date}&d2={end_date}&destination=New%20York%20%28and%20vicinity%29%2C%20New%20York%2C%20United%20States%20of%20America&endDate={end_date}&latLong=40.75668%2C-73.98647&regionId=178293&rooms=1&semdtl=&sort=RECOMMENDED&startDate={start_date}&theme=&useRewards=false&userIntent="
    search_url = url_template.format(start_date=start_date, end_date=end_date)

    open_url(search_url)
    scraped_rows = hotels_to_df(get_hotels_list(), temp_df, start_date, end_date)

    combined = pd.concat([hotels_df, scraped_rows], ignore_index=True, axis=0)
    return combined
    

In [61]:
# Scrape every combination of check-in offset i (1..30 days from today) and
# stay length j (1..5 nights), appending each search to hotels_df.
for i in range(1, 31):
    for j in range(1, 6):
        # Dates are computed outside the try block so the error message below
        # always refers to the search that actually failed.
        start_date = datetime.date.today() + datetime.timedelta(days=i)
        end_date = start_date + datetime.timedelta(days=j)
        try:
            hotels_df = get_data_from_dates(start_date, end_date, temp_df, hotels_df)
        except Exception as exc:
            # Catch Exception rather than using a bare except, so that
            # interrupting the kernel still stops this long-running loop.
            print("error at dates: {} - {}".format(start_date, end_date), repr(exc))
        finally:
            # Always empty the scratch frame, even after a failure, so rows of
            # one search can never leak into the next one.
            temp_df.drop(temp_df.index, inplace=True)



In [62]:
# Work on a copy so later cells do not touch the accumulated hotels_df.
last_output_df = hotels_df.copy()

In [63]:
# (rows, columns) of the collected data.
last_output_df.shape

(18000, 17)

In [69]:
# Preview the first rows of the collected data.
last_output_df.head()

Unnamed: 0,Snapshot,Index,Hotel Name,TTT,LOS,Grade,Num of Reviews,Curr Price,Original Price,Percentage of discount,Distance from center,Type of room,Location grade,Is refundable,Late payment,Breakfast included,Option Member
0,2023-03-08 22:04:09.124030,0,Hotel 32 32,1,1,8.0,"(1,290 reviews)",The price is €218,The price was €281,0,0,0,0,False,False,,False
1,2023-03-08 22:04:09.130488,1,Hyatt House Jersey City,1,1,8.4,"(1,000 reviews)",The price is €246,The price was €273,0,0,0,0,False,False,Breakfast included,False
2,2023-03-08 22:04:09.135054,2,Sonesta Simply Suites Jersey City,1,1,8.4,(999 reviews),The price is €209,,0,0,0,0,True,False,,False
3,2023-03-08 22:04:09.140770,3,Four Points By Sheraton New York Downtown,1,1,7.4,"(1,003 reviews)",The price is €127,,0,0,0,0,False,False,,False
4,2023-03-08 22:04:09.143762,4,Hilton Garden Inn NYC Financial Center/Manhatt...,1,1,8.6,"(1,002 reviews)",The price is €150,The price was €176,0,0,0,0,True,False,,True


In [70]:
file_name = "Expedia_hotels.csv"

# The header row is written only when the CSV does not exist yet; later runs
# append rows without repeating it.
is_new_file = not os.path.exists(file_name)
print("new" if is_new_file else "exists")
last_output_df.to_csv(file_name, mode='a', index=False, header=is_new_file)

In [47]:
# Non-null value count of every column per LOS value
# (LOS is the number of days between start_date and end_date).
last_output_df.groupby('LOS').count()

Unnamed: 0_level_0,Snapshot,Index,Hotel Name,TTT,Grade,Num of Reviews,Curr Price,Original Price,Percentage of discount,Distance from center,Type of room,Location grade,Is refundable,Late payment,Breakfast included,Option Member
LOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,3600,3600,3600,3600,3560,3560,3600,928,3600,3600,3600,3600,3600,3600,586,3600
2,3600,3600,3600,3600,3566,3566,3600,1070,3600,3600,3600,3600,3600,3600,606,3600
3,3600,3600,3600,3600,3569,3569,3600,1423,3600,3600,3600,3600,3600,3600,626,3600
4,3600,3600,3600,3600,3565,3565,3600,1535,3600,3600,3600,3600,3600,3600,630,3600
5,3580,3580,3580,3580,3551,3551,3580,1584,3580,3580,3580,3580,3580,3580,619,3580


In [51]:
# Non-null value count of every column per TTT value
# (TTT is the number of days between the scrape date and start_date).
last_output_df.groupby('TTT').count()

Unnamed: 0_level_0,Snapshot,Index,Hotel Name,LOS,Grade,Num of Reviews,Curr Price,Original Price,Percentage of discount,Distance from center,Type of room,Location grade,Is refundable,Late payment,Breakfast included,Option Member
TTT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,600,600,600,600,595,595,600,240,600,600,600,600,600,600,90,600
2,600,600,600,600,596,596,600,244,600,600,600,600,600,600,91,600
3,600,600,600,600,595,595,600,260,600,600,600,600,600,600,86,600
4,600,600,600,600,595,595,600,241,600,600,600,600,600,600,86,600
5,600,600,600,600,595,595,600,243,600,600,600,600,600,600,94,600
6,600,600,600,600,594,594,600,210,600,600,600,600,600,600,96,600
7,580,580,580,580,570,570,580,209,580,580,580,580,580,580,100,580
8,600,600,600,600,590,590,600,215,600,600,600,600,600,600,117,600
9,600,600,600,600,590,590,600,219,600,600,600,600,600,600,113,600
10,600,600,600,600,590,590,600,221,600,600,600,600,600,600,106,600
