In [None]:
# Columns of the scraped output (hotelReviews.csv): hotelId,reviewId,stars,date,review

import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# --- Browser setup -----------------------------------------------------------
# One headless Chrome instance is created here and shared (as the global
# `driver`) by every scraping function below.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# NOTE(review): passing the driver path positionally presumably targets an older
# Selenium release; newer releases may require a Service object - confirm
# against the installed Selenium version.
driver = webdriver.Chrome('chromedriver', options=chrome_options)
# Seconds: used both as the WebDriverWait timeout and as the overall deadline
# of the timedRetry* helpers.
defaultTimeout = 30
# Seconds slept between two attempts inside the timedRetry* helpers.
defaultRetryWaitTime = 0.5

# --- Input -------------------------------------------------------------------
# hotelUrls.csv must provide at least the columns 'hotelId' and 'url' (both are
# read in getReviewsWithPageNumber).
hotelData = pd.read_csv('hotelUrls.csv')
# hotelFlag: 0 = hotel still has pages to visit, 1 = hotel is finished.
hotelFlag = [0] * len(hotelData)
hotelData['hotelFlag'] = hotelFlag

# --- Output ------------------------------------------------------------------
# Accumulates one row per scraped review; written to hotelReviews.csv after
# each hotel page is processed.
hotelReviews = pd.DataFrame(columns = ['hotelId', 'reviewId', 'stars', 'date', 'review'])

def timedRetryGetAttribute(element, attribute):
    """Read ``attribute`` from ``element``, retrying while the read raises.

    The first attempt happens immediately; every later attempt is preceded by
    a pause of ``defaultRetryWaitTime`` seconds.  Attempts stop once
    ``defaultTimeout`` seconds have elapsed since the call started.

    Args:
        element: Object exposing ``get_attribute(name)``.
        attribute: Name of the attribute to read.

    Returns:
        The value returned by ``get_attribute``.  An empty string is returned
        when that value is ``None`` or when every attempt within the timeout
        raised.
    """
    finishTime = time.time() + defaultTimeout
    firstTime = True

    while time.time() < finishTime:
        # Pause between attempts, but never before the first one.
        if not firstTime:
            time.sleep(defaultRetryWaitTime)
        firstTime = False

        try:
            attr = element.get_attribute(attribute)
        except Exception:
            # Deliberately best-effort: any failure of the read is treated as
            # transient and retried until the deadline passes.
            continue

        # Callers apply substring tests and slicing to the result, so a
        # missing attribute (None) is reported as "" - the same value that is
        # returned on timeout - instead of leaking None to them.
        return "" if attr is None else attr
    return ""

def timedRetryGetAttributeContains(element, attribute, contains):
    """Poll ``attribute`` of ``element`` until its value contains ``contains``.

    The first read happens immediately; every later read is preceded by a
    pause of ``defaultRetryWaitTime`` seconds.  Polling stops once
    ``defaultTimeout`` seconds have elapsed since the call started.

    Args:
        element: Object exposing ``get_attribute(name)``.
        attribute: Name of the attribute to poll.
        contains: Substring the attribute value must include.

    Returns:
        The first attribute value that includes ``contains``, or an empty
        string if no such value was seen before the timeout.
    """
    finishTime = time.time() + defaultTimeout
    firstTime = True

    while time.time() < finishTime:
        # Pause between attempts, but never before the first one.
        if not firstTime:
            time.sleep(defaultRetryWaitTime)
        firstTime = False

        try:
            attr = element.get_attribute(attribute)
        except Exception:
            # Deliberately best-effort: a failed read is retried until the
            # deadline passes.
            continue

        # A missing attribute (None) simply counts as "not matching yet".
        if attr is not None and contains in attr:
            return attr
    return ""

def timedRetryClickOnCssClass(element, cssClass):
    """Locate a descendant of ``element`` by CSS selector and click it, retrying on failure.

    Each attempt waits (up to ``defaultTimeout`` seconds) for the selector
    ``cssClass`` to be present below ``element`` and then clicks the match.
    Any exception makes the attempt count as failed; later attempts are
    preceded by a pause of ``defaultRetryWaitTime`` seconds.  New attempts are
    started only while fewer than ``defaultTimeout`` seconds have elapsed.

    Args:
        element: Search root handed to ``WebDriverWait``.
        cssClass: CSS selector of the element to click.

    Returns:
        ``None`` after a successful click, ``""`` if no attempt succeeded.
    """
    deadline = time.time() + defaultTimeout
    attemptsMade = 0

    while time.time() < deadline:
        try:
            # Only back off once a previous attempt has actually failed.
            if attemptsMade > 0:
                time.sleep(defaultRetryWaitTime)
            attemptsMade += 1

            locator = (By.CSS_SELECTOR, cssClass)
            target = WebDriverWait(element, defaultTimeout).until(
                EC.presence_of_element_located(locator)
            )
            target.click()
        except Exception:
            continue
        else:
            return None
    return ""

def getReviews(dataFrameIndex, hotelId, pageUrl, pageNumber):
    """Scrape one page of reviews for a hotel into the global ``hotelReviews``.

    Loads review page ``pageNumber`` of the hotel with the shared ``driver``,
    extracts every review block found on it and appends one row per review
    (hotelId, reviewId, stars, date, review) to ``hotelReviews``.  Rows are
    appended only after the whole page has been parsed, so a failed attempt
    does not leave partial rows behind.  A page is attempted up to 5 times.

    ``hotelData.loc[dataFrameIndex, 'hotelFlag']`` is set to 1 when no review
    blocks appear before the timeout, when no pagination bar is found, or
    when the pagination "next" control is disabled.

    Args:
        dataFrameIndex: Row label of this hotel in ``hotelData``.
        hotelId: Identifier stored with every scraped review.
        pageUrl: URL of the hotel's first review page; it must contain
            ``"Reviews-"`` when ``pageNumber`` is not 1.
        pageNumber: 1-based number of the review page to scrape.

    Returns:
        None.

    Raises:
        ValueError: If ``pageNumber`` is not 1 and ``pageUrl`` does not contain
            ``"Reviews-"`` (raised by ``str.index`` before the retry loop).
    """
    #pageUrl = https://www.tripadvisor.com.au/Hotel_Review-g255073-d255393-Reviews-Hotel_Grand_Chancellor_Townsville-Townsville_Queensland.html#REVIEWS

    # Step used for both the URL offset and the generated review ids
    # (presumably the number of reviews the site shows per page - confirm).
    reviewsPerPage = 5

    fixedUrl = pageUrl

    if pageNumber != 1:
        # Later pages are addressed by inserting "or<offset>-" right after "Reviews-".
        orNum = (pageNumber - 1) * reviewsPerPage
        reviewsKeyword = "Reviews-"
        reviewKeywordIndex = pageUrl.index(reviewsKeyword) + len(reviewsKeyword) #68
        urlFirstPart = pageUrl[:reviewKeywordIndex] #https://www.tripadvisor.com.au/Hotel_Review-g255073-d255393-Reviews-
        orStr = "or" + str(orNum) + "-" #or5-
        urlSecondPart = pageUrl[reviewKeywordIndex:] #Hotel_Grand_Chancellor_Townsville-Townsville_Queensland.html
        fixedUrl = urlFirstPart + orStr + urlSecondPart #https://www.tripadvisor.com.au/Hotel_Review-g255073-d255393-Reviews-or5-Hotel_Grand_Chancellor_Townsville-Townsville_Queensland.html

    getPageRetryMax = 5
    currentTry = 1

    while True:
        try:
            if currentTry > getPageRetryMax:
                # NOTE(review): hotelFlag is left at 0 here, so the hotel is
                # visited again on the next page number - confirm this is the
                # intended meaning of "skipping".
                print(f"Failed to retrieve reviews from hotelId {hotelId}, skipping")
                break

            print(f"Scraping reviews for hotelId {hotelId} on their page number {pageNumber}")

            # Collected locally and committed to hotelReviews only on success.
            hotelReviewsNew = []

            driver.get(fixedUrl)

            try:
                reviewElements = WebDriverWait(driver, defaultTimeout).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[class='cWwQK MC R2 Gi z Z BB dXjiy']"))
                )
            except TimeoutException:
                print(f"Could not see any reviews on hotelId {hotelId} on their {pageNumber} page")
                hotelData.loc[dataFrameIndex, 'hotelFlag'] = 1
                break

            #This will only trigger if a hotel has ZERO reviews in total
            # NOTE(review): the wait above presumably never returns an empty
            # list, which would make this branch unreachable - confirm.
            if len(reviewElements) == 0:
                print(f"Hotel {hotelId} never had any reviews at all")
                hotelData.loc[dataFrameIndex, 'hotelFlag'] = 1
                return

            for idx, reviewElement in enumerate(reviewElements):
                #Get review id (position of the review across all pages, not a site id)
                reviewId = (pageNumber - 1) * reviewsPerPage + idx

                #Get review stars
                reviewStarParentElement = WebDriverWait(reviewElement, defaultTimeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='emWez F1']"))
                )

                reviewStarChildElement = WebDriverWait(reviewStarParentElement, defaultTimeout).until(
                    EC.presence_of_element_located((By.XPATH, './/*'))
                )

                reviewStarClass = timedRetryGetAttribute(reviewStarChildElement, "class") #ui_bubble_rating bubble_30
                # The rating is kept as the last two characters of the class name.
                reviewStars = reviewStarClass[-2:] #30

                #Get review date
                reviewDateElement = WebDriverWait(reviewElement, defaultTimeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "span[class='euPKI _R Me S4 H3']"))
                )

                reviewDateTextRaw = reviewDateElement.text #'Date of stay: November 2021'

                # Skip the colon and the single character that follows it.
                reviewDateTextRawColonIndex = reviewDateTextRaw.index(":") + len(":") + 1

                reviewDate = reviewDateTextRaw[reviewDateTextRawColonIndex:]

                #readMore
                reviewTextBlockElement = WebDriverWait(reviewElement, defaultTimeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='dovOW']"))
                )

                reviewTextBlockElementChildren = WebDriverWait(reviewTextBlockElement, defaultTimeout).until(
                    EC.presence_of_all_elements_located((By.XPATH, './*'))
                )

                for child in reviewTextBlockElementChildren:
                    # A child without a class attribute may yield None; treat
                    # it as "no class" so the substring tests cannot raise.
                    childsClass = timedRetryGetAttribute(child, 'class') or ""
                    if "duhwe _T bOlcm" not in childsClass:
                        continue
                    # Presumably "dMbup" marks a collapsed review whose span
                    # must be clicked to reveal the full text - confirm.
                    if "dMbup" in childsClass:
                        timedRetryClickOnCssClass(child, "span[class='eljVo _S Z']")
                        break

                #wait until the class 'duhwe _T bOlcm' is directly below dovOW, but dovOW refreshes as well
                expandedReviewTextBlockParentElement = WebDriverWait(reviewElement, defaultTimeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='dovOW']"))
                )

                try:
                    WebDriverWait(expandedReviewTextBlockParentElement, defaultTimeout).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='duhwe _T bOlcm']"))
                    )
                except Exception:
                    # Best-effort wait only; the text lookup below decides
                    # whether the review can actually be read.
                    pass

                reviewTextElement = WebDriverWait(expandedReviewTextBlockParentElement, defaultTimeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "q[class='XllAv H4 _a']"))
                )

                # Keep every review on a single line.
                reviewText = reviewTextElement.text.replace('\n', " ")

                hotelReviewsNew.append([hotelId, reviewId, reviewStars, reviewDate, reviewText])

            try:
                # The selector is now properly terminated; it previously
                # lacked the closing "']".
                uiPaginationElement = WebDriverWait(driver, defaultTimeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='ui_pagination is-centered']"))
                )
            except TimeoutException:
                # No pagination bar: this page is the hotel's only one.
                hotelData.loc[dataFrameIndex, 'hotelFlag'] = 1
                print(f"Found {len(hotelReviewsNew)} new reviews for hotelId {hotelId}")
                print(f"Hotel {hotelId} only has one page of reviews")
                for hotelReviewNew in hotelReviewsNew:
                    hotelReviews.loc[len(hotelReviews)] = hotelReviewNew
                break

            uiPaginationElementChildren = WebDriverWait(uiPaginationElement, defaultTimeout).until(
                EC.presence_of_all_elements_located((By.XPATH, './*'))
            )

            for childElement in uiPaginationElementChildren:
                # None-safe for the same reason as the review children above.
                childElementClass = timedRetryGetAttribute(childElement, "class") or ""
                if "next" not in childElementClass:
                    continue
                if "disabled" in childElementClass:
                    print(f"Hotel {hotelId} does not have another page of reviews")
                    hotelData.loc[dataFrameIndex, 'hotelFlag'] = 1

            print(f"Found {len(hotelReviewsNew)} new reviews for hotelId {hotelId}")
            for hotelReviewNew in hotelReviewsNew:
                hotelReviews.loc[len(hotelReviews)] = hotelReviewNew
            break
        except Exception as error:
            print(f"Failed to retrieve reviews from hotelId {hotelId} on pageNumber {pageNumber}")
            # Surface the cause instead of discarding it, so repeated failures
            # can be diagnosed from the output.
            print(f"Reason: {type(error).__name__}: {error}")
            currentTry = currentTry + 1


def getReviewsWithPageNumber(pageNumber):
    """Scrape review page ``pageNumber`` for every hotel whose flag is still 0.

    After each hotel that is scraped, both the collected reviews and the
    hotel table (including flags) are written to disk, so progress is
    persisted hotel by hotel.

    Args:
        pageNumber: Review page number passed on to ``getReviews``.
    """
    for rowLabel, hotelRow in hotelData.iterrows():
        currentHotelId, currentUrl, currentFlag = (
            hotelRow['hotelId'],
            hotelRow['url'],
            hotelRow['hotelFlag'],
        )

        # Hotels already marked as finished are left alone.
        if not (currentFlag == 0):
            continue

        getReviews(rowLabel, currentHotelId, currentUrl, pageNumber)

        # Checkpoint both tables after every scraped hotel.
        hotelReviews.to_csv("hotelReviews.csv", index = False, header = True)
        hotelData.to_csv("hotelReviews-flags.csv", index = False, header = True)

def anyHotelsToCheck():
    """Return True while at least one hotel still has ``hotelFlag`` equal to 0."""
    pendingFlags = hotelData['hotelFlag'].values
    return bool((pendingFlags == 0).any())

# Visit the hotels page by page: first page 1 of every hotel, then page 2 of
# every hotel that is not yet flagged as finished, and so on.
pageNumber = 1

# The browser is shut down in `finally` so that an interruption or an
# unexpected error does not leave the headless Chrome process running.
try:
    while anyHotelsToCheck():
        print(f"Scraping each hotel page for reviews, looking at their page number {pageNumber}")
        getReviewsWithPageNumber(pageNumber)
        print(f"Finished scraping each hotel's page {pageNumber} for reviews, currently we have {len(hotelReviews)} reviews")
        pageNumber = pageNumber + 1

    print("No more hotel reviews to scan")
finally:
    driver.quit()