### Social Media Analytics
# Webscraping demo
## Tripadvisor hotel reviews
### This notebook uses the Excel file *Hotels.xlsx* as the list of hotels to scrape
#### Changes may be required due to Tripadvisor's continuous updates

(c) Nuno António 2020-2022 - Rev. 1.06

### Load packages and do the initializations

In [1]:
# Names and numbers of the students 
# Maria Neves 20210783 ; Francisco Dias 20210129; Matilde Baptista da Costa 20210894 ; Patricia Peixoto 20210741

In [2]:
# Load libraries
import numpy as np
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
import time
import re

In [3]:
# Allow not verified SSL (Secure Socket Layer) certificates to be opened.
# This rebinds the ssl module's private default-HTTPS-context factory to the factory that
# builds an unverified context.
# NOTE(review): presumably this only matters for urllib-style HTTPS requests made from this
# kernel, and turns certificate verification off for them - confirm it is still needed,
# since the pages in this notebook are fetched through Selenium.
ssl._create_default_https_context = ssl._create_unverified_context

In [4]:
# Firefox configuration object; openPageReadHTML passes it to every browser it starts
options = Options()

# Optional switches meant to make the browser look less like a robot (left disabled):
# options.add_argument("--disable-blink-features")
# options.add_argument("--disable-blink-features=AutomationControlled")

# Run Firefox headless, i.e. without showing a window
options.add_argument("--headless")

In [5]:
# Read the list of hotels to process: sheet "Sheet1" of Hotels.xlsx, indexed by its "ID" column
hotelsToScrap = pd.read_excel(
    "Hotels.xlsx",
    sheet_name="Sheet1",
    index_col="ID",
    engine="openpyxl",
)


In [6]:
# Empty, typed frame that receives one row per extracted review
hotelReviews = pd.DataFrame({
    column: pd.Series([], dtype=kind)
    for column, kind in [
        ('hotelID', 'string'),
        ('user', 'string'),
        ('rating', 'float'),
        ('local', 'string'),
        ('date', 'string'),
        ('contributions', 'string'),
        ('text', 'string'),
    ]
})

# Empty frame for the monthly statistics of the pre-pandemic window (02/2019 - 02/2020)
hotelStatisticsPrePandemic = pd.DataFrame({
    column: pd.Series([], dtype='float')
    for column in [
        'HotelID',
        '02/2019', '03/2019', '04/2019', '05/2019', '06/2019', '07/2019', '08/2019',
        '09/2019', '10/2019', '11/2019', '12/2019', '01/2020', '02/2020',
    ]
})

# Empty frame for the monthly statistics of the post-pandemic window (05/2021 - 05/2022)
hotelStatisticsPosPandemic = pd.DataFrame({
    column: pd.Series([], dtype='float')
    for column in [
        'HotelID',
        '05/2021', '06/2021', '07/2021', '08/2021', '09/2021', '10/2021', '11/2021',
        '12/2021', '01/2022', '02/2022', '03/2022', '04/2022', '05/2022',
    ]
})

In [7]:
# Monthly accumulators (14 slots each). processDateInformation writes the hotel ID into
# slot 0 and uses slots 1-13 for the months of the corresponding window.

# Pre-pandemic window: number of ratings per month
arrayPrePandemicCount = [0] * 14
# Pre-pandemic window: sum of ratings per month
arrayPrePandemicRating = [0] + [0.0] * 13
# Pre-pandemic window: mean rating per month
arrayPrePandemicFinalRating = [0] + [0.0] * 13

# Post-pandemic window: number of ratings per month
arrayPosPandemicCount = [0] * 14
# Post-pandemic window: sum of ratings per month
arrayPosPandemicRating = [0] + [0.0] * 13
# Post-pandemic window: mean rating per month
arrayPosPandemicFinalRating = [0] + [0.0] * 13

### Functions to use in the Main Loop

In [8]:
# Open page and read HTML
def openPageReadHTML(url, maxRetries=30):
    """Open `url` in Firefox through Selenium and return the rendered page as BeautifulSoup.

    The browser is started with the module-level `options`. After loading the page the
    function accepts the privacy pop-up (when present) and clicks the first element with
    class "eljVo" (the "read more" button of the first review) before reading the HTML.

    Parameters
    ----------
    url : str
        Address of the page to open.
    maxRetries : int, optional
        Maximum number of attempts (roughly one per second) to find and click the
        "read more" button. When the button never shows up the page is returned as it
        is instead of waiting forever.

    Returns
    -------
    BeautifulSoup
        Parsed HTML of the page as rendered by the browser.
    """
    # Open Firefox with Selenium
    # In case of error, build a FirefoxBinary with the Firefox path of your computer and
    # pass it as firefox_binary=... to webdriver.Firefox
    browser = webdriver.Firefox(options=options)

    try:
        browser.get(url)

        time.sleep(1) # Wait one second

        # If there is a privacy pop-up, click the OK button
        privacy_button = browser.find_elements(By.ID, "onetrust-accept-btn-handler")
        if len(privacy_button) > 0:
            browser.execute_script("arguments[0].click()", privacy_button[0])
            time.sleep(0.5) # Wait half a second

        # Try to move into first review and click the button
        # Some times it takes some time to load the page, so retry a bounded number of times
        for _ in range(maxRetries):
            read_more_buttons = browser.find_elements(By.CLASS_NAME, "eljVo")
            if len(read_more_buttons) > 0:
                try:
                    browser.execute_script("arguments[0].scrollIntoView(true);", read_more_buttons[0])
                    browser.execute_script("arguments[0].click()", read_more_buttons[0])
                    time.sleep(0.5) # Wait half a second
                    break
                except Exception:
                    # The element may not be ready yet; fall through to the wait below.
                    # (Exception, not a bare except, so Ctrl+C still interrupts the loop.)
                    pass
            # Wait for one second to retry
            time.sleep(1)
        else:
            print("openPageReadHTML | 'read more' button not found after", maxRetries, "attempts | ", url)

        # Read the content
        html_source = browser.page_source
    finally:
        # Always close the browser, even when loading or clicking failed
        browser.quit()

    # Transform the html into a BeautifulSoup object (explicit parser: same result on
    # every machine and no "no parser was explicitly specified" warning)
    soupObj = BeautifulSoup(html_source, "html.parser")

    return soupObj

In [9]:
# Process each page
def processPage(soupObj, hotelID, extractedDF):
    """Extract the reviews of one parsed page and append them to `extractedDF`.

    Every review card (div whose class contains "cWwQK" inside the "reviews-tab" div) whose
    stay date mentions 2019, 2020, 2021 or 2022 is passed to processDateInformation, which
    updates the monthly statistics; reviews it accepts are added to the result.

    Parameters
    ----------
    soupObj : BeautifulSoup
        Parsed page, as returned by openPageReadHTML.
    hotelID :
        Identifier stored with every extracted review.
    extractedDF : pandas.DataFrame
        Reviews extracted so far.

    Returns
    -------
    pandas.DataFrame
        `extractedDF` itself when the page yields no accepted review, otherwise a new frame
        with the page's reviews appended (index renumbered from 0).
    """
    # Read reviews
    reviews = soupObj.find_all("div", {"data-test-target": "reviews-tab"})
    if len(reviews) == 0:
        # Page without a reviews tab (layout change, blocked request, ...): nothing to add
        return extractedDF

    # Get the list of reviews
    reviewsList = reviews[0].select("div[class*=cWwQK]")

    acceptedYears = ("2019", "2020", "2021", "2022")
    pageRows = []

    # Loop thru each review
    for review in reviewsList:
        print("----- start processPage -----")

        # date: strip the markup around the text of the stay-date span
        dateReview = str(review.select("span[class*=euPKI]"))
        dateReview = dateReview.replace('<span class="CrxzX">Data da estadia:</span>','')
        dateReview = dateReview.replace('<span class="euPKI _R Me S4 H3"> ','')
        dateReview = dateReview.replace('</span>','')
        dateReview = dateReview.replace('[','')
        dateReview = dateReview.replace(']','')

        if any(year in dateReview for year in acceptedYears):
            print("Accepted hotelID | ", hotelID, dateReview)

            # Get Rating: the digits of the second class of the bubble span, divided by 10
            r1 = review.select("span[class*=ui_bubble_rating]")
            r2 = r1[0]["class"][1]
            reviewRating = int(''.join(filter(str.isdigit, r2)))/10

            isDataAccepted = processDateInformation(hotelID, dateReview, reviewRating)
            print("isDataAccepted | ", isDataAccepted, dateReview)

            if isDataAccepted == 1:

                # Get User
                user = review.select("a[class*=ui_header_link]")[0].string

                # Get review text
                reviewText = review.select("q[class*=XllAv]")[0].get_text()

                # local
                location = review.select("span[class*=fSiLz]")
                if len(location) > 0:
                    location = str(location[0]).partition("</span>")[2].replace("</span></span>","")
                else:
                    location = "no location"

                # Contributions: searched inside this review card (not in the whole page),
                # so each row gets the value of its own reviewer
                contributionSpans = review.find_all("span", {"class": "ckXjS"})
                contributions = contributionSpans[0].string if contributionSpans else None

                pageRows.append([hotelID, user, reviewRating, location, dateReview, contributions, reviewText])

        print("----- end processPage -----")

    if not pageRows:
        return extractedDF

    # Build the page's rows once and append them in a single concat
    pageDF = pd.DataFrame(pageRows,
                          columns=['hotelID', 'user', 'rating', 'local', 'date', 'contributions', 'text'])

    # Return the resulting dataframe
    return pd.concat([extractedDF, pageDF], ignore_index=True)

In [10]:
def processDateInformation(hotelID, dateReview, rating):
    """Add one review's rating to the module-level monthly accumulators.

    The six module-level lists (arrayPrePandemic* / arrayPosPandemic*) are modified in place:
    slot 0 of each receives `hotelID`, the count and sum slot selected by the year and the
    Portuguese month name found in `dateReview` are increased, and every monthly mean is
    recomputed from the current sums and counts.

    Parameters
    ----------
    hotelID :
        Identifier written to slot 0 of every accumulator.
    dateReview : str
        Text searched (substring match) for a year and a Portuguese month name.
    rating :
        Numeric rating added to the sum of the matching month.

    Returns
    -------
    int
        1 when `dateReview` contains "2019", "2020", "2021" or "2022", otherwise 0.

    NOTE(review): a date whose year matches but whose month has no slot (for instance
    "janeiro de 2019" or "dezembro de 2020") is still reported as accepted although nothing
    is accumulated for it - confirm that this is intended.
    """
    # (year, ((month name, slot), ...)) in lookup order: the first year found in the text
    # is used and, within it, the first month name found in the text
    prePandemicSlots = (
        ("2019", (("fevereiro", 1), ("março", 2), ("abril", 3), ("maio", 4), ("junho", 5),
                  ("julho", 6), ("agosto", 7), ("setembro", 8), ("outubro", 9),
                  ("novembro", 10), ("dezembro", 11))),
        ("2020", (("janeiro", 12), ("fevereiro", 13))),
    )
    posPandemicSlots = (
        ("2021", (("maio", 1), ("junho", 2), ("julho", 3), ("agosto", 4), ("setembro", 5),
                  ("outubro", 6), ("novembro", 7), ("dezembro", 8))),
        ("2022", (("janeiro", 9), ("fevereiro", 10), ("março", 11), ("abril", 12),
                  ("maio", 13))),
    )

    # The accumulators are looked up on every call because the main loop rebinds them
    windows = (
        (prePandemicSlots, arrayPrePandemicCount, arrayPrePandemicRating, arrayPrePandemicFinalRating),
        (posPandemicSlots, arrayPosPandemicCount, arrayPosPandemicRating, arrayPosPandemicFinalRating),
    )

    # Slot 0 always carries the hotel identifier
    for _, counts, sums, means in windows:
        counts[0] = hotelID
        sums[0] = hotelID
        means[0] = hotelID

    yearFound = False
    for yearTable, counts, sums, _ in windows:
        for year, monthSlots in yearTable:
            if year not in dateReview:
                continue
            yearFound = True
            for monthName, slot in monthSlots:
                if monthName in dateReview:
                    counts[slot] = counts[slot] + 1
                    sums[slot] = sums[slot] + rating
                    break
            # Only the first matching year of each window is considered
            break

    # Refresh the monthly means (slot 0 is skipped so the hotel ID is kept)
    for slot in range(1, 14):
        for _, counts, sums, means in windows:
            if sums[slot] > 0:
                means[slot] = float(sums[slot]) / float(counts[slot])

    return 1 if yearFound else 0

In [11]:
def saveInformationRatingPrePandemic():
    """Append the current pre-pandemic accumulators to the pre-pandemic statistics frame.

    Three rows are added after the rows of the module-level `hotelStatisticsPrePandemic`:
    the monthly counts, the monthly mean ratings and a row of empty strings.

    Returns
    -------
    pandas.DataFrame
        New frame; `hotelStatisticsPrePandemic` itself is not modified.
    """
    monthColumns = ['02/2019', '03/2019', '04/2019', '05/2019', '06/2019', '07/2019', '08/2019',
                    '09/2019', '10/2019', '11/2019', '12/2019', '01/2020', '02/2020']

    # Row 0: counts, row 1: mean ratings (copies, so the accumulators are left untouched)
    hotelRows = [list(arrayPrePandemicCount), list(arrayPrePandemicFinalRating)]
    hotelBlock = pd.DataFrame(hotelRows, columns=['HotelID'] + monthColumns)

    # Row of empty strings after the two data rows
    hotelBlock.loc[len(hotelBlock)] = [''] * 14

    return pd.concat([hotelStatisticsPrePandemic, hotelBlock])

In [12]:
def saveInformationRatingPosPandemic():
    """Append the current post-pandemic accumulators to the post-pandemic statistics frame.

    Three rows are added after the rows of the module-level `hotelStatisticsPosPandemic`:
    the monthly counts, the monthly mean ratings and a row of empty strings.

    Returns
    -------
    pandas.DataFrame
        New frame; `hotelStatisticsPosPandemic` itself is not modified.
    """
    monthColumns = ['05/2021', '06/2021', '07/2021', '08/2021', '09/2021', '10/2021', '11/2021',
                    '12/2021', '01/2022', '02/2022', '03/2022', '04/2022', '05/2022']

    # Row 0: counts, row 1: mean ratings (copies, so the accumulators are left untouched)
    hotelRows = [list(arrayPosPandemicCount), list(arrayPosPandemicFinalRating)]
    hotelBlock = pd.DataFrame(hotelRows, columns=['HotelID'] + monthColumns)

    # Row of empty strings after the two data rows
    hotelBlock.loc[len(hotelBlock)] = [''] * 14

    return pd.concat([hotelStatisticsPosPandemic, hotelBlock])

### Main loop

In [13]:
#Check total of reviews in the  URL
def getTotalAmountOfReviews(url):
    """Return the number of reviews listed for the "Português" language filter of `url`.

    The page is opened with openPageReadHTML and the language options (li elements whose
    class contains "ui_radio") are scanned for the one labelled "Português"; the count is
    read from its span whose class contains "cvxmR".

    Parameters
    ----------
    url : str
        Address of the hotel's review page.

    Returns
    -------
    int
        The count shown for "Português", or 0 when that option or its count is missing.
    """
    soupObj = openPageReadHTML(url)

    totalNumberReviews = 0
    for languageOption in soupObj.select("li[class*=ui_radio]"):
        languageTitle = "".join(span.get_text() for span in languageOption.select("span[class*=fwSIg]"))
        if "Português" not in languageTitle:
            continue

        countSpans = languageOption.select("span[class*=cvxmR]")
        if len(countSpans) == 0:
            continue

        # Keep only the digits: copes with the surrounding parentheses, thousands
        # separators and extra classes on the span instead of failing in int()
        digits = re.sub(r"\D", "", countSpans[0].get_text())
        if digits:
            totalNumberReviews = int(digits)

    return totalNumberReviews

In [None]:
# Loop for all hotels: for each hotel, reset the monthly accumulators, read the number of
# Portuguese reviews, walk the review pages (5 reviews per page) and, once the last page is
# processed, append the hotel's monthly statistics and rewrite the two statistics workbooks.
print("Total of hotels on the list: ", len(hotelsToScrap))

for index, row in hotelsToScrap.iterrows():

    # Present feedback on which hotel is being processed
    print("-----")
    print("Processing hotel index | ", index)
    print("Processing hotel row | ", row)

    # Reset the accumulators per hotel. Slot 0 carries the hotel ID from the start, so the
    # statistics rows are labelled even when no review of this hotel is accepted.
    arrayPrePandemicCount = [index] + [0] * 13
    arrayPrePandemicRating = [index] + [0.0] * 13
    arrayPrePandemicFinalRating = [index] + [0.0] * 13

    arrayPosPandemicCount = [index] + [0] * 13
    arrayPosPandemicRating = [index] + [0.0] * 13
    arrayPosPandemicFinalRating = [index] + [0.0] * 13

    # Hotel Reviews/Pages Counters - Reset counter per hotel
    baseUrl = row['URL']
    reviewsToGet = getTotalAmountOfReviews(baseUrl)
    #reviewsToGet = 10
    maxReviewsPerPage = 5
    reviewsExtracted = 0

    print("reviewsToGet | ", reviewsToGet)

    # Loop until it extracts the pre-defined number of reviews
    while reviewsExtracted < reviewsToGet:

        if reviewsExtracted == 0:
            # original URL
            urlToUse = baseUrl
        else:
            # new page URL: always derived from the hotel's original URL, so the offsets of
            # previous pages do not pile up ("-Reviews-or10-or5-...")
            urlToUse = baseUrl.replace("-Reviews-", "-Reviews-or" + str(reviewsExtracted) + "-", 1)

        # Open and read the web page content
        soup = openPageReadHTML(urlToUse)

        # Process web page
        hotelReviews = processPage(soup, index, hotelReviews)

        # Update counter && Present feedback on the number of extracted reviews
        reviewsExtracted = reviewsExtracted + maxReviewsPerPage

        if reviewsExtracted >= reviewsToGet:
            # Last page of this hotel: append its statistics and rewrite both workbooks.
            # Each workbook is overwritten with the rows accumulated so far, on one sheet
            # named after the current hotel.
            sheetName = "hotel" + str(index)
            hotelStatisticsPrePandemic = saveInformationRatingPrePandemic()
            hotelStatisticsPosPandemic = saveInformationRatingPosPandemic()
            hotelStatisticsPrePandemic.to_excel("StatisticPrePandemic.xlsx", sheet_name=sheetName)
            hotelStatisticsPosPandemic.to_excel("StatisticPosPandemic.xlsx", sheet_name=sheetName)

        print("Extracted ", reviewsExtracted, "/", reviewsToGet)

    print("-----")
     

Total of hotels on the list:  URL    10
dtype: int64
-----
Processing hotel index |  1
Processing hotel row |  URL    https://www.tripadvisor.pt/Hotel_Review-g18916...
Name: 1, dtype: object
reviewsToGet |  21
----- start processPage -----
Accepted hotelID |  1 junho de 2022
isDataAccepted |  1 junho de 2022
----- end processPage -----
----- start processPage -----
Accepted hotelID |  1 junho de 2021
isDataAccepted |  1 junho de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  1 junho de 2021
isDataAccepted |  1 junho de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  1 junho de 2021
isDataAccepted |  1 junho de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  1 junho de 2021
isDataAccepted |  1 junho de 2021
----- end processPage -----
Extracted  5 / 21
----- start processPage -----
Accepted hotelID |  1 abril de 2021
isDataAccepted |  1 abril de 2021
----- end processPage -----
----

----- start processPage -----
Accepted hotelID |  2 dezembro de 2020
isDataAccepted |  1 dezembro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  2 dezembro de 2020
isDataAccepted |  1 dezembro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  2 novembro de 2020
isDataAccepted |  1 novembro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  2 novembro de 2020
isDataAccepted |  1 novembro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  2 novembro de 2020
isDataAccepted |  1 novembro de 2020
----- end processPage -----
Extracted  45 / 202
----- start processPage -----
Accepted hotelID |  2 outubro de 2020
isDataAccepted |  1 outubro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  2 outubro de 2020
isDataAccepted |  1 outubro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID

----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  125 / 202
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  130 / 202
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  135 / 202
----- start processPage -----
----- end processPage -----
----- sta

----- start processPage -----
Accepted hotelID |  4 maio de 2021
isDataAccepted |  1 maio de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 abril de 2021
isDataAccepted |  1 abril de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 abril de 2021
isDataAccepted |  1 abril de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 abril de 2021
isDataAccepted |  1 abril de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 março de 2021
isDataAccepted |  1 março de 2021
----- end processPage -----
Extracted  20 / 145
----- start processPage -----
Accepted hotelID |  4 março de 2021
isDataAccepted |  1 março de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 março de 2021
isDataAccepted |  1 março de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 março de 2021
isDataAccepted |  1 

----- start processPage -----
Accepted hotelID |  4 julho de 2020
isDataAccepted |  1 julho de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 junho de 2020
isDataAccepted |  1 junho de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 fevereiro de 2020
isDataAccepted |  1 fevereiro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 fevereiro de 2020
isDataAccepted |  1 fevereiro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 setembro de 2019
isDataAccepted |  1 setembro de 2019
----- end processPage -----
Extracted  85 / 145
----- start processPage -----
Accepted hotelID |  4 agosto de 2019
isDataAccepted |  1 agosto de 2019
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 março de 2019
isDataAccepted |  1 março de 2019
----- end processPage -----
----- start processPage -----
Accepted hotelID |  4 março de

----- start processPage -----
Accepted hotelID |  5 maio de 2021
isDataAccepted |  1 maio de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  5 abril de 2021
isDataAccepted |  1 abril de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  5 maio de 2021
isDataAccepted |  1 maio de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  5 maio de 2021
isDataAccepted |  1 maio de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  5 abril de 2021
isDataAccepted |  1 abril de 2021
----- end processPage -----
Extracted  30 / 145
----- start processPage -----
Accepted hotelID |  5 abril de 2021
isDataAccepted |  1 abril de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  5 abril de 2021
isDataAccepted |  1 abril de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  5 março de 2021
isDataAccepted |  1 març

----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  95 / 145
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  100 / 145
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  105 / 145
----- start processPage -----
----- end processPage -----
----- star

----- start processPage -----
Accepted hotelID |  7 março de 2020
isDataAccepted |  1 março de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  7 fevereiro de 2020
isDataAccepted |  1 fevereiro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  7 março de 2020
isDataAccepted |  1 março de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  7 março de 2020
isDataAccepted |  1 março de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  7 fevereiro de 2020
isDataAccepted |  1 fevereiro de 2020
----- end processPage -----
Extracted  30 / 256
----- start processPage -----
Accepted hotelID |  7 janeiro de 2020
isDataAccepted |  1 janeiro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  7 janeiro de 2020
isDataAccepted |  1 janeiro de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  7 janeiro 

----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  135 / 256
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  140 / 256
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  145 / 256
----- start processPage -----
----- end processPage -----
----- sta

reviewsToGet |  35
----- start processPage -----
Accepted hotelID |  8 junho de 2022
isDataAccepted |  1 junho de 2022
----- end processPage -----
----- start processPage -----
Accepted hotelID |  8 agosto de 2021
isDataAccepted |  1 agosto de 2021
----- end processPage -----
----- start processPage -----
Accepted hotelID |  8 agosto de 2020
isDataAccepted |  1 agosto de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  8 julho de 2020
isDataAccepted |  1 julho de 2020
----- end processPage -----
----- start processPage -----
Accepted hotelID |  8 novembro de 2019
isDataAccepted |  1 novembro de 2019
----- end processPage -----
Extracted  5 / 35
----- start processPage -----
Accepted hotelID |  8 julho de 2019
isDataAccepted |  1 julho de 2019
----- end processPage -----
----- start processPage -----
Accepted hotelID |  8 março de 2019
isDataAccepted |  1 março de 2019
----- end processPage -----
----- start processPage -----
----- end processPage -----

----- start processPage -----
----- end processPage -----
----- start processPage -----
Accepted hotelID |  10 janeiro de 2019
isDataAccepted |  1 janeiro de 2019
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  30 / 72
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  35 / 72
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
----- start processPage -----
----- end processPage -----
Extracted  40 / 7

In [None]:


# Save the extracted reviews data frame to an Excel file
# (relative path, so the file is written to the current working directory;
# to_excel is called with its defaults, so the frame's index is written as the first column)
hotelReviews.to_excel("ExtractedReviews.xlsx")