# Scrape Reviews from Agoda

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from time import sleep, time

### Create empty lists to store scraped data

In [None]:
# Module-level accumulators that scraper_agoda() appends to; each one is an
# independent, initially empty list.
review_body, review_title, trip_type = [], [], []
review_date, stay_date, nationality = [], [], []

### Create url variable which is the first page of interest

In [None]:
# First page to load; adjacent string literals are joined into a single URL.
url = (
    'https://www.agoda.com/en-sg/mandarin-orchard-singapore/hotel/singapore-sg.html'
    '?cid=-218'
)

### Create the function that does the scraping. It takes the given url and the maximum number of pages to scrape as arguments, and appends the scraped data to the empty lists created earlier

In [None]:
def _text_or_none(tag):
    """Return ``tag.get_text()``, or ``None`` when *tag* is ``None``."""
    return tag.get_text() if tag is not None else None


def scraper_agoda(url, max_pages, wait_seconds=5):
    """Scrape up to *max_pages* pages of reviews, starting at *url*.

    Opens a Chrome browser through Selenium, parses each page of reviews with
    BeautifulSoup and appends one entry per review to each of the module-level
    lists ``stay_date``, ``trip_type``, ``nationality``, ``review_title``,
    ``review_body`` and ``review_date``. A field that cannot be found in a
    review is stored as ``None`` so that all six lists stay the same length.

    Scraping stops early when no next-page arrow is found. The browser is
    always closed, even if an error occurs part-way through.

    Args:
        url: Address of the first page to load.
        max_pages: Maximum number of review pages to scrape.
        wait_seconds: Seconds to pause after the initial page load and after
            each click on the next-page arrow.

    Returns:
        None. Results are appended to the module-level lists, and the total
        run time in minutes is printed on success.
    """
    start_time = time()

    # OPENS BROWSER
    driver = webdriver.Chrome()

    try:
        # OPENS FIRST PAGE
        driver.get(url)
        sleep(wait_seconds)

        # Before scraping starts, close the window that prompts for dates.
        # find_elements("xpath", ...) is used instead of
        # find_elements_by_xpath because the latter was removed in Selenium 4,
        # while this form works on both Selenium 3 and 4.
        close = driver.find_elements(
            "xpath",
            "//a[@class='AlertMessage__close CalendarAlertMessage__close']",
        )
        # The prompt is not guaranteed to appear; only click it when present.
        if close:
            close[0].click()

        page = 1
        while page <= max_pages:

            # GET INNER HTML
            soup = BeautifulSoup(driver.page_source, "lxml")

            # Each container represents ONE user's review.
            containers = soup.findAll('div', {'class': 'Review-comment'})

            for container in containers:
                spans = container.findAll('span')

                # STAY DATE is read from the third <span>; a review with fewer
                # spans gets None instead of raising IndexError.
                stay_date.append(
                    _text_or_none(spans[2] if len(spans) > 2 else None)
                )

                # TRIP TYPE is read from the first <span>.
                trip_type.append(_text_or_none(spans[0] if spans else None))

                # REVIEWER NATIONALITY
                nationality.append(_text_or_none(
                    container.find('div', {'class': 'Review-comment-reviewer'})
                ))

                # REVIEW TITLE
                review_title.append(_text_or_none(
                    container.find('p', {'class': 'Review-comment-bodyTitle'})
                ))

                # REVIEW BODY
                review_body.append(_text_or_none(
                    container.find('p', {'class': 'Review-comment-bodyText'})
                ))

                # REVIEW DATE. The trailing space was dropped from the class
                # name: Beautiful Soup compares against individual class
                # tokens, which never contain whitespace.
                review_date.append(_text_or_none(
                    container.find('span', {'class': 'Review-statusBar-date'})
                ))

            # No need to load another page once the requested number is done.
            if page >= max_pages:
                break

            # Toggle to the next page; stop when there is no arrow to click
            # (e.g. the last page of reviews has been reached).
            next_page_button = driver.find_elements(
                "xpath",
                "//i[@class='ficon ficon-24 ficon-carrouselarrow-right']",
            )
            if not next_page_button:
                break
            next_page_button[0].click()
            sleep(wait_seconds)

            page = page + 1
    finally:
        # Always release the browser, even when scraping fails.
        driver.quit()

    total_elapsed_time = time() - start_time
    print('Total Run Time:', total_elapsed_time/60, 'min')

### Check scraped data

In [None]:
# Sanity check: print the length of every scraped list so that mismatched
# lengths are visible before the lists are zipped into a DataFrame.
print('rows of stay_date', len(stay_date))
print('rows of trip_type', len(trip_type))
print('rows of review_date', len(review_date))
print('rows of review_body', len(review_body))
print('rows of review_title', len(review_title))
# Report the nationality list itself (previously this printed
# len(review_title) under the nationality label).
print('rows of nationality', len(nationality))

### Put into Dataframe

In [None]:
# put all into dataframe
zippedList =  list(zip(stay_date, trip_type, review_date, review_body, review_title, nationality))

reviews = pd.DataFrame(zippedList, columns = ['date_of_stay' , 'trip_type', 'date_of_review',\
                                              'review', 'review_title', 'nationality'])

print(reviews.shape)

### Save to csv (initial scrape)

In [None]:
# Write the reviews to a new CSV file (header row included, no index column).
reviews.to_csv(
    path_or_buf='./Data/MOH_AGODA_REVIEWS.csv',
    index=False,
)

### Append to csv (subsequent scrape)

In [None]:
# Append the reviews to the existing CSV file; the header row is skipped
# because mode='a' adds rows after whatever the file already contains.
reviews.to_csv(
    path_or_buf='./Data/MOH_AGODA_REVIEWS.csv',
    mode='a',
    header=False,
    index=False,
)