# Scrape Reviews from TripAdvisor

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep, time

### Create empty lists to store scraped data

In [None]:
# One accumulator list per scraped field. The generator yields a fresh list for
# every name, so the six lists are independent objects (no shared aliasing).
review_body, review_title, trip_type, review_date, stay_date, hotel_reply = (
    [] for _ in range(6)
)

### Create url variable which is the first page of interest

In [None]:
# Starting page for the scrape, split across adjacent literals for readability.
# NOTE(review): the "-or10000-" segment is presumably the review offset used by
# the site for pagination -- confirm before changing it.
url = (
    'https://www.tripadvisor.com.sg/'
    'Hotel_Review-g294265-d300850-Reviews-or10000-'
    'Mandarin_Orchard_Singapore-Singapore.html'
)

### Create the function that does the scraping. It takes the given url and the maximum number of pages to scrape as arguments, and appends the scraped data to the empty lists created earlier

In [None]:
def _text_or_none(container, tag, css_class):
    """Return the text of the first `tag` with class `css_class` in `container`, or None."""
    element = container.find(tag, {'class': css_class})
    return element.get_text() if element is not None else None


def _hotel_reply_text(container):
    """Return the hotel's reply text for one review container, or None if there is no reply."""
    reply = container.find('span', {'class': 'hotels-review-list-parts-OwnerResponse__reviewText--28Wat'})
    if reply is None:
        return None
    # The reply text normally sits in a nested <span>; fall back to the outer
    # element's text instead of crashing if that nested span is missing.
    inner = reply.find('span')
    return inner.text if inner is not None else reply.get_text()


def scraper_tripadvisor(url, max_pages, page_load_wait=5):
    """Scrape hotel reviews from TripAdvisor, page by page.

    Opens a Chrome browser on `url`, expands the "Read more" links on each
    page, parses the page with BeautifulSoup and appends one entry per review
    to each of the module-level lists `stay_date`, `trip_type`,
    `review_title`, `review_body`, `review_date` and `hotel_reply` (None is
    appended for a field that is missing from a review). Because the lists are
    module-level, repeated calls keep accumulating into the same lists.

    Scraping stops after `max_pages` pages, or earlier when no enabled
    "next page" button can be found or clicked. The browser is always closed,
    even if an error occurs part-way through.

    Args:
        url: Address of the first review page to scrape.
        max_pages: Maximum number of pages to scrape.
        page_load_wait: Seconds to pause after loading the first page and
            after each "next page" click so the new content can render.

    Returns:
        None. Results are appended to the module-level lists and the total
        run time is printed.
    """
    # Imported locally so the block is self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    start_time = time()
    page = 1

    # OPEN BROWSER
    driver = webdriver.Chrome()

    try:
        # OPEN UP FIRST PAGE
        driver.get(url)
        sleep(page_load_wait)

        # implicitly_wait() does not pause: it only sets how long element
        # lookups may poll before giving up, so it is configured once here.
        driver.implicitly_wait(5)

        while page <= max_pages:

            # CLICK ON READ MORE (best effort: a link may be stale or covered)
            read_more = driver.find_elements(
                By.XPATH,
                "//div/div[@class='hotels-review-list-parts-ExpandableReview__ctaLine--1lhYi']",
            )
            for more in read_more:
                try:
                    more.click()
                except WebDriverException:
                    # A failed expand only means that review stays truncated.
                    continue

            # GET INNER HTML
            soup = BeautifulSoup(driver.page_source, "lxml")

            # CONTAINER IN PAGE (each represents ONE user's review)
            containers = soup.find_all(
                'div', {'class': 'hotels-review-list-parts-SingleReview__reviewContainer--d54T4'}
            )

            for container in containers:
                # Extract every field first, then append, so the six lists
                # always stay the same length even if extraction fails.
                stay = _text_or_none(
                    container, 'div', 'hotels-review-list-parts-EventDate__event_date--CRXs4')
                trip = _text_or_none(
                    container, 'div', 'hotels-review-list-parts-TripType__trip_type--2cnp7')
                title = _text_or_none(
                    container, 'a', 'hotels-review-list-parts-ReviewTitle__reviewTitleText--3QrTy')
                body = _text_or_none(
                    container, 'q', 'hotels-review-list-parts-ExpandableReview__reviewText--3oMkH')
                posted = _text_or_none(
                    container, 'div', 'social-member-event-MemberEventOnObjectBlock__event_type--3njyv')
                reply = _hotel_reply_text(container)

                stay_date.append(stay)
                trip_type.append(trip)
                review_title.append(title)
                review_body.append(body)
                review_date.append(posted)
                hotel_reply.append(reply)

            # increase page by 1
            page = page + 1

            # No need to navigate further once the requested pages are done.
            if page > max_pages:
                break

            # toggle to next page; the exact-class match finds nothing when the
            # button is absent or carries extra classes (e.g. on the last page).
            next_buttons = driver.find_elements(
                By.XPATH, "//a[@class='ui_button nav next primary ']"
            )
            if not next_buttons:
                print('No next page button found; stopping after page', page - 1)
                break
            try:
                next_buttons[0].click()
            except WebDriverException:
                print('Could not click next page button; stopping after page', page - 1)
                break

            # Give the next page time to render before it is parsed.
            sleep(page_load_wait)
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()

    total_elapsed_time = time() - start_time
    print('Total Run Time:', total_elapsed_time/60, 'min')

### Check scraped data

In [None]:
# Report how many entries each result list holds; all six counts should match,
# since every review contributes exactly one entry to each list.
for label, values in (
    ('rows of stay_date', stay_date),
    ('rows of trip_type', trip_type),
    ('rows of review_date', review_date),
    ('rows of review_body', review_body),
    ('rows of review_title', review_title),
    ('rows of hotel_reply', hotel_reply),
):
    print(label, len(values))

### Put into Dataframe

In [None]:
# Column headers, in the same order as the lists zipped together below.
column_names = [
    'date_of_stay', 'trip_type', 'date_of_review',
    'review', 'review_title', 'hotel_response',
]

# One tuple per review; zip() stops at the shortest list.
zippedList = list(
    zip(stay_date, trip_type, review_date, review_body, review_title, hotel_reply)
)

reviews = pd.DataFrame(zippedList, columns=column_names)
print(reviews.shape)
reviews.head()

### Save to csv (initial scrape)

In [None]:
# Initial scrape: write the reviews to a new CSV (header row included, no index column).
reviews.to_csv(path_or_buf='./Data/MOH_TA_REVIEWS.csv', index=False)

### Append to csv (subsequent scrape)

In [None]:
# Subsequent scrape: append rows to the existing CSV without repeating the header.
reviews.to_csv(
    path_or_buf='./Data/MOH_TA_REVIEWS.csv',
    mode='a',
    header=False,
    index=False,
)