# Trip Advisor Web Scraper

## Requirements:
- Selenium
- BeautifulSoup
- time (Python standard library module)

In [None]:
import csv
import time
from bs4 import BeautifulSoup

In [None]:
# Firefox and Chrome
from selenium import webdriver

## Startup the webdriver

In [None]:
# Start a Chrome browser session driven by Selenium; later cells reuse `driver`.
driver = webdriver.Chrome()

In [None]:
# Review page of the Tegalalang Rice Terrace attraction; open it in the browser.
url = 'https://www.tripadvisor.com/Attraction_Review-g608497-d1515658-Reviews-Tegalalang_Rice_Terrace-Tegalalang_Gianyar_Regency_Bali.html'
driver.get(url)

## Extract the collection

In [None]:
# Parse the HTML the browser currently holds, using Python's built-in html.parser.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [None]:
# First <div> whose class is 'bPhtn' (None if the page has no such element).
# Presumably the container of the review list -- confirm against the page markup.
parent = soup.find('div', {'class': 'bPhtn'})

## Prototype the record

In [None]:
# Every <span data-ft="true"> found inside the container selected above.
results = parent.find_all('span', {'data-ft': 'true'})

In [None]:
# Remove the elements at indices 0, 2, 4, ... in place, keeping the odd-indexed ones.
# Presumably the matching spans come in pairs and only the second of each pair
# is a review card -- TODO confirm against the page markup.
del results[0::2]

In [None]:
# Take one sample element (index 8 of the thinned list) to prototype the field
# extraction on; raises IndexError if fewer than nine elements remain.
item = results[8]

In [None]:
# Text of the first <span> with class 'WlYyy cPsXC dTqpp'; stored as the reviewer name.
name = item.find('span', {'class': 'WlYyy cPsXC dTqpp'}).text

In [None]:
# <div> with class 'WlYyy diXIH bQCoY'; the next cell reads the city from its first <span>.
cityParent = item.find('div', {'class': 'WlYyy diXIH bQCoY'})

In [None]:
# Text of the first <span> inside that <div>; stored as the reviewer's city.
city = cityParent.find('span').text

In [None]:
# Full text of the <div> with class 'fEDvV'; the date is cut out of it in the next cell.
history = item.find('div', {'class': 'fEDvV'}).text

In [None]:
# Keep only the first 8 characters of the history text.
# Presumably an abbreviated 'Mon YYYY' date prefix -- verify against real data.
date = history[:8]

In [None]:
# Text of the <div> with class 'WlYyy diXIH dDKKM'; stored as the review body.
review = item.find('div', {'class': 'WlYyy diXIH dDKKM'}).text

## Generalise the pattern

In [None]:
def extract_record(item):
    """Build one ``(name, review, city, date)`` tuple from a review element.

    ``item`` is searched with ``item.find(tag, {'class': ...})``. The name and
    the review text are mandatory: if either lookup fails with
    ``AttributeError`` the function returns ``None``. City and date are
    optional and fall back to the string ``'NaN'``.
    """
    placeholder = 'NaN'

    # reviewer name (mandatory)
    try:
        reviewer = item.find('span', {'class': 'WlYyy cPsXC dTqpp'}).text
    except AttributeError:
        return None

    # city: first <span> of the origin <div>
    try:
        origin_box = item.find('div', {'class': 'WlYyy diXIH bQCoY'})
        origin = origin_box.find('span').text
    except AttributeError:
        origin = placeholder
    else:
        # a text mentioning 'contribution' is not treated as a city
        if 'contribution' in origin:
            origin = placeholder

    # date: first 8 characters of the history line
    try:
        when = item.find('div', {'class': 'fEDvV'}).text[:8]
    except AttributeError:
        when = placeholder

    # review text (mandatory)
    try:
        body = item.find('div', {'class': 'WlYyy diXIH dDKKM'}).text
    except AttributeError:
        return None

    return (reviewer, body, origin, when)

In [None]:
# Run the extraction over every review element of the page parsed into `soup`.
records = []

parent = soup.find('div', {'class': 'bPhtn'})
results = parent.find_all('span', {'data-ft': 'true'})
# drop indices 0, 2, 4, ... in place; only the odd-indexed elements are processed
del results[::2]

for item in results:
    record = extract_record(item)
    if not record:
        # extract_record returned None: name or review text was missing
        continue
    records.append(record)

In [None]:
# Sanity check: print the date field (index 3 of each tuple) of every record.
for row in records:
    print(row[3])

## Getting the next page

In [None]:
def get_url(page):
    """Return the review URL for the given page index.

    The ``or{}`` slot of the URL is filled with ``page * 10``, so page 0
    yields ``or0``, page 1 yields ``or10``, and so on.
    """
    offset = page * 10
    template = 'https://www.tripadvisor.com/Attraction_Review-g608497-d1515658-Reviews-or{}-Tegalalang_Rice_Terrace-Tegalalang_Gianyar_Regency_Bali.html'
    return template.format(offset)

## Putting all together

In [None]:
import csv
from bs4 import BeautifulSoup

# Firefox and Chrome
from selenium import webdriver


def get_url(page):
    """Return the review URL for the given page index.

    The ``or{}`` slot of the URL is filled with ``page * 10``, so page 0
    yields ``or0``, page 1 yields ``or10``, and so on.
    """
    offset = page * 10
    template = 'https://www.tripadvisor.com/Attraction_Review-g608497-d1515658-Reviews-or{}-Tegalalang_Rice_Terrace-Tegalalang_Gianyar_Regency_Bali.html'
    return template.format(offset)

def extract_record(item):
    """Build one ``(name, review, city, date)`` tuple from a review element.

    ``item`` is searched with ``item.find(tag, {'class': ...})``. The name and
    the review text are mandatory: if either lookup fails with
    ``AttributeError`` the function returns ``None``. City and date are
    optional and fall back to the string ``'NaN'``.
    """
    placeholder = 'NaN'

    # reviewer name (mandatory)
    try:
        reviewer = item.find('span', {'class': 'WlYyy cPsXC dTqpp'}).text
    except AttributeError:
        return None

    # city: first <span> of the origin <div>
    try:
        origin_box = item.find('div', {'class': 'WlYyy diXIH bQCoY'})
        origin = origin_box.find('span').text
    except AttributeError:
        origin = placeholder
    else:
        # a text mentioning 'contribution' is not treated as a city
        if 'contribution' in origin:
            origin = placeholder

    # date: first 8 characters of the history line
    try:
        when = item.find('div', {'class': 'fEDvV'}).text[:8]
    except AttributeError:
        when = placeholder

    # review text (mandatory)
    try:
        body = item.find('div', {'class': 'WlYyy diXIH dDKKM'}).text
    except AttributeError:
        return None

    return (reviewer, body, origin, when)

def main(pages):
    """Scrape ``pages`` review pages and save the records to ``tripadvisor.csv``.

    Pages ``0 .. pages - 1`` are loaded one by one through ``get_url``; every
    review element that ``extract_record`` can parse is collected.

    Parameters
    ----------
    pages : int
        Number of consecutive pages to visit, starting at page 0.

    The CSV is written with the header ``Name, Review, Origin, Date``. If a
    page raises an unexpected error the browser is still shut down and the
    error propagates; no CSV is written in that case.
    """
    records = []

    # start the webdriver
    driver = webdriver.Chrome()
    try:
        for page in range(pages):
            driver.get(get_url(page))

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            parent = soup.find('div', {'class': 'bPhtn'})
            if parent is None:
                # find() returns None when the container is absent; skip the
                # page instead of crashing on parent.find_all below.
                print('No review container found on page', page)
                continue

            results = parent.find_all('span', {'data-ft': 'true'})
            # Keep only the odd-indexed elements.
            # Presumably the spans come in pairs -- confirm against the markup.
            del results[0::2]

            for item in results:
                record = extract_record(item)
                if record:
                    records.append(record)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window) and runs even when a page fails.
        driver.quit()

    # save data to csv file
    with open('tripadvisor.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Name', 'Review', 'Origin', 'Date'])
        writer.writerows(records)

In [None]:
# Scrape pages 0-4 and write tripadvisor.csv.
main(5)

## Page navigation

In [None]:
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException

In [None]:
# Fresh Chrome session for the interactive navigation experiments below.
driver = webdriver.Chrome()

In [None]:
# First review page of the attraction (no "or<offset>" segment in the URL).
url = 'https://www.tripadvisor.com/Attraction_Review-g608497-d1515658-Reviews-Tegalalang_Rice_Terrace-Tegalalang_Gianyar_Regency_Bali.html'

In [None]:
# Load the page in the browser.
driver.get(url)

In [None]:
# Locate the <span> with class "WlYyy CETAK"; the next cell clicks it.
# Presumably this is the control that opens the review filter dialog -- confirm.
# find_element_by_xpath was removed in Selenium 4.3, so use the By locator API.
filters = driver.find_element(By.XPATH, '//span[@class = "WlYyy CETAK"]')

In [None]:
# Click the element located in the previous cell.
filters.click()

In [None]:
# Collect the inner <div> of every "Select Filter" button.
# find_elements returns an empty list, not an exception, when nothing matches.
# find_elements_by_xpath was removed in Selenium 4.3, so use the By locator API.
ratings = driver.find_elements(By.XPATH, '//button[@class="bHgte z Pc PQ Pp PD W _S Gn Z B2 BF Cj _M cbSHg eVjae fksET bxeeW ddFHE"][@type="button"][@aria-label="Select Filter"]/div[@class="vsqao k u"]')

In [None]:
# Click the filter options at positions 3 and 4 of the list found above.
for i in range(3, 5):
    try:
        ratings[i].click()

    except IndexError:
        # fewer than five options matched, so this position does not exist
        print('rating option not found', i)

    except WebDriverException:
        print('element is not clickable', i)

In [None]:
try:
    # Locate the label inside the button that applies the chosen filters and click it.
    # find_element_by_xpath was removed in Selenium 4.3, so use the By locator API.
    apply = driver.find_element(By.XPATH, '//button[@class="fGwNR _G B- z _S c Wc ddFHE ezIjy brHeh"][@type="button"]/span[@class="WlYyy bcUBw"]')
    apply.click()

except WebDriverException:
    # covers both "element not found" and "element not clickable"
    print('wrong apply')

In [None]:
# Scroll to the bottom of the document via JavaScript.
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

In [None]:
try:
    # Wait up to 10 s for the "Next page" link to be present in the DOM.
    # An earlier attempt located it by its full class list and used a native
    # .click(); the aria-label lookup plus JavaScript click below replaced it.
    nextPage = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//a[@aria-label="Next page"]'))
    )
    
    # Scroll the link into view, then trigger the click through JavaScript.
    driver.execute_script("arguments[0].scrollIntoView();", nextPage)
    driver.execute_script("arguments[0].click();", nextPage)
    
except Exception as e:
    # Any failure, including the wait timing out, is reported rather than raised.
    print(str(e))
    print('cant go to next page')

In [None]:
# Locate the <div> with class "cCnaz" and click it through JavaScript.
# find_element_by_xpath was removed in Selenium 4.3, so use the By locator API.
button = driver.find_element(By.XPATH, '//div[@class="cCnaz"]')
driver.execute_script("arguments[0].click();", button)

## Adding filter
Adding a filter on the review rating.

In [None]:
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Firefox and Chrome
from selenium import webdriver

# webdriver exception
from selenium.common.exceptions import WebDriverException

def get_url(page):
    """Return the review URL for the given page index.

    The ``or{}`` slot of the URL is filled with ``page * 10``, so page 0
    yields ``or0``, page 1 yields ``or10``, and so on.
    """
    offset = page * 10
    template = 'https://www.tripadvisor.com/Attraction_Review-g608497-d1515658-Reviews-or{}-Tegalalang_Rice_Terrace-Tegalalang_Gianyar_Regency_Bali.html'
    return template.format(offset)

def extract_record(item):
    """Build one ``(name, review, city, date)`` tuple from a review element.

    ``item`` is searched with ``item.find(tag, {'class': ...})``. The name and
    the review text are mandatory: if either lookup fails with
    ``AttributeError`` the function returns ``None``. City and date are
    optional and fall back to the string ``'NaN'``.
    """
    placeholder = 'NaN'

    # reviewer name (mandatory)
    try:
        reviewer = item.find('span', {'class': 'WlYyy cPsXC dTqpp'}).text
    except AttributeError:
        return None

    # city: first <span> of the origin <div>
    try:
        origin_box = item.find('div', {'class': 'WlYyy diXIH bQCoY'})
        origin = origin_box.find('span').text
    except AttributeError:
        origin = placeholder
    else:
        # a text mentioning 'contribution' is not treated as a city
        if 'contribution' in origin:
            origin = placeholder

    # date: first 8 characters of the history line
    try:
        when = item.find('div', {'class': 'fEDvV'}).text[:8]
    except AttributeError:
        when = placeholder

    # review text (mandatory)
    try:
        body = item.find('div', {'class': 'WlYyy diXIH dDKKM'}).text
    except AttributeError:
        return None

    return (reviewer, body, origin, when)

def main(pages):
    """Scrape ``pages`` review pages and save the records to ``tripadvisor1.csv``.

    Pages ``0 .. pages - 1`` are loaded one by one through ``get_url``; every
    review element that ``extract_record`` can parse is collected.

    Parameters
    ----------
    pages : int
        Number of consecutive pages to visit, starting at page 0.

    The CSV is written with the header ``Name, Review, Origin, Date``. If a
    page raises an unexpected error the browser is still shut down and the
    error propagates; no CSV is written in that case.
    """
    records = []

    # start the webdriver
    driver = webdriver.Chrome()
    try:
        for page in range(pages):
            driver.get(get_url(page))

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            parent = soup.find('div', {'class': 'bPhtn'})
            if parent is None:
                # find() returns None when the container is absent; skip the
                # page instead of crashing on parent.find_all below.
                print('No review container found on page', page)
                continue

            results = parent.find_all('span', {'data-ft': 'true'})
            # Keep only the odd-indexed elements.
            # Presumably the spans come in pairs -- confirm against the markup.
            del results[0::2]

            for item in results:
                record = extract_record(item)
                if record:
                    records.append(record)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window) and runs even when a page fails.
        driver.quit()

    # save data to csv file
    with open('tripadvisor1.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Name', 'Review', 'Origin', 'Date'])
        writer.writerows(records)

In [None]:
# Scrape pages 0-9 and write tripadvisor1.csv.
main(10)

## Final scraper

In [None]:
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Firefox and Chrome
from selenium import webdriver

# webdriver exception
from selenium.common.exceptions import WebDriverException

def extract_record(item):
    """Build one ``(name, review, city, date)`` tuple from a review element.

    ``item`` is searched with ``item.find(tag, {'class': ...})``. The name and
    the review text are mandatory: if either lookup fails with
    ``AttributeError`` the function returns ``None``. City and date are
    optional and fall back to the string ``'NaN'``.
    """
    placeholder = 'NaN'

    # reviewer name (mandatory)
    try:
        reviewer = item.find('span', {'class': 'WlYyy cPsXC dTqpp'}).text
    except AttributeError:
        return None

    # city: first <span> of the origin <div>
    try:
        origin_box = item.find('div', {'class': 'WlYyy diXIH bQCoY'})
        origin = origin_box.find('span').text
    except AttributeError:
        origin = placeholder
    else:
        # a text mentioning 'contribution' is not treated as a city
        if 'contribution' in origin:
            origin = placeholder

    # date: first 8 characters of the history line
    try:
        when = item.find('div', {'class': 'fEDvV'}).text[:8]
    except AttributeError:
        when = placeholder

    # review text (mandatory)
    try:
        body = item.find('div', {'class': 'WlYyy diXIH dDKKM'}).text
    except AttributeError:
        return None

    return (reviewer, body, origin, when)

def _apply_rating_filter(driver):
    """Open the filter dialog, tick filter options 3 and 4, and press apply.

    Returns ``True`` when both options were clicked (even if the final apply
    click failed, which is only reported), and ``False`` when the dialog or
    one of the options could not be reached.
    """
    try:
        filters = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//span[@class = "WlYyy CETAK"]'))
        )
        filters.click()
    except WebDriverException:
        print('Couldn\'t filter reviews')
        return False

    try:
        ratings = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//button[@class="bHgte z Pc PQ Pp PD W _S Gn Z B2 BF Cj _M cbSHg eVjae fksET bxeeW ddFHE"][@type="button"][@aria-label="Select Filter"]/div[@class="vsqao k u"]'))
        )
    except WebDriverException:
        print('Couldn\'t select all reviews')
        return False

    for i in range(3, 5):
        try:
            ratings[i].click()
        except (IndexError, WebDriverException):
            # IndexError: fewer than five filter options were found
            print('rating could\'nt be selected', i)
            return False

    try:
        # find_element_by_xpath was removed in Selenium 4.3; use the By API.
        apply = driver.find_element(By.XPATH, '//button[@class="fGwNR _G B- z _S c Wc ddFHE ezIjy brHeh"][@type="button"]/span[@class="WlYyy bcUBw"]')
        apply.click()
    except WebDriverException:
        # best effort: scraping continues even when apply could not be clicked
        print('could not apply')

    return True


def main(pages):
    """Scrape up to ``pages`` filtered review pages into ``tripadvisor.csv``.

    The first review page is opened, the rating filter is applied once, and
    the scraper then moves forward by clicking the "Next page" link.

    Parameters
    ----------
    pages : int
        Maximum number of pages to scrape.

    Scraping stops early when the review container is missing or the next
    page cannot be opened; records collected so far are still written. If
    the rating filter cannot be set up the function returns without writing
    a file. The browser session is always shut down.
    """
    records = []
    url = 'https://www.tripadvisor.com/Attraction_Review-g608497-d1515658-Reviews-Tegalalang_Rice_Terrace-Tegalalang_Gianyar_Regency_Bali.html'

    # start the webdriver
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Abort rather than keep driving a session that failed to set up.
        if pages > 0 and not _apply_rating_filter(driver):
            return

        for page in range(pages):
            # NOTE(review): page_source is read right after the apply/next
            # click with no wait for the new reviews to render -- confirm this
            # does not re-read the previous page's content.
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            parent = soup.find('div', {'class': 'bPhtn'})
            if parent is None:
                # find() returns None when the container is absent
                print('No review container found on page', page)
                break

            results = parent.find_all('span', {'data-ft': 'true'})
            # Keep only the odd-indexed elements.
            # Presumably the spans come in pairs -- confirm against the markup.
            del results[0::2]

            for item in results:
                record = extract_record(item)
                if record:
                    records.append(record)

            if page < pages - 1:
                try:
                    next_page = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.XPATH, '//a[@aria-label="Next page"]'))
                    )

                    driver.execute_script("arguments[0].scrollIntoView();", next_page)
                    driver.execute_script("arguments[0].click();", next_page)
                except WebDriverException as e:
                    print('Couldn\'t go to the next page')
                    print(str(e))
                    # Stop here: looping on would scrape the same page again
                    # and append duplicate records.
                    break
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window) and runs on every exit path.
        driver.quit()

    # save data to csv file
    with open('tripadvisor.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Name', 'Review', 'Origin', 'Date'])
        writer.writerows(records)

In [None]:
# Scrape up to 5 filtered pages and write tripadvisor.csv.
main(5)