In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

In [2]:
# Browser-like User-Agent header sent along with every page request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.141 Safari/537.36"
    )
}

In [3]:
# One accumulator list per scraped field.
names, nums, ratings = [], [], []
hours, prices, categories = [], [], []

# Column label -> accumulator list. The dictionary holds the very same list
# objects as the names above, so appending to a list also updates the dictionary.
dat_dict = dict([
    ('Tour Name', names),
    ('Type', categories),
    ('Rating', ratings),
    ('Number of reviews', nums),
    ('Price', prices),
    ('Duration', hours),
])

In [4]:
def get_page(url, timeout=30):
    """Download a page and parse its HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; handed
            straight to ``requests.get``.

    Returns:
        BeautifulSoup object built from the response text with the
        'html.parser' backend.

    Raises:
        requests.exceptions.Timeout: The server did not answer within
            ``timeout`` seconds.
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status code.
    """
    # requests has no default timeout; without one a stalled connection would
    # block the notebook indefinitely.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Stop on error responses instead of parsing an error page as if it were
    # a page of results.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')  # parse the html content
    return soup

In [5]:
def get_data(soup):
    """Append the tour details found on one parsed page to the module-level lists.

    Args:
        soup: soup object returned from the get_page function.

    Returns:
        dat_dict, whose values are the same lists this function appends to.

    Note:
        Every field is gathered by its own page-wide search, so nothing here
        pairs the i-th entry of one list with the i-th entry of another, and
        the lists can end up with different lengths.
    """
    def div_texts(css_classes):
        # whitespace-trimmed text of every <div> found for the given class string
        return [div.text.strip() for div in soup.find_all('div', {'class': css_classes})]

    # tour name
    names.extend(div_texts('XfVdV o AIbhI'))

    # review number
    nums.extend(div_texts('jVDab o W f u w JqMhy'))

    # review rating: taken from the aria-label attribute of the svg element
    ratings.extend(
        svg.get('aria-label')
        for svg in soup.find_all('svg', {'class': 'UctUV d H0 hzzSG'})
    )

    # tour duration
    hours.extend(div_texts('bRMrl _Y K fOSqw'))

    # tour price
    prices.extend(div_texts('biGQs _P fiohW avBIb ngXxk'))

    # tour category
    for wrapper in soup.find_all('div', {'class': 'alPVI eNNhq PgLKC tnGGX yzLvM'}):
        # The category text sits inside the alPVI div, and other (non-category)
        # items use the same classes; recursive=False restricts the lookup to
        # direct children of the alPVI div.
        label_div = wrapper.find('div', {'class': 'biGQs _P pZUbB hmDzD'}, recursive=False)
        if not label_div:
            continue
        categories.append(label_div.text.strip())

    return dat_dict

In [6]:
# Scrape the main (first) search page of Tokyo tours.
url = (
    "https://www.tripadvisor.com/"
    "Attractions-g298184-Activities-c42-Tokyo_Tokyo_Prefecture_Kanto.html"
)
soup = get_page(url)
page_data = get_data(soup)

In [7]:
# Scrape the 34 search pages that follow the main one.
root = "https://www.tripadvisor.com/Attractions-g298184-Activities-c42-oa"
sub = "-Tokyo_Tokyo_Prefecture_Kanto.html"

# Search pages are addressed by an offset that grows in steps of 30:
# oa30, oa60, oa90, ... up to oa1020.
for page in range(1, 35):
    url = f"{root}{30 * page}{sub}"
    soup = get_page(url)
    page_data = get_data(soup)

In [8]:
# The scraped lists can have different lengths. Building the frame with one
# row per field (orient='index') copes with that; transposing afterwards turns
# each field into a column, which is the wanted format.
df = pd.DataFrame.from_dict(page_data, orient='index').T

In [9]:
# Preview the first 10 rows of the scraped table.
df.head(10)

Unnamed: 0,Tour Name,Type,Rating,Number of reviews,Price,Duration,url
0,1. 1 Day Private Mt Fuji Tour (Charter) - Engl...,Cable Car Tours,5.0 of 5 bubbles,159,$473.38,6+ hours,
1,2. Mt Fuji and Hakone 1-Day Bus Tour return by...,Audio Guides,5.0 of 5 bubbles,1292,$141.09,6+ hours,
2,3. Tokyo Full-Day Private Tour with Government...,Public Transportation Tours,5.0 of 5 bubbles,571,$152.04,6 hours,
3,"4. Mt Fuji, Hakone Lake Ashi Cruise Bullet Tra...",Bus Tours,4.0 of 5 bubbles,652,$133.94,6+ hours,
4,5. Private Full Day Sightseeing Tour to Mount ...,Bus Tours,5.0 of 5 bubbles,286,$647.98,6+ hours,
5,6. Kart experience in Shinjuku drive metropoli...,4WD Tours,4.5 of 5 bubbles,125,$137.56,1–2 hours,
6,7. 1-Day Tokyo Bus Tour,Audio Guides,5.0 of 5 bubbles,1669,$111.39,9–10 hours,
7,8. DAIKOKU PA Tokyo Drift CAR MEET,Luxury Car Tours,5.0 of 5 bubbles,13,$144.80,5–6 hours,
8,9. Tokyo Bar Hopping Night Tour in Shinjuku,Spring Break,5.0 of 5 bubbles,541,$112.22,3 hours,
9,"10. Eat and Drink Like a LOCAL : Restaurant, T...",Food & Drink,5.0 of 5 bubbles,412,$119.46,3 hours,


In [10]:
# Write the table to tour.csv: keep the column header row, leave out the index.
df.to_csv("tour.csv", header=True, index=False)