# TripAdvisor Restaurants

Kevin Jiahua Du <br />
23 June, 2020

In [1]:
import datetime
# Stamp the notebook with the wall-clock time of this run.
print('Last Updated: {}'.format(datetime.datetime.now()))

Last Updated: 2020-06-26 00:23:13.160389


### Introduction

This notebook shows how to collect restaurant information in Victoria, Australia from TripAdvisor. The code can be modified to collect data from other areas.

In [2]:
from bs4 import BeautifulSoup
import requests
import re
import json
import ast
import time
import random
import pandas as pd
from tqdm import tqdm
import pickle
from multiprocessing import Pool, Manager
import os
import glob
import tarfile

# Do not truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Placeholder value stored wherever an attribute is missing (used as the
# default for optional fields and as the fill value for NaNs before export).
PH = '_'
# Site root; every relative URL scraped below is appended to it.
TA_BASE = 'https://www.tripadvisor.com.au'
# Mobile Safari (iPhone) User-Agent sent with every request.
# NOTE(review): presumably chosen so the site serves its mobile markup, which
# the selectors below appear to target -- confirm.
headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1'}

### Step 1
The task starts by gathering the list of regions from TripAdvisor.

In [None]:
# Maps region name -> relative URL of that region's restaurant listing.
region_dict = {}

init_url = '/Restaurants-g255098-Victoria.html'
MAX_PAGE = 99999  # safety cap on how many "See More" pages are followed

next_url = init_url
for page_num in range(MAX_PAGE):
    if next_url is None:
        # the previous page had no "See More" button: everything is collected
        break
    link = next_url
    # A timeout keeps a stalled connection from hanging the notebook forever,
    # and failing loudly on an HTTP error avoids silently saving a truncated
    # region list parsed from an error page.
    response = requests.get(TA_BASE + link, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    time.sleep(random.uniform(0, 2))  # random pause between requests

    # get next page: the URL is embedded in the "See More" button's onclick
    next_url = None
    for button in soup.findAll('div', {'class': 'button'}):
        anchor = button.find('a')
        if anchor is None or anchor.text.strip() != 'See More':
            continue
        # raw string: '\.' in a normal string literal is an invalid escape
        match = re.search(r'/MobileRestaurantSearch.+\.html', anchor.get('onclick', ''))
        if match is not None:
            next_url = match.group(0)

    # one entry per region listed on this page
    for entity in soup.findAll('li', {'class': 'snippetContainer'}):
        title = entity.find('div', {'class': 'title'})
        anchor = entity.find('a')
        if title is None or anchor is None or not anchor.get('href'):
            # skip snippets that do not carry both a name and a link
            continue
        e_name = title.text.strip()
        e_url = anchor['href']
        region_dict[e_name] = e_url

# Persist the result so that Step 2 can be run without repeating the crawl.
with open('regions.pickle', 'wb') as fp:
    pickle.dump(dict(region_dict), fp)

print(len(region_dict))

### Step 2
The restaurants within each region are then collected.

In [None]:
# Reload the region index written by Step 1.
with open('regions.pickle', 'rb') as pickle_file:
    region_dict = pickle.load(pickle_file)

In [None]:
# Manager-backed dict proxy shared with the Pool workers: get_region_html
# writes one entry per region into it, and the parent reads it back afterwards.
# NOTE: keep `reg_manager` bound to a name -- per the multiprocessing docs a
# manager process is shut down once its manager object is garbage collected.
reg_manager = Manager()
reg_html_dict = reg_manager.dict()

In [None]:
def _extract_restaurants(soup):
    """Return the restaurant records embedded in one listing page.

    The longest <script> element of the page is taken to be the one holding
    the page manifest; the JSON between 'pageManifest:' and '};(window.$WP'
    is decoded and the 'restaurants' list of the last API response that has
    one is returned. An empty list is returned when the page has no script
    or no response carries restaurants.

    Raises:
        ValueError: if the manifest markers are missing or the JSON is invalid.
    """
    scripts = soup.findAll('script')
    if not scripts:
        return []
    info_script = str(max(scripts, key=lambda script: len(str(script))))
    start = info_script.index('pageManifest:') + len('pageManifest:')
    end = info_script.index('};(window.$WP')
    api_responses = json.loads(info_script[start:end])['redux']['api']['responses']

    restaurants = []
    for payload in api_responses.values():
        data = payload.get('data') if isinstance(payload, dict) else None
        if isinstance(data, dict) and 'restaurants' in data:
            restaurants = data['restaurants']
    return restaurants


def get_region_html(e_name, e_url):
    """Collect the restaurant records of every listing page of one region.

    The result (a list of records, possibly empty) is stored in the shared
    ``reg_html_dict`` under ``e_name``. Regions already present in that dict
    are skipped, so an interrupted run can simply be restarted.

    Args:
        e_name: region name, used as the key in ``reg_html_dict``.
        e_url: relative URL of the region's first listing page.

    Returns:
        None.
    """
    if e_name in reg_html_dict:
        return
    att_urls = []
    max_page = 1             # corrected from the pagination bar of page 0
    offset = None            # listing offset of the second page
    second_page_href = None  # href of the second page, contains 'oa<offset>'
    page_num = 0
    while page_num < max_page:
        if page_num == 0:
            link = e_url
        else:
            # Substitute the offset directly instead of going through a
            # '%d' template, which would break on a literal '%' in the href.
            link = second_page_href.replace(
                'oa%d' % offset, 'oa%d' % (offset * page_num)
            )
        page_response = requests.get(TA_BASE + link, headers=headers, timeout=30)
        soup = BeautifulSoup(page_response.content, 'html.parser')
        time.sleep(random.uniform(0, 2))  # random pause between requests

        # get pagination info
        if page_num == 0:
            pages_div = soup.find('div', {'class': 'pages'})
            if pages_div is None:
                # no restaurants at all
                break
            page_boxes = pages_div.findAll('span', {'class': 'pagination-box'})
            if len(page_boxes) > 1:
                # multiple pages: last box holds the page count, the second
                # box links to page 2 and exposes the per-page offset
                max_page = int(page_boxes[-1].text)
                second_page_link = page_boxes[1].find('a')
                offset = int(second_page_link['data-offset'])
                second_page_href = second_page_link['href']
            # zero or one box: only one page, max_page stays 1
            print('[%s] %d          ' % (e_name, max_page))

        # Extracted per page, so a page without restaurants contributes
        # nothing instead of re-adding the previous page's records.
        att_urls.extend(_extract_restaurants(soup))
        page_num += 1

    reg_html_dict[e_name] = att_urls


# One (name, url) argument tuple per region, fanned out over 10 workers.
region_args = list(region_dict.items())
with Pool(10) as pool:
    pool.starmap(get_region_html, region_args)

# Snapshot the shared proxy into a plain dict before pickling it.
with open('region_htmls.pickle', 'wb') as out_file:
    pickle.dump(dict(reg_html_dict), out_file)

### Step 3
Subsequently, the detail page of each restaurant is downloaded and saved into a local folder named `attractions`.

In [None]:
# Reload the per-region restaurant records written by Step 2.
with open('region_htmls.pickle', 'rb') as pickle_file:
    reg_html_dict = pickle.load(pickle_file)

# Unique detail-page URLs across all regions (a restaurant may be listed
# under more than one region, hence the set).
att_urls = {
    listing['detailPageUrl']
    for listings in tqdm(reg_html_dict.values())
    for listing in listings
}
print(len(att_urls))

# Drop the reference to the loaded records; only the URLs are needed below.
reg_html_dict = None

In [None]:
def get_attraction_html(e_url):
    """Download one restaurant detail page into ``attractions/<uid>``.

    ``uid`` is the ``g<digits>-d<digits>`` token found in the URL. The page
    is stripped of its <style> and <link> elements and written prettified.
    A page whose file already exists is skipped, so the step can be re-run
    to fetch only what is still missing.

    Args:
        e_url: relative URL of the restaurant's detail page.

    Returns:
        None.
    """
    # raw string: '\d' in a normal string literal is an invalid escape
    match = re.search(r'g\d+-d\d+', e_url)
    if match is None:
        print('Skipping URL without a location id: %s' % e_url)
        return
    uid = match.group(0)
    fn = 'attractions/' + uid
    if os.path.isfile(fn):
        return
    print(uid)
    # The output folder may not exist yet on a fresh checkout.
    os.makedirs('attractions', exist_ok=True)
    response = requests.get(TA_BASE + e_url, headers=headers, timeout=30)
    time.sleep(random.uniform(0, 2))  # random pause between requests
    if not response.ok:
        # Do not save an error page: the file would look "done" on the next
        # run and the restaurant would never be fetched again.
        print('Request failed (%d): %s' % (response.status_code, uid))
        return
    soup = BeautifulSoup(response.content, 'html.parser')
    # Drop stylesheets and <link> tags before saving the page.
    for tag in soup.findAll(['style', 'link']):
        tag.extract()
    with open(fn, 'w') as f:
        f.write(soup.prettify())
    return
    

# Download the detail pages with 10 worker processes.
with Pool(10) as pool:
    pool.map(get_attraction_html, att_urls)

### Step 4
Finally, useful attributes are extracted from each restaurant's HTML file.

In [None]:
# Manager-backed dict proxy shared with the Pool workers: extract_att_html
# stores one attribute record per file name in it.
# NOTE: keep `att_manager` bound to a name -- per the multiprocessing docs a
# manager process is shut down once its manager object is garbage collected.
att_manager = Manager()
att_dict = att_manager.dict()

In [None]:
def extract_att_html(fn):
    """Extract restaurant attributes from one saved detail page.

    The page manifest JSON is read from the last <script> element containing
    '__WEB_CONTEXT__'; selected fields of its location and restaurant-overview
    API responses are stored as a flat dict in the shared ``att_dict`` under
    ``fn``. Files already present in ``att_dict`` are skipped. A file without
    such a script is reported as corrupted and skipped.

    Args:
        fn: path of the saved page; the text after the first '-d' in it is
            used as the location id.

    Returns:
        None.
    """
    if fn in att_dict:
        return
    print(fn)
    loc_id = fn[fn.index('-d') + len('-d'):]
    with open(fn, 'r') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    info_script = None
    for script in soup.findAll('script'):
        script_text = str(script)
        if '__WEB_CONTEXT__' in script_text:
            info_script = script_text
    if info_script is None:
        # remove the file and rerun Step 3
        print('Corrupted file found: %s' % fn)
        # Stop here: slicing None below would crash the worker and abort
        # the whole Pool.map call.
        return

    start = info_script.index('pageManifest:') + len('pageManifest:')
    end = info_script.index('};(window.$WP')
    responses = json.loads(info_script[start:end])['redux']['api']['responses']
    l_soup = responses['/data/1.0/location/%s' % loc_id]['data']
    r_soup = responses['/data/1.0/restaurant/%s/overview' % loc_id]['data']

    location = r_soup['location']
    contact = r_soup['contact']
    rating = r_soup['rating']
    tag_texts = r_soup['detailCard']['tagTexts']

    # one can investigate the json to extract more attributes
    att_dict[fn] = {
        'detailId': r_soup['detailId'],
        'geoId': r_soup['geoId'],
        # position
        'latitude': location['latitude'],
        'longitude': location['longitude'],
        'landmark': location['landmark'],
        'neighborhood': location['neighborhood'],
        # contact details
        'address': contact['address'],
        'email': contact['email'],
        'phone': contact['phone'],
        'website': contact['website'],
        # rankings and ratings
        'primaryRanking': rating['primaryRanking'],
        'secondaryRanking': rating['secondaryRanking'],
        'primaryRating': rating['primaryRating'],
        'reviewCount': rating['reviewCount'],
        'ratingQuestions': rating['ratingQuestions'],
        # tag lists from the detail card
        'priceRange': tag_texts['priceRange']['tags'],
        'cuisines': tag_texts['cuisines']['tags'],
        'dietaryRestrictions': tag_texts['dietaryRestrictions']['tags'],
        'meals': tag_texts['meals']['tags'],
        'features': tag_texts['features']['tags'],
        'establishmentType': tag_texts['establishmentType']['tags'],
        # fields from the location response
        'name': l_soup['name'],
        'awards': l_soup['awards'],

        # optional location fields fall back to the placeholder
        'loc_raw_ranking': l_soup.get('raw_ranking', PH),
        'loc_ranking_position': l_soup.get('ranking_position', PH),
        'loc_ranking_denominator': l_soup.get('ranking_denominator', PH),
        'loc_ranking': l_soup.get('ranking', PH),
        'price_level': l_soup.get('price_level', PH),

        'address_obj': l_soup['address_obj'],
        'display_hours': l_soup['display_hours'],
    }


# Parse every saved page in the attractions folder with 10 worker processes.
fns = glob.glob('attractions/*')
with Pool(10) as pool:
    pool.map(extract_att_html, fns)

In [None]:
# One row per restaurant; missing values are replaced by the placeholder
# before the table is written out.
records = list(dict(att_dict).values())
df = pd.DataFrame(records).fillna(value=PH)
df.to_csv('ta_restaurants.csv', header=True, index=False)

In [3]:
# Preview the first rows of the exported file.
preview = pd.read_csv('ta_restaurants.csv')
display(preview.head(5))

Unnamed: 0,address,address_obj,awards,cuisines,detailId,dietaryRestrictions,display_hours,email,establishmentType,features,geoId,landmark,latitude,loc_ranking,loc_ranking_denominator,loc_ranking_position,loc_raw_ranking,longitude,meals,name,neighborhood,phone,priceRange,price_level,primaryRanking,primaryRating,ratingQuestions,reviewCount,secondaryRanking,website
0,"16 - 20 Leeds St, Footscray, Maribyrnong, Vict...","{'street1': '16 - 20 Leeds St', 'street2': Non...","[{'award_type': 'CERTIFICATE_OF_EXCELLENCE', '...","[{'tagId': 10642, 'tagValue': 'Cafe'}, {'tagId...",7845043,"[{'tagId': 10665, 'tagValue': 'Vegetarian Frie...",_,_,"[{'tagId': 10591, 'tagValue': 'Restaurants'}]","[{'tagId': 10603, 'tagValue': 'Outdoor Seating...",2062777,_,-37.798176,#27 of 175 places to eat in Footscray,146,27,3.277150630950928,144.90114,"[{'tagId': 10597, 'tagValue': 'Breakfast'}, {'...",Rudimentary,_,+61 497 058 173,"[{'tagId': 10955, 'tagValue': 'Mid-range'}]",$$ - $$$,"{'rank': 6, 'totalCount': 15, 'category': 'Caf...",4.0,"[{'name': 'Food', 'rating': 40, 'icon': 'resta...",69,"{'rank': 25, 'totalCount': 143, 'category': 'R...",V0FBX2h0dHBzOi8vd3d3LmZhY2Vib29rLmNvbS9SdWRpbW...
1,"61 Glen Huntly Rd, Elwood, Port Phillip, Victo...","{'street1': '61 Glen Huntly Rd', 'street2': No...",[],[],4780531,[],"[{'days': 'Sun', 'times': ['5:00 pm - 11:00 pm...",_,"[{'tagId': 10591, 'tagValue': 'Restaurants'}]",[],1006517,_,-37.88188,#44 of 55 places to eat in Elwood,46,44,2.8545069694519043,144.98215,"[{'tagId': 10599, 'tagValue': 'Dinner'}]",Aree Bah,Elwood,+61 3 9531 7225,[],,"{'rank': 37, 'totalCount': 38, 'category': 'Re...",2.5,[],3,_,_
2,"717 Rathdowne St, Melbourne, Victoria 3054 Aus...","{'street1': '717 Rathdowne St', 'street2': Non...",[],"[{'tagId': 10642, 'tagValue': 'Cafe'}]",5102044,[],"[{'days': 'Sun', 'times': ['8:00 am - 4:00 pm'...",_,"[{'tagId': 9900, 'tagValue': 'Coffee & Tea'}]",[],255100,<b>1.8 km</b> from Melbourne Zoo,-37.78809,"#3,147 of 4,987 places to eat in Melbourne",4052,3147,3.039215087890625,144.97186,[],North Cafeteria,_,+61 3 9348 1276,"[{'tagId': 10955, 'tagValue': 'Mid-range'}]",$$ - $$$,"{'rank': 227, 'totalCount': 268, 'category': '...",3.5,[],7,_,ZVBVX2h0dHA6Ly93d3cuZmFjZWJvb2suY29tL25vcnRoY2...
3,"422 George St, Fitzroy, Yarra, Victoria 3065 A...","{'street1': '422 George St', 'street2': None, ...",[],"[{'tagId': 10681, 'tagValue': 'Australian'}]",4791186,[],_,_,"[{'tagId': 10591, 'tagValue': 'Restaurants'}]",[],1078374,_,-37.79677,#212 of 274 places to eat in Fitzroy,234,212,3.01935076713562,144.98244,[],20ft Monster,Fitzroy,+61 403 026 466,[],,"{'rank': 64, 'totalCount': 66, 'category': 'Au...",5.0,[],1,"{'rank': 185, 'totalCount': 207, 'category': '...",_
4,"390 Bridge Rd, Richmond, Yarra, Victoria 3121 ...","{'street1': '390 Bridge Rd', 'street2': None, ...",[],"[{'tagId': 10681, 'tagValue': 'Australian'}]",14140919,[],_,admin@frozenbyathousandblessings.com.au,"[{'tagId': 9909, 'tagValue': 'Dessert'}]",[],635736,_,-37.81932,#200 of 352 places to eat in Richmond,305,200,3.0963542461395264,145.00449,[],Frozen By A Thousand Blessings,_,+61 3 9421 0880,[],,"{'rank': 3, 'totalCount': 5, 'category': 'Dess...",4.5,[],4,_,SXlQX2h0dHA6Ly93d3cuZnJvemVuYnlhdGhvdXNhbmRibG...
