In [None]:
import json
import os
import time

import pandas as pd
import pymongo
from bs4 import BeautifulSoup
from pymongo import MongoClient
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

In [None]:
class timeit():
    """Context manager that prints the wall-clock time spent inside its ``with`` block.

    Usage::

        with timeit() as timer:
            do_work()
        timer.elapsed  # datetime.timedelta of the block's duration

    Attributes:
        tic: ``datetime`` captured when the block was entered.
        elapsed: ``timedelta`` set when the block exits.
    """
    # Imported in the class body, so it is reachable as ``self.datetime`` below.
    from datetime import datetime

    def __enter__(self):
        """Record the start time and return the timer so ``as`` binds a usable object."""
        self.tic = self.datetime.now()
        return self

    def __exit__(self, *args, **kwargs):
        """Store and print the elapsed time.

        Returns None, so an exception raised inside the block is not suppressed.
        """
        self.elapsed = self.datetime.now() - self.tic
        print('runtime: {}'.format(self.elapsed))

In [None]:
def init_browser():
    """Start and return a visible (non-headless) Chrome browser driven by splinter.

    The chromedriver binary is looked up as ``chromedriver.exe``; replace that
    value with the actual path to your chromedriver if it lives elsewhere.
    """
    return Browser("chrome", executable_path="chromedriver.exe", headless=False)

In [None]:
# Data to collect from Airbnb and TripAdvisor:
# url / about / name / price / lat_lon / address (zip code for Airbnb) / review_num / rating / photos (4)
def _extract_photo_urls(soup, limit=4):
    """Return the image URLs of the first ``limit`` photo tiles found in ``soup``."""
    tiles = soup.find_all("div", class_="media-image-ResponsiveImage__default--1s-9x")
    urls = []
    for tile in tiles[:limit]:
        style = tile.get('style') or ""
        # The URL is read from an inline CSS ``url(...)`` value. A tile without
        # one is skipped instead of failing the whole listing.
        if 'url(' in style:
            urls.append(style.split('url(')[1].split(')')[0])
    return urls


def _parse_listing(soup, url, index):
    """Build the details dict for one parsed listing page.

    Name, price, address, review count and rating are required: if any of their
    elements is missing the resulting exception propagates to the caller.
    Description and coordinates are optional and default to ``""``.
    """
    # Get name
    name = soup.find("h1", class_="ui_header h1").text

    # Get price (text after the first '$')
    price = soup.find('div', class_='price_night').div.text.split('$')[1]

    # Get description
    try:
        about = soup.find('div', class_='common-text-ReadMore__content--2X4LR').text
    except AttributeError:
        # find() returned None: the page has no description block.
        about = ""

    # Get latitude and longitude from the ``center=`` parameter of the static map URL
    try:
        map_img = soup.find('img', class_='hotels-hotel-review-location-StaticMap__map--3L4sb')
        lat_lon = map_img.attrs['src'].split('center=')[1].split('&')[0].split(',')
    except (AttributeError, KeyError, IndexError):
        lat_lon = ""
        print("LAT_LON does not exist for: " + url)
        # str() is required here: concatenating the int index raised TypeError,
        # which used to discard the entire listing.
        print("Index #" + str(index))

    # Get address
    address = soup.find('span', class_='public-business-listing-ContactInfo__ui_link--1_7Zp public-business-listing-ContactInfo__level_4--3JgmI').text

    # Get review number (first whitespace-separated token of the link text)
    review_num = soup.find('span', class_='hotels-hotel-review-about-with-photos-Reviews__seeAllReviews--3PpLR').text.split(" ")[0]

    # Get rating
    rating = soup.find('span', class_='hotels-hotel-review-about-with-photos-Reviews__overallRating--vElGA').text

    return {
        "listing_url": url,
        "listing_name": name,
        "listing_price": price,
        "listing_about": about,
        "listing_lat_lon": lat_lon,
        "listing_address": address,
        "listing_review_num": review_num,
        "listing_rating": rating,
        "listing_photos": _extract_photo_urls(soup),
    }


def scrape(items, base_url, key, wait_seconds=5):
    """Visit every listing link of one city and collect its details.

    Args:
        items: iterable of listing paths; each is appended to ``base_url``.
            Paths containing ``"plus"`` are skipped.
        base_url: site root prepended to every path.
        key: city key, used only in the final progress message.
        wait_seconds: pause after each page visit before reading its HTML.

    Returns:
        list of dicts with the ``listing_*`` keys built by ``_parse_listing``.
        Listings whose required fields cannot be parsed are reported with an
        "Error : <url>" line and left out, so the list holds no empty dicts.

    The total runtime is printed by ``timeit`` and the browser is always closed,
    even when a page visit raises.
    """
    data = []
    with timeit():
        browser = init_browser()
        try:
            index = 0  # 1-based position among the non-"plus" listings
            for item in items:
                if "plus" in item:
                    continue
                index += 1
                url = base_url + item
                browser.visit(url)
                time.sleep(wait_seconds)
                try:
                    soup = BeautifulSoup(browser.html, "html.parser")
                    details = _parse_listing(soup, url, index)
                except Exception as exc:
                    # Best effort: report the listing and move on to the next one.
                    print("Error : " + url + " (" + repr(exc) + ")")
                    continue
                data.append(details)
        finally:
            browser.quit()
        print("City Processed: " + key.upper())
    return data

In [None]:
# Walk the hotel search results of every city, gather the listing links and
# scrape each listing's detail page into `listings[<city key>]`.
listings = {}

browser = init_browser()

base_url = "https://www.tripadvisor.com"
# One single-entry mapping per city: city key -> path of its hotel search results.
uris = [{"bangkok":"/Hotels-g293916-Bangkok-Hotels.html"},
        {"london":"/Hotels-g186338-London_England-Hotels.html"},
        {"paris":"/Hotels-g187147-Paris_Ile_de_France-Hotels.html"},
        {"dubai":"/Hotels-g295424-Dubai_Emirate_of_Dubai-Hotels.html"},
        {"singapore":"/Hotels-g294265-Singapore-Hotels.html"},
        {"nyc":"/Hotels-g60763-New_York_City_New_York-Hotels.html"},
        {"kl":"/Hotels-g298570-Kuala_Lumpur_Wilayah_Persekutuan-Hotels.html"},
        {"tokyo":"/Hotels-g298184-Tokyo_Tokyo_Prefecture_Kanto-Hotels.html"},
        {"istanbul":"/Hotels-g293974-Istanbul-Hotels.html"},
        {"seoul":"/Hotels-g294197-Seoul-Hotels.html"}
       ]


def gather_listing_links(browser, start_url, key, max_pages=15):
    """Collect listing hrefs from up to ``max_pages`` result pages of one city.

    Args:
        browser: open splinter browser used for navigation.
        start_url: URL of the city's first result page.
        key: city key, used only in progress messages.
        max_pages: maximum number of result pages to read.

    Returns:
        list of href strings gathered so far; a missing "Next" button or any
        other error ends the walk early but never discards collected links.
    """
    links = []
    try:
        browser.visit(start_url)
        for _ in range(max_pages):
            time.sleep(5)  # give the result page time to render
            soup = BeautifulSoup(browser.html, 'html.parser')
            for title in soup.find_all('div', class_="listing_title"):
                anchor = title.find('a')
                href = anchor.get('href') if anchor is not None else None
                # Titles without a usable link are skipped: a None href would
                # break the string handling in scrape().
                if href:
                    links.append(href)
            browser.find_by_css('a[class="nav next taLnk ui_button primary"]').click()
    except ElementDoesNotExist:
        # No "Next" button: the last result page has been reached.
        print("City: " + key.upper() + " - URL gathering completed")
    except Exception as exc:
        # Unexpected failure: say so instead of reporting completion.
        print("City: " + key.upper() + " - URL gathering stopped early: " + repr(exc))
    else:
        print("City: " + key.upper() + " - URL gathering completed")
    return links


try:
    for uri in uris:
        for key, value in uri.items():
            items = gather_listing_links(browser, base_url + value, key)
            listings[key] = scrape(items, base_url, key)
finally:
    # Close the navigation browser even if a city fails or the run is interrupted.
    browser.quit()

In [None]:
# Keep a shallow copy of the listings dict as a backup source
# (top-level keys are independent; the per-city lists are shared with `listings`)
listings_copy = dict(listings)

In [None]:
# Sanity check: `uris` lists 10 cities, so the number of keys should be 10
len(listings_copy)

In [None]:
# Display the keys (one per city) held in the backup dict
listings_copy.keys()

In [None]:
# Remove the '_id' key from the dict if it is present.
# pop() with a None default never raises; when there is no '_id' it returns None
# and the dict is left unchanged.
# NOTE(review): '_id' is presumably the ObjectId that pymongo's insert_one() adds
# to the inserted dict on a previous run — confirm.
listings_copy.pop('_id', None)

In [None]:
# Initialize PyMongo to work with MongoDB.
# The connection string is read from the MONGODB_URI environment variable so that
# credentials never live in the notebook; without it a local server is used.
# NOTE(review): the Atlas user/password previously hard-coded in this cell is
# exposed in the notebook's history and should be rotated.
connect = os.environ.get("MONGODB_URI", "mongodb://localhost:27017")
client = MongoClient(connect)

In [None]:
# Select the database and the collection that will hold the scraped listings
db = client["city_explorer"]
city_tripadvisor = db["city_tripadvisor"]

In [None]:
# insert_one() adds an '_id' (ObjectId) key to the mapping it is given. Passing a
# shallow copy keeps `listings_copy` free of it, so the later JSON export works on
# a top-to-bottom run and re-running this cell does not raise a duplicate-key error.
city_tripadvisor.insert_one(dict(listings_copy))

In [None]:
# Debugging the list of lat_lon. There are values that might be empty
# df = pd.DataFrame.from_dict(listings_copy['singapore'])['listing_lat_lon']
# df

In [None]:
# Displays full list of output
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)

In [None]:
# Export dict to json for backup.
# An '_id' key (a pymongo ObjectId, which json cannot serialize) may be present if
# the dict was passed to insert_one(); it is filtered out here rather than relying
# on a manual pop having been run first.
listings_export = {city: rows for city, rows in listings_copy.items() if city != '_id'}
with open('result.json', 'w') as fp:
    json.dump(listings_export, fp)

In [None]:
# Manual approach to clean out empty dictionaries
# dubai1 = {"dubai": [i for i in dict2['dubai'] if i]}
# kl1 = {"kl": [i for i in dict2['kl'] if i]}
# bangkok1 = {"bangkok": [i for i in dict2['bangkok'] if i]}
# london1 = {"london": [i for i in dict2['london'] if i]}
# paris1 = {"paris": [i for i in dict2['paris'] if i]}
# singapore1 = {"singapore": [i for i in dict2['singapore'] if i]}
# nyc1 = {"nyc": [i for i in dict2['nyc'] if i]}
# tokyo1 = {"tokyo": [i for i in dict2['tokyo'] if i]}
# istanbul1 = {"istanbul": [i for i in dict2['istanbul'] if i]}
# seoul1 = {"seoul": [i for i in dict2['seoul'] if i]}

In [None]:
# clean_listing = {}

In [None]:
#clean_listing.update(seoul1) # Must be done with all dictionaries after the cleaning