In [3]:
import os
import time

import pandas as pd
import pymongo
from bs4 import BeautifulSoup
from pymongo import MongoClient
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist


In [4]:
class timeit():
    """Context manager that prints the wall-clock time spent in its ``with`` body.

    Usage::

        with timeit() as timer:
            do_work()
        # prints "runtime: <timedelta>"; timer.elapsed holds the same timedelta

    Attributes set while running:
        tic: ``datetime`` captured when the block was entered.
        elapsed: ``timedelta`` between entering and leaving the block.
    """
    # Imported in the class body so it is reachable as ``self.datetime``.
    from datetime import datetime

    def __enter__(self):
        """Record the start time and return the timer so ``as`` binds it."""
        self.tic = self.datetime.now()
        return self

    def __exit__(self, *args, **kwargs):
        """Print the elapsed time; returns None so exceptions still propagate."""
        self.elapsed = self.datetime.now() - self.tic
        print('runtime: {}'.format(self.elapsed))

In [5]:
def init_browser(executable_path="chromedriver.exe", headless=False):
    """Start a Chrome session through splinter.

    Args:
        executable_path: Path to the chromedriver binary. Defaults to
            "chromedriver.exe"; pass your own path instead of editing the code.
        headless: Forwarded to ``Browser`` as its ``headless`` option.
            Defaults to False.

    Returns:
        The object returned by ``Browser("chrome", ...)``. The caller is
        responsible for calling ``quit()`` on it.
    """
    return Browser("chrome", executable_path=executable_path, headless=headless)

In [6]:
# Data to collect from Airbnb and TripAdvisor:
# url / about / name / price / lat_lon / address (zip code for Airbnb) / review_num / rating / photos (4)
def _parse_listing(html, url, index):
    """Extract the listing fields from the HTML of one listing page.

    Args:
        html: Page source of the listing.
        url: URL the page was loaded from (stored in the result and used in logs).
        index: 1-based position of the listing among the scraped ones (logs only).

    Returns:
        dict with the keys listing_url, listing_name, listing_price,
        listing_about, listing_lat_lon, listing_address, listing_review_num,
        listing_rating and listing_photos.

    Raises:
        AttributeError / KeyError: when a required element (name, price,
            address, review count, rating) is missing from the page.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Required fields: a missing element raises and the caller skips the listing.
    name = soup.find("span", class_="_18hrqvin").text
    price = soup.find('span', class_='_doc79r').text

    # Description is optional.
    try:
        about = soup.find('meta', attrs={'property': 'og:description'}).attrs['content']
    except Exception:
        about = ""

    # Latitude and longitude are read from the "center=" query parameter of the
    # map image URL; optional.
    try:
        map_src = soup.find('div', class_='_59m2yxn').img.attrs['src']
        lat_lon = map_src.split('center=')[1].split('&')[0].split(',')
    except Exception:
        lat_lon = ""
        print("LAT_LON does not exist for: " + url)
        # str() is required: concatenating the int directly raised TypeError,
        # which discarded the whole listing.
        print("Index #" + str(index))

    address = soup.find('div', class_='_czm8crp').text

    # The review count is the first space-separated token of the element text.
    review_num = soup.find('span', class_='_s1tlw0m').text.split(" ")[0]

    rating = soup.find('div', attrs={'itemprop': 'ratingValue'}).attrs['content']

    # Keep at most the first four photos.
    photos = [photo.get('src') for photo in soup.find_all("img", class_="_uttz43")[:4]]

    return {
        "listing_url": url,
        "listing_name": name,
        "listing_price": price,
        "listing_about": about,
        "listing_lat_lon": lat_lon,
        "listing_address": address,
        "listing_review_num": review_num,
        "listing_rating": rating,
        "listing_photos": photos
    }


def scrape(items, base_url, key):
    """Visit every listing of one city and collect its details.

    Opens its own browser, loads ``base_url + item`` for each item, parses the
    page and prints the total runtime when done.

    Args:
        items: Iterable of listing paths (strings). Paths containing "plus"
            are skipped.
            NOTE(review): presumably because those pages use a different
            layout - confirm.
        base_url: Site root that each path is appended to.
        key: City key; only used in the final progress message.

    Returns:
        list of dicts, one per successfully parsed listing. Listings that fail
        to parse are reported with "Error : <url>" and left out of the result
        (previously an empty dict was appended for them).
    """
    with timeit():
        browser = init_browser()
        data = []
        index = 0
        try:
            for item in items:
                if "plus" in item:
                    continue
                index += 1
                url = base_url + item
                browser.visit(url)
                # Give the page a moment to render before reading its HTML.
                time.sleep(1)
                try:
                    details = _parse_listing(browser.html, url, index)
                except Exception:
                    # Best effort: report and move on. ``Exception`` rather than
                    # a bare except so Ctrl-C still interrupts the run.
                    print("Error : " + url)
                    continue
                data.append(details)
        finally:
            # Always release the browser, even when interrupted mid-run.
            browser.quit()
        print("City Processed: " + key.upper())
    return data

In [7]:
# Scraped results: {city key: list of listing dicts returned by scrape()}.
listings = {}

# Browser used only to page through search results; scrape() opens its own.
browser = init_browser()

base_url = "https://www.airbnb.com"
# One single-entry dict per city: {city key: search-results path appended to base_url}.
uris = [{"bangkok":"/s/homes?refinement_paths%5B%5D=%2Fhomes&query=bangkok&search_type=PAGINATION&place_id=ChIJ82ENKDJgHTERIEjiXbIAAQE&allow_override%5B%5D=&s_tag=AJu-_zdr"},
        {"london":"/s/homes?refinement_paths%5B%5D=%2Fhomes&query=london&search_type=PAGINATION&place_id=ChIJdd4hrwug2EcRmSrV3Vo6llI&allow_override%5B%5D=&s_tag=ngrx0aBx"},
        {"paris":"/s/homes?refinement_paths%5B%5D=%2Fhomes&query=paris&search_type=PAGINATION&place_id=ChIJD7fiBh9u5kcRYJSMaMOCCwQ&allow_override%5B%5D=&s_tag=HjrJdCef"},
        {"dubai":"/s/homes?refinement_paths%5B%5D=%2Fhomes&query=dubai&search_type=PAGINATION&place_id=ChIJRcbZaklDXz4RYlEphFBu5r0&allow_override%5B%5D=&s_tag=1WyaY8Hc"},
        {"singapore":"/s/homes?refinement_paths%5B%5D=%2Fhomes&query=Singapore&search_type=PAGINATION&place_id=ChIJyY4rtGcX2jERIKTarqz3AAQ&allow_override%5B%5D=&s_tag=5_9M7Pyo"},
        {"nyc":"/s/homes?refinement_paths%5B%5D=%2Fhomes&query=NYC&search_type=PAGINATION&place_id=ChIJOwg_06VPwokRYv534QaPC8g&allow_override%5B%5D=&s_tag=fQ_izRAK"},
        {"kl":"/s/homes?refinement_paths%5B%5D=%2Fhomes&query=kuala%20lumpur&search_type=PAGINATION&place_id=ChIJ0-cIvSo2zDERmWzYQPUfLiM&allow_override%5B%5D=&s_tag=6l0JIJ2m"},
        {"tokyo":"/s/homes?refinement_paths%5B%5D=%2Fhomes&query=tokyo&search_type=PAGINATION&place_id=ChIJ51cu8IcbXWARiRtXIothAS4&allow_override%5B%5D=&s_tag=_lKVH32b"},
        {"istanbul":"/s/homes?refinement_paths%5B%5D=%2Fhomes&query=Istanbul&search_type=PAGINATION&place_id=ChIJawhoAASnyhQR0LABvJj-zOE&allow_override%5B%5D=&s_tag=tTxXke7o"},
        {"seoul":"/s/homes?refinement_paths%5B%5D=%2Fhomes&query=Seoul&search_type=PAGINATION&place_id=ChIJzWXFYYuifDUR64Pq5LTtioU&allow_override%5B%5D=&s_tag=qG-9kD9F"}
       ]

try:
    for uri in uris:
        for key, value in uri.items():
            # Listing paths collected from every result page of this city.
            items = []
            try:
                browser.visit(base_url+value)
                while True:
                    # Give the page a moment to render before reading its HTML.
                    time.sleep(1)
                    soup = BeautifulSoup(browser.html, 'html.parser')
                    page = soup.find_all('div', attrs={'itemprop': 'itemListElement'})
                    for item in page:
                        anchor = item.find('a')
                        link = anchor.get('href') if anchor is not None else None
                        # Skip cards without a link: a None entry would crash scrape().
                        if link:
                            items.append(link)
                    # Clicking "Next" on the last page raises ElementDoesNotExist,
                    # which is what ends the pagination loop.
                    browser.find_by_css('svg[aria-label="Next"]').click()
            except ElementDoesNotExist:
                print("City: " + key.upper() + " - URL gathering completed")
            except Exception as exc:
                # Unexpected failure: keep the links gathered so far but say why
                # paging stopped. ``Exception`` lets Ctrl-C interrupt the run.
                print("City: " + key.upper() + " - URL gathering stopped early: " + repr(exc))

            listings[key] = scrape(items, base_url, key)
finally:
    # Always release the search browser, even when the run is interrupted.
    browser.quit()

City: BANGKOK - URL gathering completed
Error : https://www.airbnb.com/rooms/9665557
City Processed: bangkok
49999995000000
runtime: 0:27:12.846490
City: LONDON - URL gathering completed
City Processed: london
49999995000000
runtime: 0:29:41.604687
City: PARIS - URL gathering completed
runtime: 0:02:15.411640


KeyboardInterrupt: 

In [None]:
# Shallow copy of the results: a new top-level dict sharing the per-city lists.
listings_copy = dict(listings)

In [None]:
# Number of top-level keys in the copy (the scraping loop stores one per city).
len(listings_copy)

In [None]:
# Show which city keys are present in the copy.
listings_copy.keys()

In [None]:
# Remove a top-level '_id' key if one exists; the None default makes this a no-op otherwise.
# NOTE(review): presumably clears the '_id' that a previous insert_one() run added
# to this dict, so the document can be inserted again - confirm.
listings_copy.pop('_id', None)

In [None]:
# Initialize PyMongo to work with MongoDBs
# The connection string carries credentials for hosted clusters, so it is read
# from the MONGODB_URI environment variable instead of being stored in the
# notebook. Without it, fall back to a local MongoDB instance.
# NOTE: the password that was previously hard-coded here should be rotated.
connect = os.environ.get('MONGODB_URI', 'mongodb://localhost:27017')
client = MongoClient(connect)

In [None]:
# Define database and collection
db = client.city_explorer
city_airbnb = db.city_airbnb 

In [None]:
# Store the whole {city: listings} mapping as ONE document in the collection.
# NOTE(review): a single document holding every city may run into MongoDB's
# per-document size limit - confirm the payload size, or insert one document per city.
# NOTE(review): insert_one() is expected to add an '_id' key to the dict it is given - confirm.
city_airbnb.insert_one(listings_copy)