## AirBNB webscraping

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
def get_room_sites(websites):
    
    """
    This function takes a list of websites from AirBNB 
    uses Selenium to open each website and parse out the 
    innerHTML files. The innerHTML file is then used by 
    Beautiful Soup to find all the a tags and recursively 
    go over them to remove the href tags within it. The 
    function returns a list with all duplicates removed.
    """
    
    chromedriver = '/Applications/chromedriver'
    os.environ['webdriver.chrome.driver'] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    time.sleep(1)
    
    room_list = []

    for website in websites:
        driver.get(website)
        time.sleep(1)
        innerHTML = driver.execute_script('return document.body.innerHTML')
        soup = BeautifulSoup(innerHTML, 'lxml')
        s=soup.find_all('a')
        for i,v in enumerate(s):
            m = s[i]['href']
            room_list.append(m)


    driver.quit()
    return list(set(room_list))

In [5]:
websites = ['https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=1',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=2',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=3',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=4',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=5',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=6',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=7',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=8',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=9',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=1',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=1',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=12',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=13',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=14',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=15',
    'https://www.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=1&children=0&infants=0&toddlers=0&query=Chicago%2C%20IL%2C%20United%20States&place_id=ChIJ7cv00DwsDogRAMDACa2m4K8&allow_override%5B%5D=&s_tag=AQZXn8MF&section_offset=16']

room_list=get_room_sites(websites)

In [6]:
def clean_room_list(list1):
    """
    This function takes a list of websites from 
    get_room_sites function and cleans it up to 
    remove any website that does not fit the 
    /rooms/... format of individual website urls. 
    Additionally, this function adds the www.airbnb.com 
    to each url to make sure it is usable for scraping.
    """
    clean_room_list = []
    for item in list1:
        if 'rooms/' in str(item):
            item = 'www.airbnb.com' + item
            clean_room_list.append(item)
        else:
            list1.remove(item)
            
    return clean_room_list

new_room_list = clean_room_list(room_list)

In [7]:
len(new_room_list)

265

In [8]:
new_room_list

['www.airbnb.com/rooms/10411427?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/18240098?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/14942835?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/plus/15289972?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/16592678?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/15486188?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/25364757?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/10252775?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/14058443?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/6957805?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/13731634?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/7482058?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/rooms/plus/19182630?adults=1&children=0&infants=0&toddlers=0',
 'www.airbnb.com/

## WalkScore Webscraping

In [None]:
url='https://www.walkscore.com/IL/Chicago'
tables = pd.read_html(url)

In [None]:
def copy_html_table(url):
    """"""