# Airbnb Neighborhood Info Scraping

First, run the code below to import the `requests` and `BeautifulSoup` libraries, as well as some other libraries we will be using.

In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import re
import sys

In [84]:
def get_cities(url):
    src = requests.get(url).text
    soup = BeautifulSoup(src)
    cities = soup.select('a')
    listofcities = []
    for item in cities:
        stritem = str(item)
        shortitem = stritem[9:-4]
        if shortitem.startswith("/locations/"):
            listofcities.append(shortitem)
    return listofcities
get_cities("https://www.airbnb.com/locations")

['/locations/tel-aviv/lev-hair">Lev HaIr',
 '/locations/tel-aviv">Tel Aviv',
 '/locations/seoul/samseongdong-coex">Samseongdong/COEX',
 '/locations/seoul">Seoul',
 '/locations/barcelona/el-raval">El Raval',
 '/locations/barcelona">Barcelona',
 '/locations/boston/cambridge">Cambridge',
 '/locations/boston">Boston',
 '/locations/tokyo/ginza">Ginza',
 '/locations/tokyo">Tokyo',
 '/locations/austin">Austin',
 '/locations/bangkok">Bangkok',
 '/locations/barcelona">Barcelona',
 '/locations/berlin">Berlin',
 '/locations/boston">Boston',
 '/locations/buenos-aires">Buenos Aires',
 '/locations/lake-tahoe">Lake Tahoe',
 '/locations/london">London',
 '/locations/los-angeles">Los Angeles',
 '/locations/mexico-city">Mexico City',
 '/locations/miami">Miami',
 '/locations/new-york">New York',
 '/locations/paris">Paris',
 '/locations/rio-de-janeiro">Rio de Janeiro',
 '/locations/rome">Rome',
 '/locations/san-francisco">San Francisco',
 '/locations/seoul">Seoul',
 '/locations/sydney">Sydney',
 '/locatio

In [87]:
citylist = get_cities("https://www.airbnb.com/locations")
citylist = list(set(citylist))
citydic = {}
for item in citylist:
    newlist = item.split(">")
    name = newlist[1]
    URL = "https://www.airbnb.com" + newlist[0][:-1]
    citydic[name] = URL
print citydic

{'Mexico City': 'https://www.airbnb.com/locations/mexico-city', 'Venice': 'https://www.airbnb.com/locations/venice', 'Samseongdong/COEX': 'https://www.airbnb.com/locations/seoul/samseongdong-coex', 'Paris': 'https://www.airbnb.com/locations/paris', 'Rio de Janeiro': 'https://www.airbnb.com/locations/rio-de-janeiro', 'Sydney': 'https://www.airbnb.com/locations/sydney', 'Washington DC': 'https://www.airbnb.com/locations/washington-dc', 'San Francisco': 'https://www.airbnb.com/locations/san-francisco', 'Lev HaIr': 'https://www.airbnb.com/locations/tel-aviv/lev-hair', 'Seoul': 'https://www.airbnb.com/locations/seoul', 'Buenos Aires': 'https://www.airbnb.com/locations/buenos-aires', 'Bangkok': 'https://www.airbnb.com/locations/bangkok', 'Berlin': 'https://www.airbnb.com/locations/berlin', 'Los Angeles': 'https://www.airbnb.com/locations/los-angeles', 'Tokyo': 'https://www.airbnb.com/locations/tokyo', 'Miami': 'https://www.airbnb.com/locations/miami', 'Barcelona': 'https://www.airbnb.com/loc

In [88]:
def get_neighborhoods(url):
    src = requests.get(url).text
    soup = BeautifulSoup(src)
    neighborhoods = soup.select('div.span3')
    listofneighborhoods = []
    for column in neighborhoods:
        hoodlist = column.select('li')
        for hood in hoodlist:
            hood = str(hood)
            hood = hood[13:-9]
            listofneighborhoods.append(hood)
    return listofneighborhoods

neighborhoodlist = get_neighborhoods('https://www.airbnb.com/locations/san-francisco/neighborhoods')
print neighborhoodlist

['/locations/san-francisco/alamo-square">Alamo Square', '/locations/san-francisco/bayview">Bayview', '/locations/san-francisco/bernal-heights">Bernal Heights', '/locations/san-francisco/chinatown">Chinatown', '/locations/san-francisco/civic-center">Civic Center', '/locations/san-francisco/cole-valley">Cole Valley', '/locations/san-francisco/cow-hollow">Cow Hollow', '/locations/san-francisco/dogpatch">Dogpatch', '/locations/san-francisco/downtown">Downtown', '/locations/san-francisco/duboce-triangle">Duboce Triangle', '/locations/san-francisco/excelsior">Excelsior', '/locations/san-francisco/financial-district">Financial District', '/locations/san-francisco/fisherman-s-wharf">Fisherman\'s Wharf', '/locations/san-francisco/glen-park">Glen Park', '/locations/san-francisco/haight-ashbury">Haight-Ashbury', '/locations/san-francisco/hayes-valley">Hayes Valley', '/locations/san-francisco/inner-sunset">Inner Sunset', '/locations/san-francisco/japantown">Japantown', '/locations/san-francisco/lo

In [94]:
for item in neighborhoodlist:
    if not item.startswith("/locations/"):
        neighborhoodlist.remove(item)
        print "I removed something"
print neighborhoodlist

['/locations/san-francisco/alamo-square">Alamo Square', '/locations/san-francisco/bayview">Bayview', '/locations/san-francisco/bernal-heights">Bernal Heights', '/locations/san-francisco/chinatown">Chinatown', '/locations/san-francisco/civic-center">Civic Center', '/locations/san-francisco/cole-valley">Cole Valley', '/locations/san-francisco/cow-hollow">Cow Hollow', '/locations/san-francisco/dogpatch">Dogpatch', '/locations/san-francisco/downtown">Downtown', '/locations/san-francisco/duboce-triangle">Duboce Triangle', '/locations/san-francisco/excelsior">Excelsior', '/locations/san-francisco/financial-district">Financial District', '/locations/san-francisco/fisherman-s-wharf">Fisherman\'s Wharf', '/locations/san-francisco/glen-park">Glen Park', '/locations/san-francisco/haight-ashbury">Haight-Ashbury', '/locations/san-francisco/hayes-valley">Hayes Valley', '/locations/san-francisco/inner-sunset">Inner Sunset', '/locations/san-francisco/japantown">Japantown', '/locations/san-francisco/lo

In [95]:
neighborhooddic = {}
for item in neighborhoodlist:
    newlist = item.split(">")
    name = newlist[1]
    URL = "https://www.airbnb.com" + newlist[0][:-1]
    neighborhooddic[name] = URL
print neighborhooddic

{"Fisherman's Wharf": 'https://www.airbnb.com/locations/san-francisco/fisherman-s-wharf', 'Hayes Valley': 'https://www.airbnb.com/locations/san-francisco/hayes-valley', 'Portola': 'https://www.airbnb.com/locations/san-francisco/portola', 'Financial District': 'https://www.airbnb.com/locations/san-francisco/financial-district', 'Lower Haight': 'https://www.airbnb.com/locations/san-francisco/lower-haight', 'Japantown': 'https://www.airbnb.com/locations/san-francisco/japantown', 'Glen Park': 'https://www.airbnb.com/locations/san-francisco/glen-park', 'Tenderloin': 'https://www.airbnb.com/locations/san-francisco/tenderloin', 'Cow Hollow': 'https://www.airbnb.com/locations/san-francisco/cow-hollow', 'Parkside': 'https://www.airbnb.com/locations/san-francisco/parkside', 'Presidio Heights': 'https://www.airbnb.com/locations/san-francisco/presidio-heights', 'Bernal Heights': 'https://www.airbnb.com/locations/san-francisco/bernal-heights', 'Mission Terrace': 'https://www.airbnb.com/locations/sa

In [294]:
def clean_strings(string):
    cleanstring = string.replace("\xe2\x80\x99","'")
    cleanerstring = cleanstring.replace("\xe2\x80\x94"," - ")
    evencleanerstring = cleanerstring.replace("&amp;","&")
    yetcleanerstring = evencleanerstring.replace("\xc2\xa0"," ")
    cleaneststring = yetcleanerstring.replace("\xe2\x80\x93"," - ")
    return cleaneststring

Ok now we have a list of neighborhoods; how do we extract the information we want and write it to a CSV?

In [311]:
def get_descriptions(url):
    src = requests.get(url).text
    
    descriptions = {}
    soup = BeautifulSoup(src)
        
    descriptions["Name"] = soup.select('h1.circularbold')[0].text.encode("ascii", "ignore")
    print descriptions["Name"]
    
    intro = str(soup.select("div.description"))[48:-20]
    newintro = clean_strings(intro)
    descriptions["Intro"] = newintro
    
    lede = soup.select("p.lede")[0].text.encode("ascii", "ignore")
    #strlede = str(lede)[16:-4]
    #cleanerlede = clean_strings(strlede)
    descriptions["Lede"] = lede
    
    official_tags = soup.select("span.name")
    official_tag_list = []
    for tag in official_tags:
        justtag = (str(tag)[19:-7])
        cleantag = clean_strings(justtag)
        official_tag_list.append(cleantag)
    descriptions["Official_Tags"] = official_tag_list
    
    community_tags = soup.select("div.neighborhood-tag")
    community_tag_list = []
    for tag in community_tags:
        justtag = (str(tag)[30:]).split("\n")[0]
        community_tag_list.append(justtag)
    descriptions["Community_Tags"] = community_tag_list
    
    photo_captions = soup.select("div.primary")[1:]
    photo_caption_list = []
    for caption in photo_captions:
        justcaption = str(caption)[25:-11]
        cleancaption = clean_strings(justcaption)
        photo_caption_list.append(cleancaption)
    descriptions["Photo_Captions"] = photo_caption_list
    
    photo_headers = soup.select("h2")[1:-8]
    photo_header_list = []
    for header in photo_headers:
        justheader = str(header)[4:-5]
        photo_header_list.append(justheader)
    descriptions["Photo_Headers"] = photo_header_list
    
    block_quotes = soup.select("blockquote")
    block_quote_list = []
    for quote in block_quotes:
        justquote = str(quote)[12:-14]
        cleanquote = clean_strings(justquote)
        block_quote_list.append(cleanquote)
    descriptions["Block_Quotes"] = block_quote_list

    import csv
    filename = url[33:] + '_' + 'descriptions.csv'
    cleanfilename = filename.replace("/","_")
    writer = csv.writer(open(cleanfilename, 'wb'))
    print "I'm creating a CSV file..."
    for key, value in descriptions.items():
        writer.writerow([key, value])

    print "CSV file created!"

    return descriptions

get_descriptions('https://www.airbnb.com/locations/san-francisco/mission-district')

#missiondescriptions = get_descriptions('https://www.airbnb.com/locations/san-francisco/mission-district')
#print missiondescriptions

Mission District
I'm creating a CSV file...
CSV file created!


{'Block_Quotes': ["For those who don't know the Mission, it's one of the most interesting and diverse neighborhoods in SF...You'll find Mexican grocery stores, hipster restaurants and home decor stores, dive bars, activist bookstores, and amazing street art. It's a slice of SF unlike anything you'll find in the places where hotels are located. I recommend getting a BART card or renting a bike to get around.",
  '24th Street is filled with great taquerias, cafes, fun shopping, and some of the craziest and inventive ice-cream and doughnuts in the world.',
  'Staying in the Mission means you get all the fun of San Francisco, with all the fun people. This is seriously the best part of SF, and the Mission is known for its eateries, coffee shops, book stores, art galleries and bars. The Mission is awesome.',
  "Born and raised in LA, Aubrie Pick moved to San Francisco a decade ago to attend the Art Institute. She's called the Mission home since 2005,  and loves nothing more than drinking mor

In [291]:
for item in neighborhooddic:
    try:
        get_descriptions(neighborhooddic[item])
    except IndexError: # catch the error
        pass
    print "I made a CSV for " + item + "!"

Fisherman's Wharf
I'm creating a CSV file...
CSV file created!
I made a CSV for Fisherman's Wharf!
Hayes Valley
I'm creating a CSV file...
CSV file created!
I made a CSV for Hayes Valley!
Portola
I'm creating a CSV file...
CSV file created!
I made a CSV for Portola!
Financial District
I'm creating a CSV file...
CSV file created!
I made a CSV for Financial District!
Lower Haight
I'm creating a CSV file...
CSV file created!
I made a CSV for Lower Haight!

I made a CSV for Japantown!
Glen Park
I'm creating a CSV file...
CSV file created!
I made a CSV for Glen Park!
Tenderloin
I'm creating a CSV file...
CSV file created!
I made a CSV for Tenderloin!
Cow Hollow
I'm creating a CSV file...
CSV file created!
I made a CSV for Cow Hollow!
Parkside
I'm creating a CSV file...
CSV file created!
I made a CSV for Parkside!
Presidio Heights
I'm creating a CSV file...
CSV file created!
I made a CSV for Presidio Heights!
Bernal Heights
I'm creating a CSV file...
CSV file created!
I made a CSV for Bernal