In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import time

# NOTE(review): presumably switches off tqdm's background monitor thread (a
# common workaround for progress-bar errors in notebooks) — confirm against
# the tqdm documentation for the installed version.
tqdm.monitor_interval = 0

In [193]:
# Form fields for the advanced search, assembled in the exact order the
# request body should carry them.
payload = {"from": "advanced", "SelectSearchBy": "city"}

# Geographic filters: state, county, then one field per city of interest.
payload["SEARCH_state[CA]"] = "CA"
payload["SEARCH_county[Sonoma County]"] = "Sonoma County"
payload.update(
    ("SEARCH_city[{}]".format(city), city)
    for city in ("Rohnert Park", "Cotati", "Santa Rosa",
                 "Windsor", "Healdsburg", "Petaluma")
)

# Listing status and sold-date window.
payload["SEARCH_status[S]"] = "S"
payload["SEARCH_sold_date_range"] = "3_yr"

# Property types to include.
payload.update(
    ("SEARCH_type[{}]".format(kind), kind)
    for kind in ("Single Family Home", "Condo/Townhouse")
)

# Every remaining numeric filter is submitted as "0".
payload.update(dict.fromkeys(
    ("SEARCH_bedrooms", "SEARCH_baths", "SEARCH_minprice", "SEARCH_maxprice",
     "SEARCH_lotsize", "SEARCH_rooms", "SEARCH_sqft", "SEARCH_units",
     "SEARCH_maxyear_built", "SEARCH_minyear_built"),
    "0",
))

# Headers to ensure consistent page encoding
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
}

# Base URL for the search query. Removed site due to changes in their site
# terms and robots.txt
search_url = "REMOVED"

In [187]:
# Instantiate the requests session.
# requests.Session() accepts no constructor arguments, so default headers are
# installed on the instance afterwards; they are then sent with every request
# made through this session.
session = requests.Session()
session.headers.update(headers)

In [194]:
# POST the advanced-search form through the shared session, then parse the
# returned results page.
page = session.post(
    search_url,
    data=payload,
    headers=headers,
)
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [None]:
# Validating the site page content: echo the parsed page so its markup can be
# inspected by eye.
# NOTE(review): this renders the entire document as cell output; consider
# viewing only a slice of it to keep the saved notebook small.
soup

In [53]:
# Additional URLs that were used for testing.
# base_url is the prefix joined onto the href of a page's "next" link during
# pagination; target_url is the page that the later cells request first.
base_url = "REMOVED"
target_url = "REMOVED"

In [196]:
# Validating the results from the additional tests.
# Send the same headers as the other requests in this notebook so this page is
# served the same way as the pages parsed during scraping.
page = requests.get(target_url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')

In [127]:
# Inspect every <li class="listing-feature"> element to see which features a
# listing exposes and how each one is formatted
soup.find_all('li', class_='listing-feature')

[<li class="listing-feature"><b>Property Type:</b> Commercial Lease</li>,
 <li class="listing-feature"><b>Sub Type:</b> Mixed Use</li>,
 <li class="listing-feature"><b>Listing Status:</b> Active</li>,
 <li class="listing-feature"><b>County/Area:</b> Solano County</li>,
 <li class="listing-feature"><b>Zip Code:</b> 94590</li>,
 <li class="listing-feature"><b>Year Built:</b> 1950</li>,
 <li class="listing-feature"><b>Sq.Ft.:</b> 4,100 sq ft</li>,
 <li class="listing-feature"><b>Stories:</b> 1 Story</li>,
 <li class="listing-feature"><b>Lot Size:</b> 0.65 acres</li>,
 <li class="listing-feature"><b>Accessibility:</b> Doors, Parking</li>,
 <li class="listing-feature"><b>Zoning:</b> Commercial</li>,
 <li class="listing-feature"><b>Area:</b> Vallejo 4</li>,
 <li class="listing-feature"><b># of Buildings:</b> 1</li>,
 <li class="listing-feature"><b>Ceilings:</b> 12'-18'</li>,
 <li class="listing-feature"><b>Close To:</b> Freeway, Public Transportatio, Restaurants</li>,
 <li class="listing-fea

In [None]:
# Listing addresses were initially collected during dataset creation for
# manual data validation, but were not used in the actual analysis.

# One dict per listing page. The DataFrame is rebuilt from this list whenever
# it is needed, instead of being grown one cell at a time.
records = []

# Blank dataframe to save data to
data = pd.DataFrame()

# Walk a local copy of the start URL so re-running this cell always begins at
# the first page rather than wherever the previous run stopped.
current_url = target_url
i = 0
start_time = time.time()

while True:
    page = session.get(current_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Pagination link to the following listing; empty on the last page.
    next_links = soup.find_all('a', {'class': 'next'})

    record = {}

    street_tag = soup.find('a', {'class': 'listing-street-address'})
    if street_tag is not None:
        record['StreetAddress'] = street_tag.text

    city_tag = soup.find('a', {'class': 'listing-address-city'})
    if city_tag is not None:
        record['City'] = city_tag.text

    mls_tag = soup.find('span', {'class': 'listing-mls-number mls-number'})
    if mls_tag is not None:
        # Keep the text that follows the first '#'; skip the field when the
        # marker is missing instead of failing the whole run.
        mls_parts = mls_tag.text.split("#")
        if len(mls_parts) > 1:
            record['MLS'] = mls_parts[1]

    # Automatically create new features based on which listing-features are
    # included. The resulting columns will need to be cleaned up as a result
    # of this method, but it ensures all fields are captured. The site had
    # inconsistent features for different listings. Dealing with many unique
    # features works well for the purpose of this paper.
    for feature in soup.find_all('li', {'class': 'listing-feature'}):
        # Split on the first ':' only, so values that themselves contain a
        # colon are kept whole; entries without a ':' carry no value and are
        # skipped.
        name, sep, value = feature.text.partition(':')
        if sep:
            record[name] = value

    records.append(record)

    # Save a copy of the dataset every 100 iterations
    if i % 100 == 0:
        data = pd.DataFrame(records)
        data.to_csv('housingData.csv')
        elapsed_time = time.time() - start_time
        print("Iteration: ", i,
              "Iteration Time: ",
              elapsed_time, " Data Shape: ",
              data.shape)
        start_time = time.time()

    i += 1

    # Stop once there is no further page; otherwise follow the link.
    if not next_links:
        break
    current_url = base_url + next_links[0]['href']

# Materialise everything collected, including rows gathered after the last
# periodic checkpoint.
data = pd.DataFrame(records)

In [200]:
# Final save: the scraping loop only writes a checkpoint every 100 iterations,
# so rows collected after the last checkpoint are persisted here.
data.to_csv('housingData.csv')

In [219]:
# Write a LaTeX table of the first two saved listings, transposed so each
# listing becomes a column and each feature a row.
with open('mytable.tex', 'w') as tf:
    tf.write(
        pd.read_csv('housingData.csv', index_col=0, parse_dates=['Sold Date'])
        .head(2)
        .T
        .to_latex()
    )

  interactivity=interactivity, compiler=compiler, result=result)
