In [110]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
#import os
#import sys

#from selenium import webdriver

pd.set_option('display.max_columns', None)

#### Requests and Beautiful Soup: 

- define `zipcode` and `filter`
- Build urls by zipcode and filter with `construct_redfin_url(zipcode, filter_)`
- Get page contents and create `soup` object with function `make_soup` 
- Parse the landing page with `find_home_links` (with zipcode and filter applied) for links to individual sold homes. Return a list of relative links



In [165]:
#soup.find_all('a')
# soup.find_all("div", class_="HomeCardImage")
#print('The soup is {} long.'.format(len(soup.prettify())))

def construct_redfin_url(zipcode, filter_='sold-3yr'):
    """ Make a landing url that filters by zipcode and 'sold-3r' or 'sold-all'.
    """
    return 'https://www.redfin.com/zipcode/'+ zipcode + '/filter/include=' + filter_


def make_soup(url):
    """ Make a soup object from a url.
    """
    hdr = {'User-Agent': 'Mozilla/5.0'}

    response = requests.get(url, headers=hdr)

    assert response.status_code==200, "HTML code isn't 200."
    page = response.text
    print('Page is {} long.'.format(len(page)))
    
    return BeautifulSoup(page,"lxml")


def find_home_links(soup):
    """
    Finds all the relative individual house data links on a landing page. 
    """
    home_links = []
    for link in soup.find_all("a", class_="cover-all"):
        home_links.append(link['href'])
    
    return home_links


def parse_home_links(home_link):
    """
    Finds all the relative individual house data links on a landing page. 
    """
    url = 'https://www.redfin.com/'+ home_link
    
    home_soup = make_soup(url)
    return home_soup
    
    



links = find_home_links(soup)
print('The list of links is {} elements long'.format(len(links)))

The list of links is 20 elements long


In [166]:
#print(links)
#len(links)

In [191]:

zipcode = '94605'
filter_ = 'sold_3yr'

url = construct_redfin_url(zipcode, filter_)
print(url)
soup = make_soup(url)

home_links = find_home_links(soup)

home_link = home_links[4]
print(home_link)
home_soup = parse_home_links(home_link)


https://www.redfin.com/zipcode/94605/filter/include=sold_3yr
Page is 456919 long.
/CA/Oakland/7724-Outlook-Ave-94605/home/2005543
Page is 429340 long.


In [213]:
#home_soup.find_all('Lot size')
facts_table = home_soup.find("div", class_="facts-table")
print(facts_table.prettify())
#table_labels = facts_table.find_all(class_="table-label")
table_row = facts_table.find_all(class_="table-row")

<div class="facts-table">
 <div class="table-row">
  <span class="table-label">
   Beds
  </span>
  <div class="table-value">
   3
  </div>
 </div>
 <div class="table-row">
  <span class="table-label">
   Baths
  </span>
  <div class="table-value">
   2
  </div>
 </div>
 <div class="table-row">
  <span class="table-label">
   Finished Sq. Ft.
  </span>
  <div class="table-value">
   1,577
  </div>
 </div>
 <div class="table-row">
  <span class="table-label">
   Unfinished Sq. Ft.
  </span>
  <div class="table-value">
   —
  </div>
 </div>
 <div class="table-row">
  <span class="table-label">
   Total Sq. Ft.
  </span>
  <div class="table-value">
   1,577
  </div>
 </div>
 <div class="table-row">
  <span class="table-label">
   Stories
  </span>
  <div class="table-value">
   2
  </div>
 </div>
 <div class="table-row">
  <span class="table-label">
   Lot Size
  </span>
  <div class="table-value">
   8,250 Sq. Ft.
  </div>
 </div>
 <div class="table-row">
  <span class="table-label">
   

In [214]:
label_list = []
for label in table_labels:
    print(label.get_text())

Beds
Baths
Finished Sq. Ft.
Unfinished Sq. Ft.
Total Sq. Ft.
Stories
Lot Size
Style
Year Built
Year Renovated
County
APN


In [215]:
for row in table_row:
    print(row.get_text())

Beds3
Baths2
Finished Sq. Ft.1,577
Unfinished Sq. Ft.—
Total Sq. Ft.1,577
Stories2
Lot Size8,250 Sq. Ft.
StyleSingle Family Residential
Year Built1939
Year Renovated1985
CountyAlameda County
APN040A341904500


In [196]:
type(facts_table)

bs4.element.ResultSet

In [198]:
facts_table.children

AttributeError: ResultSet object has no attribute 'children'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?

In [188]:
for child in home_soup.children:
    print(child)

html
<html lang="en"><head><!-- Server: phantom-6 --><!-- Time generated: Thu Apr 19 2018 17:00:35 GMT-0700 (PDT) --><title>3387 64Th Ave, Oakland, CA 94605 | MLS# 40817452 | Redfin</title><link data-react-server-css="" href="https://ssl.cdn-redfin.com/vLATEST/corvstatic/customer-pages/ADPDesktop-delayedBundlePage.345b6f3f4f4b193a8ed1.styles.css" media="" rel="stylesheet" type="text/css"/><link data-react-server-css="" href="https://ssl.cdn-redfin.com/vLATEST/corvstatic/customer-pages/long-cache.fe5628a944af2d73ca8a.styles.css" media="" rel="stylesheet" type="text/css"/><link data-react-server-css="" href="https://ssl.cdn-redfin.com/vLATEST/corvstatic/customer-pages/common.4c26cf4fb03dc608d9bf.styles.css" media="" rel="stylesheet" type="text/css"/><link data-react-server-css="" href="https://ssl.cdn-redfin.com/vLATEST/corvstatic/customer-pages/ADPDesktopPage.b1a9ac758991883de0a8.styles.css" media="" rel="stylesheet" type="text/css"/><link data-react-server-link="" href="/favicon.ico?v=

In [171]:
print(home_link)

/CA/Oakland/3387-64th-Ave-94605/home/12116626


In [172]:
home_links

['/CA/Oakland/4850-Dunkirk-Ave-94605/home/572756',
 '/CA/Oakland/6024-Old-Quarry-Loop-94605/home/12115899',
 '/CA/Oakland/8424-Golf-Links-Rd-94605/home/586033',
 '/CA/Oakland/7724-Outlook-Ave-94605/home/2005543',
 '/CA/Oakland/3387-64th-Ave-94605/home/12116626',
 '/CA/Oakland/2724-Seminary-Ave-94605/home/1341174',
 '/CA/Oakland/7941-Winthrope-St-94605/home/2000593',
 '/CA/Oakland/6707-Skyview-Dr-94605/home/144863881',
 '/CA/Oakland/3000-82nd-Ave-94605/home/1078361',
 '/CA/Oakland/520-Canyon-Oaks-Dr-94605/unit-B/home/12118507',
 '/CA/Oakland/9124-Thermal-St-94605/home/1564298',
 '/CA/Oakland/11155-Kerrigan-Dr-94605/home/1342115',
 '/CA/Oakland/7715-Greenly-Dr-94605/home/1289974',
 '/CA/Oakland/3539-Calandria-Ave-94605/home/110019259',
 '/CA/Oakland/4367-Short-Hill-Rd-94605/home/605377',
 '/CA/Oakland/2656-77th-Ave-94605/home/574046',
 '/CA/Oakland/6032-Old-Quarry-Loop-94605/home/12115901',
 '/CA/Oakland/2214-88th-Ave-94605/home/587532',
 '/CA/Oakland/2627-66th-Ave-94605/home/570699',
 '

In [179]:
type(soup.find_all("a")[0])

bs4.element.Tag

In [182]:
columns = ['SALE TYPE', 'SOLD DATE', 'PROPERTY TYPE', 'ADDRESS', 'CITY', 'STATE', 'ZIP',
           'PRICE', 'BEDS', 'BATHS', 'LOCATION', 'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT',
           'DAYS ON MARKET', '$/SQUARE FEET', 'HOA/MONTH', 'STATUS', 'NEXT OPEN HOUSE START TIME',
           'NEXT OPEN HOUSE END TIME', 'URL', 'SOURCE', 'MLS#', 'FAVORITE', 'INTERESTED',
           'LATITUDE', 'LONGITUDE']

In [183]:
pd.Series(index=columns)

SALE TYPE                    NaN
SOLD DATE                    NaN
PROPERTY TYPE                NaN
ADDRESS                      NaN
CITY                         NaN
STATE                        NaN
ZIP                          NaN
PRICE                        NaN
BEDS                         NaN
BATHS                        NaN
LOCATION                     NaN
SQUARE FEET                  NaN
LOT SIZE                     NaN
YEAR BUILT                   NaN
DAYS ON MARKET               NaN
$/SQUARE FEET                NaN
HOA/MONTH                    NaN
STATUS                       NaN
NEXT OPEN HOUSE START TIME   NaN
NEXT OPEN HOUSE END TIME     NaN
URL                          NaN
SOURCE                       NaN
MLS#                         NaN
FAVORITE                     NaN
INTERESTED                   NaN
LATITUDE                     NaN
LONGITUDE                    NaN
dtype: float64

### Open downloaded CSV from Redfin to check its contents and use as a reference

In [37]:
!ls

LICENSE                        explore_luther.ipynb
README.md                      redfin_2018-04-18-08-51-54.csv


In [119]:
rf_csv = pd.read_csv('redfin_2018-04-18-08-51-54.csv')
rf_csv.columns

Index(['SALE TYPE', 'SOLD DATE', 'PROPERTY TYPE', 'ADDRESS', 'CITY', 'STATE',
       'ZIP', 'PRICE', 'BEDS', 'BATHS', 'LOCATION', 'SQUARE FEET', 'LOT SIZE',
       'YEAR BUILT', 'DAYS ON MARKET', '$/SQUARE FEET', 'HOA/MONTH', 'STATUS',
       'NEXT OPEN HOUSE START TIME', 'NEXT OPEN HOUSE END TIME',
       'URL (SEE http://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)',
       'SOURCE', 'MLS#', 'FAVORITE', 'INTERESTED', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [174]:
rf_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 27 columns):
SALE TYPE                                                                                     350 non-null object
SOLD DATE                                                                                     294 non-null object
PROPERTY TYPE                                                                                 350 non-null object
ADDRESS                                                                                       343 non-null object
CITY                                                                                          350 non-null object
STATE                                                                                         350 non-null object
ZIP                                                                                           350 non-null object
PRICE                                                                                         350

In [178]:
rf_csv.head()
list(rf_csv.columns)

['SALE TYPE',
 'SOLD DATE',
 'PROPERTY TYPE',
 'ADDRESS',
 'CITY',
 'STATE',
 'ZIP',
 'PRICE',
 'BEDS',
 'BATHS',
 'LOCATION',
 'SQUARE FEET',
 'LOT SIZE',
 'YEAR BUILT',
 'DAYS ON MARKET',
 '$/SQUARE FEET',
 'HOA/MONTH',
 'STATUS',
 'NEXT OPEN HOUSE START TIME',
 'NEXT OPEN HOUSE END TIME',
 'URL (SEE http://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)',
 'SOURCE',
 'MLS#',
 'FAVORITE',
 'INTERESTED',
 'LATITUDE',
 'LONGITUDE']

In [184]:
rf_csv.head()

Unnamed: 0,SALE TYPE,SOLD DATE,PROPERTY TYPE,ADDRESS,CITY,STATE,ZIP,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,DAYS ON MARKET,$/SQUARE FEET,HOA/MONTH,STATUS,NEXT OPEN HOUSE START TIME,NEXT OPEN HOUSE END TIME,URL (SEE http://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING),SOURCE,MLS#,FAVORITE,INTERESTED,LATITUDE,LONGITUDE
0,PAST SALE,April-4-2018,Single Family Residential,5514 Picardy Dr,Oakland,CA,94605,800000,4.0,2.0,Normandy Gardens,1650.0,3230.0,1926.0,14.0,485.0,,Sold,,,http://www.redfin.com/CA/Oakland/5514-Picardy-...,EBRD/CCAR/Bay East,40815087,N,Y,37.775837,-122.190547
1,PAST SALE,April-16-2018,Multi-Family (2-4 Unit),2812 76th Ave,Oakland,CA,94605,555000,4.0,3.0,Eastmont Area,2576.0,4068.0,1940.0,2.0,215.0,,Sold,,,http://www.redfin.com/CA/Oakland/2812-76th-Ave...,EBRD/CCAR/Bay East,40808499,N,Y,37.767668,-122.169261
2,PAST SALE,September-5-2017,Single Family Residential,4009 Malcolm Ave,Oakland,CA,94605,740000,3.0,2.5,Chabot Park,1948.0,9086.0,1960.0,225.0,380.0,,Sold,,,http://www.redfin.com/CA/Oakland/4009-Malcolm-...,EBRD/CCAR/Bay East,40787391,N,Y,37.750808,-122.134996
3,PAST SALE,July-1-2017,Single Family Residential,8301 Aster Ave,Oakland,CA,94605,760000,4.0,2.0,Oakland,2354.0,6962.0,1938.0,291.0,323.0,,Sold,,,http://www.redfin.com/CA/Oakland/8301-Aster-Av...,EBRD/CCAR/Bay East,40781777,N,Y,37.764025,-122.160524
4,PAST SALE,May-10-2017,Condo/Co-op,6144 Old Quarry Loop,Oakland,CA,94605,505000,2.0,2.0,East Oakl Hills,1079.0,,2007.0,343.0,468.0,282.0,Sold,,,http://www.redfin.com/CA/Oakland/6144-Old-Quar...,EBRD/CCAR/Bay East,40775089,N,Y,37.778864,-122.165009


## Learn BS4

In [74]:
soup2 = BeautifulSoup('<b class="boldest">Extremely bold</b>','lxml')
tag2 = soup2.b
type(tag)

bs4.element.Tag

In [75]:
tag2.name

'b'

In [76]:
tag2['class']

['boldest']

In [77]:
tag2.attrs

{'class': ['boldest']}

In [78]:
tag2.get('class')

['boldest']

In [79]:
tag2.string

'Extremely bold'