In [1]:
#Import the necessary modules
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
#Create the base url  
base_url = 'https://www.realtytrac.com/'
#Create the path that determines the area of study
la_search = 'mapsearch/sold/ca/los-angeles-county/'

In [71]:
#Grab the website content and store it in a BeautifulSoup object
response = requests.get(base_url+la_search)
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
houses = soup.find_all('div', class_="house alt clearfix")
print(f'The number of houses on each page is {len(houses)}')

The number of houses on each page is 10


In [6]:
#I'm going to find the url path for each house so we can find out more detail
links = []

for house in houses:
    links.append(base_url+house.find('a')['href'])
    
links

['https://www.realtytrac.com//property/ca/los-angeles/90016/4711-w-29th-st/154636126/',
 'https://www.realtytrac.com//property/ca/hacienda-heights/91745/15362-del-prado-dr/49086164/',
 'https://www.realtytrac.com//property/ca/cerritos/90703/16622-shenandoah-ave/4419219/',
 'https://www.realtytrac.com//property/ca/los-angeles/90071/633-w-5th-st/154657099/',
 'https://www.realtytrac.com//property/ca/west-covina/91792/3505-s-flemington-dr/155000757/',
 'https://www.realtytrac.com//property/ca/culver-city/90232/3913-bentley-ave/146537486/',
 'https://www.realtytrac.com//property/ca/burbank/91505/921-n-kenwood-st/154431994/',
 'https://www.realtytrac.com//property/ca/los-angeles/90026/1421-ewing-st/45927064/',
 'https://www.realtytrac.com//property/ca/north-hollywood/91601/11616-killion-st/148775631/',
 'https://www.realtytrac.com//property/ca/burbank/91506/1809-w-clark-ave/154425671/']

In [69]:
#Create the function that extracts the attributes for each house(link)

df = pd.DataFrame(columns=['Address', 'Property_type'])

def find_details(house):
    #Address
    address = house.find('span', class_='address heading').text
    
    #Property Details
    details = house.find('ul', class_='detail-list')
    add_details = details.find_all('li', itemprop='additionalProperty' )
    
    #Residential Bilding Types
    prop_type = details.find('span', itemprop='name').text
    
    #Rooms
    rooms = details.find('span', itemprop='description').text
    
    #Home Size
    home_size = add_details[0].find('span', itemprop='value').text
    home_unit = add_details[0].find('span', itemprop='unitText').text
    
    #Lot Size
    lot_size = add_details[1].find('span', itemprop='value').text
    lot_unit = add_details[1].find('span', itemprop='unitText').text
    
    #Year Built
    year_built = add_details[2].find('span', itemprop='value').text
    
    #Parcel Number
    parcel = add_details[3].find('span', itemprop='propertyID').text
    
    #RealtyTracPropteryID
    realty = add_details[4].find('span', itemprop='value').text
    
    #County
    county = add_details[5].find('span', itemprop='description').text
    
    #Subdivision
    sub = add_details[6].find('span', itemprop='value').text
    
    #Census
    census = add_details[7].find('span', itemprop='value').text
    
    #Tract
    tract = add_details[8].find('span', itemprop='value').text
    
    #Lot
    lot = add_details[9].find('span', itemprop='value').text
    
    #Zoning
    zoning = details.find_all('li')[12].text
    
    #Create the df to be returned
    df = pd.DataFrame({
                       'Address':[address],
                       'Property_type': [prop_type],
                       'Rooms': [rooms],
                       'Home_size': [home_size+home_unit],
                       'Lot Size': [lot_size+lot_unit],
                       'Year_built': [year_built],
                       'Parcel_number': [parcel],
                       'RealtyID': [realty],
                       'County': [county],
                       'Subdivision': [sub],
                       'Census': [census],
                       'Tract': [tract],
                       'Lot': [lot],
                       'Zoning': [zoning]
                   
                      })
    
    return df  

In [58]:
#Loop through the houses and grab all of the data

for i in range(len(links)):
    house_detail = requests.get(links[i])
    soup = BeautifulSoup(house_detail.content, 'html.parser')
    df = df.append(find_details(soup), ignore_index=True)

df

Unnamed: 0,Address,Property_type,Rooms,Home_size,Lot Size
0,4711 W 29th St,Single Family Residence,"2 beds, 1 bath","1,132 sqft","2,750 sqft"
1,15362 Del Prado Dr,Single Family Residence,"4 beds, 3 baths","2,462 sqft","10,250 sqft"
2,16622 Shenandoah Ave,Condominium,"2 beds, 2 baths",904 sqft,"7,007 sqft"
3,633 W 5th St,Office Building,Built in,30.9 acres,1.17 acres
4,3505 S Flemington Dr,Single Family Residence,"3 beds, 2 baths","1,272 sqft","6,734 sqft"
5,3913 Bentley Ave # 4,Condominium,"3 beds, 3 baths","2,120 sqft","7,505 sqft"
6,921 N Kenwood St,Single Family Residence,"2 beds, 1 bath","1,042 sqft","6,750 sqft"
7,1421 Ewing St,Single Family Residence,"1 bed, 1 bath",736 sqft,"7,370 sqft"
8,11616 Killion St,Single Family Residence,"3 beds, 1 bath",996 sqft,"6,209 sqft"
9,1809 W Clark Ave,Single Family Residence,"2 beds, 1 bath","1,000 sqft","6,892 sqft"
