In [1]:
#Import the necessary packages
import requests
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import pandas as pd
import warnings 
from details import agt, cook

# Silence every warning category for the whole kernel session.
# NOTE(review): this blanket filter also hides DeprecationWarning/FutureWarning
# from pandas and BeautifulSoup, so API removals only show up as hard errors
# after an upgrade - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Site root; every relative link collected later is appended to this prefix.
base_url = "https://www.realtytrac.com/"

# Path (relative to base_url) of the "sold" map search for Los Angeles County.
la_search = "mapsearch/sold/ca/los-angeles-county/"

In [3]:
#Grab the first results page and store it in a BeautifulSoup object.
#The timeout keeps the cell from hanging forever on a stalled connection, and
#raise_for_status() surfaces a blocked or failed request (e.g. HTTP 403) right
#here instead of as an empty listing further down the notebook.
response = requests.get(base_url+la_search, headers=agt, cookies=cook, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
# Listing cards present on the first results page
houses = soup.find_all('div', class_="house alt clearfix")

# How many result pages the scraper will walk through
num_pages = 200

# Rough total: cards on the first page times the number of pages
print("You'll scrape {} houses".format(len(houses)*num_pages))

You'll scrape 2000 houses


In [5]:
absolute_links = []

# Walk the result pages with a separate cursor so the first-page `soup` is left
# untouched; re-running this cell then starts from page 1 again.
page_soup = soup

for page_no in range(num_pages):
    for house in page_soup.find_all('div', class_="house alt clearfix"):
        anchor = house.find('a')
        # Skip cards without a usable link instead of crashing on None['href'].
        if anchor is not None and anchor.get('href'):
            absolute_links.append(base_url+anchor['href'])

    # The last requested page has been harvested: do not download another page
    # whose content would just be thrown away.
    if page_no == num_pages - 1:
        break

    # The final results page has no "next" anchor; stop cleanly when it is missing.
    next_anchor = page_soup.find('a', class_='next')
    if next_anchor is None or not next_anchor.get('href'):
        print(f'No next page after page {page_no + 1}; stopping early')
        break

    response = requests.get(base_url+next_anchor['href'], headers=agt, cookies=cook, timeout=30)
    response.raise_for_status()
    page_soup = BeautifulSoup(response.content, 'html.parser')

len(absolute_links)

2000

In [6]:
#Create the function that extracts the attributes for each house(link)

def _lookup(getter, default=None):
    """Call ``getter`` and return its result, or ``default`` if the lookup fails.

    Chained lookups on markup that is absent fail with AttributeError,
    TypeError, KeyError or IndexError. Catching ``Exception`` rather than using
    a bare ``except`` keeps every field best-effort while still letting
    KeyboardInterrupt and SystemExit propagate, so a long scrape can be stopped.
    """
    try:
        return getter()
    except Exception:
        return default


def _labelled_parent(container, label, label_prop='name'):
    """Return the parent of the span whose text is ``label`` (None if absent)."""
    return _lookup(lambda: container.find('span', itemprop=label_prop, text=label).parent)


def _labelled_text(container, label, value_prop='value', label_prop='name', default=None):
    """Return the text of the ``value_prop`` span that shares a parent with the ``label`` span."""
    parent = _labelled_parent(container, label, label_prop)
    return _lookup(lambda: parent.find('span', itemprop=value_prop).text, default)


def _sales_cell(sales, field):
    """Return the text of the 'col2' cell in the row holding the label for ``field``."""
    return _lookup(
        lambda: sales.find('label', attrs={'for': field}).parent.parent.find('td', class_='col2').text
    )


def _disclosure(disc, title):
    """Return the 'hd-label' text that accompanies the 'hd-title' span named ``title``."""
    return _lookup(
        lambda: disc.find('span', class_='hd-title', text=title).parent.find('span', class_='hd-label').text
    )


def find_details(house):
    """Extract the attributes of one property page into a one-row DataFrame.

    Parameters
    ----------
    house : object
        Parsed property page exposing a BeautifulSoup-style ``find`` method.

    Returns
    -------
    pandas.DataFrame
        Exactly one row and 25 columns. A field whose markup cannot be found is
        ``None``; ``home_size`` and ``lot_size`` are the value and unit texts
        concatenated, and fall back to an empty string.
    """
    #Page-level metadata
    lat = _lookup(lambda: house.find('meta', property='og:latitude')['content'])
    long = _lookup(lambda: house.find('meta', property='og:longitude')['content'])
    url = _lookup(lambda: house.find('meta', property='og:url')['content'])
    address = _lookup(lambda: house.find('span', class_='address heading').text)

    #Property Details
    details = _lookup(lambda: house.find('ul', class_='detail-list'))
    add_details = _lookup(lambda: details.find_all('li'))

    # The first two list items are read by position: property type, then rooms.
    prop_type = _lookup(lambda: add_details[0].find('span', itemprop='name').text)
    rooms = _lookup(lambda: add_details[1].find('span', itemprop='description').text)

    # Sizes are stored as separate value and unit spans; each part defaults to ''
    # independently so the concatenation below never fails.
    home_size = _labelled_text(details, 'Home Size', default='')
    home_unit = _labelled_text(details, 'Home Size', value_prop='unitText', default='')
    lot_size = _labelled_text(details, 'Lot Size', default='')
    lot_unit = _labelled_text(details, 'Lot Size', value_prop='unitText', default='')

    year_built = _labelled_text(details, 'Built in', label_prop='description')
    parcel = _labelled_text(details, 'Parcel Number', value_prop='propertyID')
    realty = _labelled_text(details, 'RealtyTrac Property ID')
    county = _labelled_text(details, 'County', value_prop='description')
    sub = _labelled_text(details, 'Subdivision')
    census = _labelled_text(details, 'Census')
    tract = _labelled_text(details, 'Tract')
    lot = _labelled_text(details, 'Lot')

    # Zoning is taken as the raw text of the second-to-last list item, so the
    # value still carries the "Zoning:" label and surrounding whitespace.
    zoning = _lookup(lambda: add_details[-2].text)

    #Sales History
    sales = _lookup(lambda: house.find('section', id='occupancy-link'))
    sale_price = _sales_cell(sales, 'PurchasePrice')
    date = _sales_cell(sales, 'PurchaseDate')

    #Estimated Value
    est = _lookup(lambda: house.find('div', class_='rate-row access'))
    estimate = _lookup(lambda: est.find('strong', class_='price').text)

    #Home Disclosures
    disc = _lookup(lambda: house.find('div', id="collapsePropertyHomeDisclosuresArea"))
    sex_offenders = _disclosure(disc, 'Sex Offenders')
    crime_index = _disclosure(disc, 'Crime Index')
    env_hazards = _disclosure(disc, 'Environmental Hazards')
    natural_disasters = _disclosure(disc, 'Natural Disasters')
    school_index = _disclosure(disc, 'Local Schools')

    #Create the df to be returned
    # Column names (including the 'enviornmental_hazards' spelling) are kept
    # as-is because they define the schema of the CSV files written later.
    details_df = pd.DataFrame({
        'latitude': [lat],
        'longitude': [long],
        'address': [address],
        'property_type': [prop_type],
        'rooms': [rooms],
        'home_size': [home_size+home_unit],
        'lot_size': [lot_size+lot_unit],
        'year_built': [year_built],
        'parcel_number': [parcel],
        'realtyID': [realty],
        'county': [county],
        'subdivision': [sub],
        'census': [census],
        'tract': [tract],
        'lot': [lot],
        'zoning': [zoning],
        'date': [date],
        'sale_price': [sale_price],
        'estimated_value': [estimate],
        'sex_offenders': [sex_offenders],
        'crime_index': [crime_index],
        'enviornmental_hazards': [env_hazards],
        'natural_disasters': [natural_disasters],
        'school_quality': [school_index],
        'url': [url]
    })

    return details_df

In [7]:
#Create a function that captures sales history data

def find_sales_history(house):
    """Collect every row of a property's sales-history table.

    Parameters
    ----------
    house : object
        Parsed property page exposing a BeautifulSoup-style ``find`` method.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` in the body of the 'sales-table history-table'
        table, with columns ``parcel_number``, ``date`` and ``price``. Cells
        that cannot be found are ``None``. When the table is missing the result
        is an empty DataFrame without columns.
    """
    # Parcel number ties each sale back to the property; None when not found.
    # `except Exception` keeps the lookup best-effort without swallowing
    # KeyboardInterrupt the way a bare `except:` would.
    try:
        details = house.find('ul', class_='detail-list')
        parcel_p = details.find('span', itemprop='name', text='Parcel Number').parent
        parcel = parcel_p.find('span', itemprop='propertyID').text
    except Exception:
        parcel = None

    try:
        sales = house.find('table', class_='sales-table history-table').find('tbody').find_all('tr')
    except Exception:
        sales = None

    if sales is None:
        return pd.DataFrame()

    # Build plain records and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    # The 'col-2' cell (price per sqft) is not part of the output, so it is not read.
    records = []
    for row in sales:
        record = {'parcel_number': parcel}
        for column, css_class in (('date', 'col-1'), ('price', 'col-3')):
            try:
                record[column] = row.find('td', class_=css_class).text
            except Exception:
                record[column] = None
        records.append(record)

    return pd.DataFrame(records)

In [8]:
#Spot check 
#df = pd.DataFrame()
#asession = AsyncHTMLSession()
#r = await asession.get(absolute_links[16])
#await r.html.arender()
#response = r.html.raw_html
#soup = BeautifulSoup(response, 'html.parser')
#df = df.append(find_sales_history(soup), ignore_index=True)
#df

In [9]:
# Persist the collected links, one URL per line, so the crawl of the result
# pages does not have to be repeated after a kernel restart.
with open("links.txt", "w") as f:
    f.writelines(f"{s}\n" for s in absolute_links)

In [10]:
# Reload the saved links from disk into a fresh list (one URL per line,
# surrounding whitespace and the trailing newline removed).
absolute_links = []

with open("links.txt", "r") as f:
    absolute_links.extend(line.strip() for line in f)

len(absolute_links)

2000

In [11]:
#Loop through all the houses captured in absolute_links
details = pd.DataFrame()
sales_history  = pd.DataFrame()
asession = AsyncHTMLSession()


for i in range(len(absolute_links)):
    r = await asession.get(absolute_links[i])
    await r.html.arender()
    resp= r.html.raw_html
    soup = BeautifulSoup(resp, 'html.parser')
    details = details.append(find_details(soup), ignore_index=True)
    sales_history = sales_history.append(find_sales_history(soup), ignore_index=True)

    if (i+1)%25 == 0:
        details.to_csv('Data/house_data_details_scraped.csv', index=False)
        sales_history.to_csv('Data/house_data_saleshistory_scraped.csv', index=False)
        print(f'Scraped {details.shape[0]} properties')
        asession.close()
        asession = AsyncHTMLSession()

Scraped 25 properties
Scraped 50 properties
Scraped 75 properties
Scraped 100 properties
Scraped 125 properties
Scraped 150 properties
Scraped 175 properties
Scraped 200 properties
Scraped 225 properties
Scraped 250 properties
Scraped 275 properties
Scraped 300 properties
Scraped 325 properties
Scraped 350 properties
Scraped 375 properties
Scraped 400 properties
Scraped 425 properties
Scraped 450 properties
Scraped 475 properties
Scraped 500 properties
Scraped 525 properties
Scraped 550 properties
Scraped 575 properties
Scraped 600 properties
Scraped 625 properties
Scraped 650 properties
Scraped 675 properties
Scraped 700 properties
Scraped 725 properties
Scraped 750 properties
Scraped 775 properties
Scraped 800 properties
Scraped 825 properties
Scraped 850 properties
Scraped 875 properties
Scraped 900 properties
Scraped 925 properties
Scraped 950 properties
Scraped 975 properties
Scraped 1000 properties
Scraped 1025 properties
Scraped 1050 properties
Scraped 1075 properties
Scraped 11

In [13]:
# Write the final property details and sales history to CSV (no index column),
# then preview the first rows of the details table.
details.to_csv(path_or_buf='Data/house_data_details_scraped.csv', index=False)
sales_history.to_csv(path_or_buf='Data/house_data_saleshistory_scraped.csv', index=False)
details.head()

Unnamed: 0,latitude,longitude,address,property_type,rooms,home_size,lot_size,year_built,parcel_number,realtyID,...,zoning,date,sale_price,estimated_value,sex_offenders,crime_index,enviornmental_hazards,natural_disasters,school_quality,url
0,34.004865,-118.312134,1821 W 43rd Pl,Single Family Residence,"2 beds, 1 bath","1,322 sqft","5,750 sqft",1915,5022007005,1113736589,...,\nZoning:\n\nLAR1\n\t\t,12/14/2020,"$743,000","$667,100",80 Found,Slightly High,7 Found,2 Found,Poor,https://www.realtytrac.com/property/ca/los-ang...
1,34.208532,-118.228494,2315 Mira Vista Ave # 101,Condominium,"3 beds, 3 baths","1,460 sqft","9,003 sqft",2005,5807006114,1113736606,...,\nZoning:\n\nLCR3YY\n\t\t,12/14/2020,"$690,000","$691,500",0 Found,,5 Found,0 Found,Excellent,https://www.realtytrac.com/property/ca/montros...
2,34.177544,-118.515557,5864 Texhoma Ave,Single Family Residence,"3 beds, 2 baths","1,444 sqft","5,500 sqft",1949,2254005028,1113737946,...,\nZoning:\n\nLAR1\n\t\t,12/14/2020,"$680,000","$556,300",3 Found,,5 Found,1 Found,Average,https://www.realtytrac.com/property/ca/encino/...
3,34.171504,-118.22724,1635 N Verdugo Rd # A,Condominium,"2 beds, 3 baths","1,604 sqft","10,359 sqft",1978,5652006035,1113737947,...,\nZoning:\n\nGLR4-L*\n\t\t,12/14/2020,"$666,000","$735,300",0 Found,Low,5 Found,1 Found,Excellent,https://www.realtytrac.com/property/ca/glendal...
4,34.01606,-118.174554,1309 S Eastern Ave,"Warehouse, Storage",,"4,900 sqft","11,681 sqft",1955,5241012012,1113738480,...,\nZoning:\n\nCMM1*\n\t\t,12/14/2020,"$1,150,000",,10 Found,High,18 Found,1 Found,Average,https://www.realtytrac.com/property/ca/commerc...
