# Scraping Craigslist to get House Prices in LA

## Web Scraping

In [1]:
#import get to call a get request on the site
from requests import get

#get the first page of the east bay housing prices
response = get('https://losangeles.craigslist.org/search/apa?hasPic=1&min_bedrooms=2&availabilityMode=0&sale_date=all+dates') #get rid of those lame-o's that post a housing option without a pic using their filter

from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')

#get the macro-container for the housing posts
posts = html_soup.find_all('li', class_= 'result-row')
print(type(posts)) #to double check that I got a ResultSet
print(len(posts)) #to double check I got 120 (elements/page)

<class 'bs4.element.ResultSet'>
120


In [2]:
#grab the first post
post_one = posts[0]

#If we want, we can print the entire scraped text for the first post
#print(post_one.prettify())

#grab the price of the first post
post_one_price = post_one.a.text
post_one_price.strip()

'$2,600'

In [3]:
#grab the time of the post in datetime format to save on cleaning efforts
post_one_time = post_one.find('time', class_= 'result-date')
post_one_datetime = post_one_time['datetime']
post_one_datetime

'2021-12-14 19:44'

In [4]:
#title is a and that class, link is grabbing the href attribute of that variable
post_one_title = post_one.find('a', class_='result-title hdrlnk')
post_one_link = post_one_title['href']

#easy to grab the post title by taking the text element of the title variable
post_one_title_text = post_one_title.text

In [5]:
#title is a and that class, link is grabbing the href attribute of that variable
post_one_title = post_one.find('a', class_='result-title hdrlnk')
post_one_link = post_one_title['href']

#easy to grab the post title by taking the text element of the title variable
post_one_title_text = post_one_title.text

In [6]:
#grabs the whole segment of housing details. We will need missing value handling in the loop as this kind of detail is not common in posts
#the text can be split, and we can use indexing to grab the elements we want. number of bedrooms is the first element.
#sqft is the third element

post_one_bed_sqft_info = post_one.find('span', class_ = 'housing').text.split()

post_one_num_bedrooms = post_one_bed_sqft_info[0]

if len(post_one_bed_sqft_info) > 2:
    post_one_s = post_one.find('span', class_ = 'housing').text.split()[2][:-3] #cleans the ft2 at the end


post_one_hood = posts[0].find('span', class_='result-hood').text #grabs the neighborhood, this is the problem column that requires
#a lot of cleaning and figuring out later.

In [7]:
#build out the loop
from time import sleep
import re
from random import randint #avoid throttling by not sending too many requests one after the other
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np

#find the total number of posts to find the limit of the pagination
results_num = html_soup.find('div', class_= 'search-legend')
results_total = int(results_num.find('span', class_='totalcount').text) #pulled the total count of posts as the upper bound of the pages array

#each page has 119 posts so each new page is defined as follows: s=120, s=240, s=360, and so on. So we need to step in size 120 in the np.arange function
pages = np.arange(0, results_total+1, 120)

iterations = 0

post_timing = []
post_hoods = []
post_title_texts = []
bedroom_counts = []
sqfts = []
post_links = []
post_prices = []

for page in pages:
    
    #get request
    response = get("https://losangeles.craigslist.org/search/apa?" 
                   + "s=" #the parameter for defining the page number 
                   + str(page) #the page number in the pages array from earlier
                   + "&hasPic=1"
                   + "&min_bedrooms=2"
                   + "&availabilityMode=0")

    sleep(randint(1,5))
     
    #throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        
    #define the html text
    page_html = BeautifulSoup(response.text, 'html.parser')
    
    #define the posts
    posts = html_soup.find_all('li', class_= 'result-row')
        
    #extract data item-wise
    for post in posts:

        if post.find('span', class_ = 'result-hood') is not None:

            #posting date
            #grab the datetime element 0 for date and 1 for time
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_timing.append(post_datetime)

            #neighborhoods
            post_hood = post.find('span', class_= 'result-hood').text
            post_hoods.append(post_hood)

            #title text
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_title_texts.append(post_title_text)

            #post link
            post_link = post_title['href']
            post_links.append(post_link)
            
            #removes the \n whitespace from each side, removes the currency symbol, and turns it into an int
            #print(post.a.text.strip())
            post_price = int(post.a.text.strip().replace("$", "").replace(",","")) 
            post_prices.append(post_price)
            
            if post.find('span', class_ = 'housing') is not None:
                
                #if the first element is accidentally square footage
                if 'ft2' in post.find('span', class_ = 'housing').text.split()[0]:
                    
                    #make bedroom nan
                    bedroom_count = np.nan
                    bedroom_counts.append(bedroom_count)
                    
                    #make sqft the first element
                    sqft = int(post.find('span', class_ = 'housing').text.split()[0][:-3])
                    sqfts.append(sqft)
                    
                #if the length of the housing details element is more than 2
                elif len(post.find('span', class_ = 'housing').text.split()) > 2:
                    
                    #therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    bedroom_counts.append(bedroom_count)
                    
                    #and sqft will be number 3, so set these here and append
                    sqft = int(post.find('span', class_ = 'housing').text.split()[2][:-3])
                    sqfts.append(sqft)
                    
                #if there is num bedrooms but no sqft
                elif len(post.find('span', class_ = 'housing').text.split()) == 2:
                    
                    #therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    bedroom_counts.append(bedroom_count)
                    
                    #and sqft will be number 3, so set these here and append
                    sqft = np.nan
                    sqfts.append(sqft)                    
                
                else:
                    bedroom_count = np.nan
                    bedroom_counts.append(bedroom_count)
                
                    sqft = np.nan
                    sqfts.append(sqft)
                
            #if none of those conditions catch, make bedroom nan, this won't be needed    
            else:
                bedroom_count = np.nan
                bedroom_counts.append(bedroom_count)
                
                sqft = np.nan
                sqfts.append(sqft)
            #    bedroom_counts.append(bedroom_count)
                
            #    sqft = np.nan
            #    sqfts.append(sqft)
                
    iterations += 1
    print("Page " + str(iterations) + " scraped successfully!")

print("\n")

print("Scrape complete!")

Page 1 scraped successfully!
Page 2 scraped successfully!
Page 3 scraped successfully!
Page 4 scraped successfully!
Page 5 scraped successfully!
Page 6 scraped successfully!
Page 7 scraped successfully!
Page 8 scraped successfully!
Page 9 scraped successfully!
Page 10 scraped successfully!
Page 11 scraped successfully!
Page 12 scraped successfully!
Page 13 scraped successfully!
Page 14 scraped successfully!
Page 15 scraped successfully!
Page 16 scraped successfully!
Page 17 scraped successfully!
Page 18 scraped successfully!
Page 19 scraped successfully!
Page 20 scraped successfully!
Page 21 scraped successfully!
Page 22 scraped successfully!
Page 23 scraped successfully!
Page 24 scraped successfully!
Page 25 scraped successfully!
Page 26 scraped successfully!


Scrape complete!


In [8]:
import pandas as pd

eb_apts = pd.DataFrame({'posted': post_timing,
                       'neighborhood': post_hoods,
                       'post title': post_title_texts,
                       'number bedrooms': bedroom_counts,
                        'sqft': sqfts,
                        'URL': post_links,
                       'price': post_prices})
print(eb_apts.info())
eb_apts.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3120 entries, 0 to 3119
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   posted           3120 non-null   object 
 1   neighborhood     3120 non-null   object 
 2   post title       3120 non-null   object 
 3   number bedrooms  3120 non-null   object 
 4   sqft             2808 non-null   float64
 5   URL              3120 non-null   object 
 6   price            3120 non-null   int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 170.8+ KB
None


Unnamed: 0,posted,neighborhood,post title,number bedrooms,sqft,URL,price
0,2021-12-14 19:44,( central LA 213/323 ),2+2 in N. Westlake -Wood Style Floors -Fitness...,2,975.0,https://losangeles.craigslist.org/lac/apa/d/lo...,2600
1,2021-12-14 19:43,( san gabriel valley ),"Courtyard w/ Jacuzzi, 2 Bedroom 2 Bath in Alha...",2,767.0,https://losangeles.craigslist.org/sgv/apa/d/al...,2058
2,2021-12-14 19:43,(Culver City westside-southbay-310 ),Newer 2 Bedroom in Culver City | Nest Thermost...,2,1365.0,https://losangeles.craigslist.org/wst/apa/d/lo...,3850
3,2021-12-14 19:42,( central LA 213/323 ),"Spacious 2 Bedroom in The Miracle Mile, Quartz...",2,900.0,https://losangeles.craigslist.org/lac/apa/d/lo...,2895
4,2021-12-14 19:42,"(21041 Parthenia Street, Los Angeles, CA san ...","Pool, Covered Parking, Fitness Center",2,1070.0,https://losangeles.craigslist.org/sfv/apa/d/wi...,2295
5,2021-12-14 19:42,( long beach / 562 ),"2 bedroom, 18-foot ceilings Select units, Cour...",2,971.0,https://losangeles.craigslist.org/lgb/apa/d/lo...,2792
6,2021-12-14 19:40,(Santa Monica/Near the beach westside-southba...,Recessed Lighting + W/D + 2 Bedroom 2 BA in Sa...,2,825.0,https://losangeles.craigslist.org/wst/apa/d/sa...,3695
7,2021-12-14 19:38,(Los Angeles central LA 213/323 ),Splendid 2br-2ba Front House,2,1000.0,https://losangeles.craigslist.org/lac/apa/d/lo...,2765
8,2021-12-14 19:38,(Echo Park central LA 213/323 ),"Large 2 bedroom, 1 bath in historic Angelino H...",2,1100.0,https://losangeles.craigslist.org/lac/apa/d/lo...,2800
9,2021-12-14 19:38,(West Los Angeles westside-southbay-310 ),"3 Bedroom TH Style in West Los Angeles, Patio/...",3,1745.0,https://losangeles.craigslist.org/wst/apa/d/lo...,3795
