https://towardsdatascience.com/web-scraping-craigslist-a-complete-tutorial-c41cea4f4981

In [205]:
from requests import get
from bs4 import BeautifulSoup

from selenium import webdriver
from time import sleep
import re
from random import randint #avoid throttling by not sending too many requests one after the other
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np
import pandas as pd

In [14]:
# Fetch the first page of apartment listings for the configured Craigslist region.

region = "raleigh"
pageurl = f'https://{region}.craigslist.org/search/apa?hasPic=1&availabilityMode=0'
# get the first page of housing listings for `region`
# hasPic=1 in the query string uses the site's own filter to drop posts without a picture
response = get(pageurl)

html_soup = BeautifulSoup(response.text, 'html.parser')

# get the macro-container for the housing posts
posts = html_soup.find_all('li', class_='result-row')
print(type(posts))  # to double check that I got a ResultSet
print(len(posts))  # to double check I got 120 (elements/page)


<class 'bs4.element.ResultSet'>
120


In [15]:
# Keep the first result row around to explore which tags hold each field.
first_post = posts[0]

In [16]:
# The text of the first <a> tag in the result row is the listing price (see the output below).
first_post.a.text.strip()

'$1,319'

In [17]:
# The <time class="result-date"> tag carries the listing's timestamp in its `datetime` attribute.
first_post.find("time", class_="result-date")['datetime']

'2022-02-12 20:21'

In [18]:
# The text of the title link (<a class="result-title hdrlnk">) is the listing title.
first_post.find('a', class_='result-title hdrlnk').text

'Spacious Floorplans, Close to Restaurants, BBQ/Picnic Area'

In [19]:
# The href of the title link is the URL of the individual listing page.
first_post.find('a', class_='result-title hdrlnk')['href']

'https://raleigh.craigslist.org/apa/d/raleigh-spacious-floorplans-close-to/7445351524.html'

In [199]:
# Neighborhood lives in <span class="result-hood">; the regex removes the
# leading " (" and a trailing "  )" around it. Explore it on `first_post` like
# the neighboring cells do (`post` only exists once the scraping loop has run),
# and fall back to NaN for rows that carry no neighborhood span.
hood_tag = first_post.find('span', class_='result-hood')
post_hood = re.sub(r" \(|  \)", "", hood_tag.text) if hood_tag is not None else np.nan


In [187]:
# The title link also exposes a `data-id` attribute; the value shown below
# matches the numeric id at the end of the listing URL.
data_id = first_post.find('a', class_='result-title hdrlnk')['data-id']
data_id

'7445351524'

In [170]:
# The <span class="housing"> text holds the "<n>br" and "<n>ft2" tokens padded
# with runs of spaces; collapse each run to a single space before parsing.
content_raw  = first_post.find_all("span", class_="housing")[0].text

content_sub_space = re.sub(' +', ' ',content_raw)
content_sub_space

'\n 1br -\n 833ft2 -\n '

In [204]:
def br_ft2_from_housing(post):
    """Extract bedroom count and square footage from one search-result row.

    Parameters
    ----------
    post : bs4 Tag (or any object with a compatible ``find`` method)
        A single ``li.result-row`` element from the search results page.

    Returns
    -------
    tuple
        ``(br, ft2)``. Each value is an ``int`` when the matching ``<n>br`` /
        ``<n>ft2`` token appears in the row's ``<span class="housing">`` text,
        and ``nan`` otherwise -- including when the row has no housing span.

    Raises
    ------
    ValueError
        If the text in front of ``br`` / ``ft2`` is not an integer.
    """
    br = np.nan
    ft2 = np.nan

    housing = post.find("span", class_="housing")
    if housing is None:
        # Indexing the empty result of find_all() used to raise IndexError here.
        return br, ft2

    # Collapse runs of spaces so the separators below match reliably.
    content = re.sub(r' +', ' ', housing.text)
    for piece in re.split(r' -|; |, |\*|\n ', content):
        if not piece:
            continue
        # Splitting on a unit yields more than one part only when the unit is
        # present; the part in front of the unit is the number.
        ft2_parts = piece.split('ft2')
        if len(ft2_parts) > 1:
            ft2 = int(ft2_parts[0])
        br_parts = piece.split('br')
        if len(br_parts) > 1:
            br = int(br_parts[0])

    return br, ft2


In [167]:
# Splitting on a unit that is absent returns the input as a single-element
# list; a result longer than one element therefore means the unit was present.
re.split("ft2", "833br")

['833br']

In [209]:
from numpy import nan

def br_ft2_from_housing(post):
    """Extract bedroom count, square footage and the raw housing text of a row.

    Parameters
    ----------
    post : bs4 Tag (or any object with a compatible ``find`` method)
        A single ``li.result-row`` element from the search results page.

    Returns
    -------
    tuple
        ``(br, ft2, content)``. ``br`` and ``ft2`` are ``int`` when the matching
        ``<n>br`` / ``<n>ft2`` token appears in the row's
        ``<span class="housing">`` text and ``nan`` otherwise. ``content`` is
        that text with runs of spaces collapsed to one space, or ``""`` when the
        row has no housing span.

    Raises
    ------
    ValueError
        If the text in front of ``br`` / ``ft2`` is not an integer.
    """
    ft2 = nan
    br = nan
    # Default for rows without a housing span. Previously this name was left
    # unbound in that case and the return statement raised UnboundLocalError.
    content_sub_space = ""

    housing = post.find('span', class_='housing')
    if housing is not None:
        # Collapse runs of spaces so the separators below match reliably.
        content_sub_space = re.sub(r' +', ' ', housing.text)
        for piece in re.split(r' -|; |, |\*|\n ', content_sub_space):
            if not piece:
                continue
            # Splitting on a unit yields more than one part only when the unit
            # is present; the part in front of the unit is the number.
            ft2_parts = piece.split('ft2')
            if len(ft2_parts) > 1:
                ft2 = int(ft2_parts[0])
            br_parts = piece.split('br')
            if len(br_parts) > 1:
                br = int(br_parts[0])

    return br, ft2, content_sub_space

# Find the total number of posts to bound the pagination.
results_num = html_soup.find('div', class_= 'search-legend')
results_total = int(results_num.find('span', class_='totalcount').text)  # upper bound of the pages array

# Each page holds 120 posts, so the page offsets are s=0, s=120, s=240, ...;
# step by 120 in np.arange.
# NOTE: when results_total is an exact multiple of 120 the last offset points
# past the final post, so that request would come back without rows.
pages = np.arange(0, results_total+1, 120)

iterations = 0

# One list per output column; each gets exactly one entry per scraped post.
post_timing = []
post_title_texts = []
post_links = []
post_prices = []

post_hoods = []
bedroom_counts = []
sqfts = []
br_ft2_content = []

# Only the first page is scraped for now; re-enable the loop (and indent the
# body below) to walk every offset in `pages`.
#for page in pages:
page=pages[0]
# Same category path ("apa") as the first-page request, so both hit the same listing.
response = get(f"https://{region}.craigslist.org/search/apa?"
               + f"s={page}"  # the parameter for defining the page offset
               + "&hasPic=1"
               + "&availabilityMode=0")
# Warn (do not abort) on status codes other than 200. The message used to
# format the unbound name `requests`, which raised NameError instead of warning.
if response.status_code != 200:
    warn(f'Request: {response.url}; Status code: {response.status_code}')

# Parse the page that was just fetched; previously the rows were re-read from
# the older `html_soup`, so the new response was never used.
page_soup = BeautifulSoup(response.text, 'html.parser')
posts = page_soup.find_all('li', class_='result-row')
for post in posts:
    title_link = post.find('a', class_='result-title hdrlnk')
    hood_tag = post.find('span', class_='result-hood')

    post_prices.append(float(post.a.text.strip().replace("$", "").replace(",","")))
    post_timing.append(post.find("time", class_="result-date")['datetime'])
    post_title_texts.append(title_link.text)
    post_links.append(title_link['href'])
    # Not every row carries a neighborhood span; record NaN instead of failing
    # on `.text` of None so the column lists stay the same length.
    if hood_tag is not None:
        post_hoods.append(re.sub(r" \(|  \)", "", hood_tag.text))
    else:
        post_hoods.append(np.nan)

    br_temp, ft2_temp, content_str = br_ft2_from_housing(post)
    bedroom_counts.append(br_temp)
    sqfts.append(ft2_temp)
    br_ft2_content.append(content_str)

# Random 1-10 s pause so consecutive requests are not sent back to back.
sleep(randint(1,10))

In [211]:
# Assemble the scraped columns into one frame with a single constructor call;
# dict insertion order fixes the column order.
df = pd.DataFrame({
    'price_usd': post_prices,
    'datetime': post_timing,
    'title': post_title_texts,
    'url': post_links,
    'neighborhood': post_hoods,
    'num_bedrooms': bedroom_counts,
    'sqfts': sqfts,
    'br_ft2_content': br_ft2_content,
})
df.head(20)

Unnamed: 0,price_usd,datetime,title,url,neighborhood,num_bedrooms,sqfts,br_ft2_content
0,1319.0,2022-02-12 20:21,"Spacious Floorplans, Close to Restaurants, BBQ...",https://raleigh.craigslist.org/apa/d/raleigh-s...,"1910 Capital Creek Drive , Wake Forest, NC",1,833.0,\n 1br -\n 833ft2 -\n
1,1410.0,2022-02-12 19:56,"Pantry, Business Center, Custom Cabinetry",https://raleigh.craigslist.org/apa/d/raleigh-p...,"1910 Capital Creek Drive , Wake Forest, NC",1,859.0,\n 1br -\n 859ft2 -\n
2,2008.0,2022-02-12 19:34,"Custom Cabinetry, Walk-In Closet, Dual Vanity ...",https://raleigh.craigslist.org/apa/d/raleigh-c...,"1910 Capital Creek Drive , Wake Forest, NC",3,1486.0,\n 3br -\n 1486ft2 -\n
3,1300.0,2022-02-12 19:31,"Courtesy Patrol Officer, Black Appliance Packa...",https://raleigh.craigslist.org/apa/d/chapel-hi...,"2701 Homestead Rd, Chapel Hill, NC",2,932.0,\n 2br -\n 932ft2 -\n
4,1599.0,2022-02-12 19:23,"Pet Friendly. See Pet Rules., Online Rental Pa...",https://raleigh.craigslist.org/apa/d/wake-fore...,"1910 Capital Creek Dr, Wake Forest, NC",1,970.0,\n 1br -\n 970ft2 -\n
5,1662.0,2022-02-12 18:49,"Microwave, Disability Access, On-Site Management",https://raleigh.craigslist.org/apa/d/raleigh-m...,"1910 Capital Creek Drive , Wake Forest, NC",2,1195.0,\n 2br -\n 1195ft2 -\n
6,3200.0,2022-02-12 18:37,Furnished Summer Rental walk to North Hills,https://raleigh.craigslist.org/apa/d/raleigh-f...,North Hills,3,2000.0,\n 3br -\n 2000ft2 -\n
7,1360.0,2022-02-12 18:27,"Online Service Requests, Package Receiving, Cl...",https://raleigh.craigslist.org/apa/d/raleigh-o...,,2,1076.0,\n 2br -\n 1076ft2 -\n
8,1720.0,2022-02-12 18:21,"Close to Freeway, Ceiling Fan, Breakfast Bar",https://raleigh.craigslist.org/apa/d/raleigh-c...,"1910 Capital Creek Drive , Wake Forest, NC",2,1308.0,\n 2br -\n 1308ft2 -\n
9,750.0,2022-02-12 18:02,Lovely single home for rent,https://raleigh.craigslist.org/apa/d/durham-lo...,"505 Bellmeade Bay Dr, Durham, NC",3,1805.0,\n 3br -\n 1805ft2 -\n


In [118]:
# Sanity check: number of prices collected so far.
len(post_prices)

120

In [117]:
# Sanity check: number of timestamps collected so far.
len(post_timing)

120

In [116]:
# Sanity check: number of titles collected so far.
len(post_title_texts)

120

In [23]:
# Result offsets (multiples of 120) used as the `s` query parameter, one per page.
pages 

array([   0,  120,  240,  360,  480,  600,  720,  840,  960, 1080, 1200,
       1320, 1440, 1560, 1680, 1800, 1920, 2040, 2160, 2280, 2400, 2520,
       2640, 2760, 2880, 3000])

In [10]:
# https://towardsdatascience.com/data-science-skills-web-scraping-javascript-using-python-97a29738353f

# driver = webdriver.Firefox()
# driver.get(pageurl)

In [111]:
# Example draw of the random 1-10 value passed to sleep() after a request.
randint(1,10)

7