In [1]:
# Import required libraries
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# StreetEasy search results: two-bedroom sales in Murray Hill priced under $2,000,000 USD.
streeteasy_url = 'https://streeteasy.com/2-bedroom-apartments-for-sale/murray-hill/price:-2000000'
# Page numbers to scrape in addition to the first results page (pages 2 and 3).
pages = list(range(2, 4))
# The bare search URL is the first results page; URLs for later pages are added to this list.
page_urls = [streeteasy_url]

In [3]:
# Column headers for the final dataframe, in display order.
streeteasy_df_columns = ['Title', 'Address', 'Price', 'Bedrooms', 'Bathrooms', 'Square Footage']

# One accumulator list per column: title, address, price, bedrooms, bathrooms, and square footage.
streeteasy_titles, streeteasy_addresses, streeteasy_prices = [], [], []
streeteasy_bedrooms, streeteasy_bathrooms, streeteasy_sq_footage = [], [], []

# Pair each column header with its accumulator list.
# The dictionary holds references to the lists above (not copies) and
# will be converted to a pandas dataframe.
streeteasy_dict = dict(zip(
    streeteasy_df_columns,
    (
        streeteasy_titles,
        streeteasy_addresses,
        streeteasy_prices,
        streeteasy_bedrooms,
        streeteasy_bathrooms,
        streeteasy_sq_footage,
    ),
))

In [4]:
# Function to build the list of page URLs to scrape.
def create_page_urls(pages):
    """Add one results-page URL per page number to the global ``page_urls`` list.

    Each URL is ``streeteasy_url`` followed by ``?page=<page>``. A URL that is
    already in ``page_urls`` is skipped, so re-running the cell that calls this
    function does not queue the same page twice (which would make every
    listing on that page be scraped more than once).

    Parameters
    ----------
    pages : iterable
        Page numbers to turn into URLs.

    Returns
    -------
    None
        ``page_urls`` is modified in place and printed.
    """
    for page in pages:
        page_url = streeteasy_url + '?page={}'.format(page)
        # Guard against duplicates from repeated calls with the same pages.
        if page_url not in page_urls:
            page_urls.append(page_url)
    print(page_urls)

In [5]:
def streeteasy_scraper(page_to_scrape):
    """Scrape one search-results page and append its listings to the global column lists.

    For every ``li`` element with class ``searchCardList--listItem`` the title,
    address, price, bedrooms, bathrooms and (when present) square footage are
    extracted and appended to ``streeteasy_titles``, ``streeteasy_addresses``,
    ``streeteasy_prices``, ``streeteasy_bedrooms``, ``streeteasy_bathrooms`` and
    ``streeteasy_sq_footage`` respectively.

    Parameters
    ----------
    page_to_scrape : str
        URL of the results page to request.

    Returns
    -------
    None
        Results are stored in the global lists.

    Raises
    ------
    AttributeError
        If a listing lacks the title, address or price element (``find``
        returns ``None``).
    IndexError
        If a listing has fewer than two detail spans.

    Notes
    -----
    All fields of a listing are parsed before anything is appended, so an
    exception never leaves the column lists with different lengths.
    """
    # Make request to webpage and read in the HTML.
    # User-agent header was added to avoid 403 error.
    req = Request(page_to_scrape, headers={'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'})

    # The context manager closes the HTTP response once the body has been read.
    with urlopen(req) as make_request:
        read_html = make_request.read()

    html_soup = BeautifulSoup(read_html, 'html.parser')

    # Each listing is stored in an unordered list and each listing card is a list item in this unordered list.
    # We are grabbing each list item, li, that has the class searchCardList--listItem since those are the results of our search.
    apts_for_sale = html_soup.find_all('li', class_='searchCardList--listItem')

    # Iterating through each listing in apts_for_sale and grabbing the title of the listing, address, and the price.
    # The title would be something along the lines of 'Condo in Murray Hill' or 'Apartment in Murray Hill.'
    for listing in apts_for_sale:
        title = listing.find('p', class_='listingCardLabel-grey').text.strip('\n').strip()
        address = listing.find('address', class_='listingCard-addressLabel').text.strip('\n')
        # Drop the leading dollar sign and the thousands separators, e.g. '$1,245,000' -> '1245000'.
        pricelist = listing.find('span', class_='price').text.strip('$').split(',')
        price = ''.join(pricelist)

        # The spans that contain information on bathrooms, bedrooms, and square footage have the same class so find all spans.
        listDetailTexts = listing.find_all('span', class_='listingDetailDefinitionsText')
        bedrooms = listDetailTexts[0].text
        bathrooms = listDetailTexts[1].text

        # Not all listings have a square footage so we need to check to make sure there are more than 2 values.
        # The first one is the bedrooms, the second one is the bathrooms, and the third one, if it exists, is the square footage.
        if len(listDetailTexts) > 2:
            # Keep only the first line of the span text and remove thousands separators.
            sqft_of_listing = listDetailTexts[2].text.strip().split('\n')[0].split(',')
            streeteasy_sqft_value = ''.join(sqft_of_listing)
        else:
            streeteasy_sqft_value = 'NA'

        # Append only after every field parsed successfully so all columns stay the same length.
        streeteasy_titles.append(title)
        streeteasy_addresses.append(address)
        streeteasy_prices.append(price)
        streeteasy_bedrooms.append(bedrooms)
        streeteasy_bathrooms.append(bathrooms)
        streeteasy_sq_footage.append(streeteasy_sqft_value)

In [6]:
# Create function to scrape pages and then build the dataframe.
def build_streeteasy_df(list_of_pages):
    """Scrape every URL in ``list_of_pages`` and return the results as a dataframe.

    The global column lists referenced by ``streeteasy_dict`` are emptied
    first, so calling this function again (for example by re-running the
    notebook cell) rebuilds the data instead of stacking a second copy of
    every listing on top of the previous run.

    Parameters
    ----------
    list_of_pages : iterable of str
        URLs of the results pages to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per scraped listing, built from ``streeteasy_dict``.
    """
    # Reset the accumulators in place; streeteasy_dict keeps pointing at the same lists.
    for column_values in streeteasy_dict.values():
        column_values.clear()

    # Use the pages passed by the caller rather than the global page_urls list.
    for page in list_of_pages:
        streeteasy_scraper(page)

    streeteasy_df = pd.DataFrame(data=streeteasy_dict)

    # Uncomment line below to save to a csv.
    #streeteasy_df.to_csv('./streeteasy_data.csv', index=False)
    return streeteasy_df

In [7]:
# Build the list of result-page URLs, then scrape each one into a dataframe.
create_page_urls(pages)
# Bind the result to a name so later cells can use it without relying on `_` / `Out[...]`.
streeteasy_df = build_streeteasy_df(page_urls)
# A bare last expression renders the dataframe as the cell output.
streeteasy_df

['https://streeteasy.com/2-bedroom-apartments-for-sale/murray-hill/price:-2000000', 'https://streeteasy.com/2-bedroom-apartments-for-sale/murray-hill/price:-2000000?page=2', 'https://streeteasy.com/2-bedroom-apartments-for-sale/murray-hill/price:-2000000?page=3']


Unnamed: 0,Title,Address,Price,Bedrooms,Bathrooms,Square Footage
0,Condo in Murray Hill,308 East 38th Street #5A,1245000,2 Beds,2 Baths,1115.0
1,Condo in Murray Hill,333 East 34th Street #4L,1295000,2 Beds,2 Baths,1185.0
2,Condo in Murray Hill,630 First Avenue #17K,1260000,2 Beds,2 Baths,983.0
3,Co-op in Murray Hill,242 East 38th Street #6C,750000,2 Beds,1 Bath,
4,Condo in Murray Hill,308 East 38th Street #8B,1399000,2 Beds,2 Baths,1190.0
5,Condo in Murray Hill,250 East 40th Street #17B,1250000,2 Beds,2 Baths,1002.0
6,Condo in Murray Hill,250 East 40th Street #18B,1295000,2 Beds,2 Baths,1100.0
7,Co-op in Murray Hill,305 East 40th Street #12K,930000,2 Beds,1 Bath,
8,Condo in Murray Hill,143 East 34th Street #9N,1625000,2 Beds,2 Baths,1200.0
9,Condo in Murray Hill,333 East 34th Street #PHC,1395000,2 Beds,2 Baths,1200.0
