
# Real Estate Web Scraping Project

This notebook demonstrates a web scraping project that extracts real estate data from Redfin. The goal is to collect information about each listing — price, number of beds and baths, area, address, and the listing brokerage — and save it to a CSV file.

## Libraries Used
- `requests`: For sending HTTP requests to websites.
- `BeautifulSoup`: For parsing HTML and extracting data.
- `pandas`: For organizing and exporting data into a CSV file.


In [3]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

from functions import *

# Define the URL and headers for scraping

In [5]:
# Redfin search-results URL for New York, NY; the scraping loop appends
# `/page-<n>` to it to walk through the result pages.
BASE_URL = 'https://www.redfin.com/city/30749/NY/New-York'

# Desktop Chrome (Windows) User-Agent string sent with every request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'

# HTTP headers passed to `requests.get`.
HEADERS = {'User-Agent': USER_AGENT}

In [6]:
# Upper bound on the number of homes collected; the scraping loop stops as
# soon as this many listings have been gathered.
MAX_RESULTS: int = 500

# Scrape result pages until the maximum number of results is reached

In [8]:
# Seconds to wait for a response before giving up on a page. Without an
# explicit timeout, `requests` may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

page_number = 1
property_list = []  # one dict per scraped home, in the order encountered

# Walk the paginated results until MAX_RESULTS homes are collected, a request
# fails, or a page contains no listing cards.
while len(property_list) < MAX_RESULTS:
    try:
        response = requests.get(
            f'{BASE_URL}/page-{page_number}',
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses alike.
        # Stop scraping but keep whatever has been collected so far.
        print(f"Request failed: {e}")
        break

    soup = BeautifulSoup(response.content, "html.parser")
    # Listing cards are the <div> elements carrying this class string.
    containers = soup.find_all('div', {'class': 'HomeCardContainer flex justify-center'})

    if not containers:
        # A page without any cards is treated as the end of the results.
        print("No more homes found")
        break

    for container in containers:
        # The field extractors below come from `from functions import *`.
        address = get_home_address(container)
        street, neighborhood, zip_code = parse_address(address)
        property_list.append({
            'price': get_home_price(container),
            'beds': get_beds_num(container),
            'baths': get_baths_num(container),
            'area_value': get_area_value(container),
            'area_label': get_area_label(container),
            'street': street,
            'neighborhood': neighborhood,
            'zip_code': zip_code,
            'listing_by': get_listing_by(container)
        })

        # Stop mid-page so the list never exceeds MAX_RESULTS entries.
        if len(property_list) >= MAX_RESULTS:
            break

    page_number += 1

# Saving the Data

In [10]:
# Destination for the scraped listings.
output_path = Path('data/homes.csv')
# `to_csv` does not create missing directories; make sure `data/` exists so a
# completed scrape is not lost to an OSError on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

df = pd.DataFrame(property_list)
df.to_csv(output_path, index=False)

# Preview up to five random rows. The size is capped at the number of rows
# because `sample` raises ValueError when asked for more rows than exist
# (e.g. when the scrape stopped early).
df.sample(min(5, len(df)))

Unnamed: 0,price,beds,baths,area_value,area_label,street,neighborhood,zip_code,listing_by
210,"$3,350,000",3,2.5,1854,sq ft,50 Bridge Park Dr Unit 3B,Brooklyn,11201,Serhant LLC • Provided by REBNY
361,"$641,175",0,1,515,sq ft,136-80 41 Ave Unit 4H,Flushing,11355,United Real Estate Fortune
46,"$308,000",1,1,—,sq ft,65-05 Yellowstone Blvd Unit 5E,Forest Hills,11375,Tru International Realty Corp
487,"$400,000",—,—,4000,sq ft (lot),365 Forbell St,East New York,11208,St Rose Realty
202,"$950,000",5,2,2100,sq ft,65 Alter Ave,Staten Island,10304,"Ed Bruno Realty, LLC"
