
# Real Estate Web Scraping Project

This notebook demonstrates a web scraping project focused on extracting real estate data from Redfin. The goal is to collect property information, such as price, number of beds, baths, area and more.

## Libraries Used
- `requests`: For sending HTTP requests to websites.
- `BeautifulSoup`: For parsing HTML and extracting data.
- `pandas`: For organizing and exporting data into a CSV file.


In [1]:
import os
import requests
response = requests.get("https://raw.githubusercontent.com/ziadsalama95/real-estate-web-scraping/main/functions.py")
with open("functions.py", "wb") as file:
    file.write(response.content)
from functions import *
import pandas as pd
from bs4 import BeautifulSoup

# Define the URL and headers for scraping

In [2]:
BASE_URL = 'https://www.redfin.com/city/30749/NY/New-York'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
}

In [3]:
MAX_RESULTS = 15000  # Maximum number of homes to scrape

# Start scraping until the maximum number of results is reached

In [4]:
property_list = []

In [5]:
page_number = 1
while len(property_list) < MAX_RESULTS:
    try:
        response = requests.get(f'{BASE_URL}/page-{page_number}', headers=HEADERS)
        response.raise_for_status()  # Check for request errors
    except requests.exceptions.RequestException as e:
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    containers = soup.find_all('div', {'class': 'HomeCardContainer flex justify-center'})

    if not containers:
        print("No more homes found")
        break

    for container in containers:
        address = get_home_address(container)
        street, neighborhood, zip_code = parse_address(address)
        property_list.append({
            'price': get_home_price(container),
            'beds': get_beds_num(container),
            'baths': get_baths_num(container),
            'area_value': get_area_value(container),
            'area_label': get_area_label(container),
            'street': street,
            'neighborhood': neighborhood,
            'zip_code': zip_code,
            'listing_by': get_listing_by(container)
        })

        if len(property_list) >= MAX_RESULTS:
            break

    print(f"Got {len(property_list)}, page: {page_number}")
    page_number += 1

Got 40, page: 1
Got 80, page: 2
Got 119, page: 3
Got 158, page: 4
Got 197, page: 5
Got 236, page: 6
Got 275, page: 7
Got 314, page: 8
Got 343, page: 9
Got 372, page: 10
Got 401, page: 11
Got 430, page: 12
Got 459, page: 13
Got 488, page: 14
Got 517, page: 15
Got 546, page: 16
Got 575, page: 17
Got 604, page: 18
Got 633, page: 19
Got 662, page: 20
Got 691, page: 21
Got 720, page: 22
Got 749, page: 23
Got 778, page: 24
Got 807, page: 25
Got 836, page: 26
Got 865, page: 27
Got 894, page: 28
Got 923, page: 29
Got 952, page: 30
Got 981, page: 31
Got 1010, page: 32
Got 1039, page: 33
Got 1068, page: 34
Got 1097, page: 35
Got 1126, page: 36
Got 1155, page: 37
Got 1184, page: 38
Got 1213, page: 39
Got 1242, page: 40
Got 1271, page: 41
Got 1300, page: 42
Got 1329, page: 43
Got 1358, page: 44
Got 1387, page: 45
Got 1416, page: 46
Got 1445, page: 47
Got 1474, page: 48
Got 1503, page: 49
Got 1532, page: 50
Got 1561, page: 51
Got 1590, page: 52
Got 1619, page: 53
Got 1648, page: 54
Got 1678, page: 

In [6]:
df = pd.DataFrame(property_list)
df.drop_duplicates(inplace=True)
df.reset_index(inplace=True)
df.shape

(2228, 10)

# Saving the Data

In [7]:
os.makedirs('data', exist_ok=True)
df.to_csv('data/homes.csv', index=False)
df.sample(5)

Unnamed: 0,index,price,beds,baths,area_value,area_label,street,neighborhood,zip_code,listing_by
2035,12180,"$450,000",—,—,4000,sq ft (lot),97 162 Ave Ave,Howard Beach,11414,Howard Beach Realty Inc
1621,9387,"$1,200,000",10,4,2853,sq ft,48 Schenck Ave,Brooklyn,11207,Douglas Elliman Real Estate • Provided by REBNY
1793,10547,"$2,495,000",4,2,3200,sq ft,578 Myrtle Ave,Brooklyn,11205,Corcoran Group • Provided by REBNY
405,806,"$700,000",—,—,2522,sq ft (lot),2998 Fulton St,Brooklyn,11208,J Shayovitz Real Estate Corp
874,4264,"$2,550,000",8,4,—,sq ft,555 W 183rd St,New York,10033,BERKSHIRE HATHAWAY
