
# Real Estate Web Scraping Project

This notebook demonstrates a web scraping project that extracts real estate data from Redfin. The goal is to collect property information such as price, number of beds and baths, area, address, and the listing agent.

## Libraries Used
- `requests`: For sending HTTP requests to websites.
- `BeautifulSoup`: For parsing HTML and extracting data.
- `pandas`: For organizing and exporting data into a CSV file.


In [None]:
import os
import time

import requests
# Fetch the project's helper module (functions.py) and save it next to the
# notebook so it can be imported below.
# NOTE(review): this downloads and then executes code from a mutable branch
# ("main"); consider pinning the URL to a commit SHA for reproducibility.
FUNCTIONS_URL = "https://raw.githubusercontent.com/ziadsalama95/real-estate-web-scraping/main/functions.py"
response = requests.get(FUNCTIONS_URL, timeout=30)
# Fail loudly on a 4xx/5xx instead of writing an error page into functions.py,
# which would only surface later as a confusing import error.
response.raise_for_status()
with open("functions.py", "wb") as file:
    file.write(response.content)
# Star import kept for compatibility: the scraping cells use the get_* helpers
# and parse_address defined in this module.
from functions import *
import pandas as pd
from bs4 import BeautifulSoup

# Define the URL and headers for scraping

In [None]:
# Redfin search-results URL; page suffixes are appended to it when scraping.
BASE_URL = 'https://www.redfin.com/city/30749/NY/New-York'

# HTTP headers sent with each request; only a browser User-Agent is set.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/126.0.0.0 Safari/537.36'
    ),
}

In [None]:
# Upper bound on how many homes the scraping loop collects before stopping.
MAX_RESULTS = 15_000

# Scrape listing pages until the maximum number of results is reached or no more homes are found

In [None]:
# Accumulator for scraped homes: the scraping loop appends one dict per listing.
property_list = list()

In [None]:
# Walk the paginated search results (BASE_URL/page-N), appending one record
# per home card to property_list until MAX_RESULTS is reached, a page has no
# home cards, or a page keeps failing.
MAX_RETRIES = 3       # consecutive failed requests tolerated for one page
REQUEST_TIMEOUT = 30  # seconds before a stalled request is abandoned

page_number = 1
failed_attempts = 0
while len(property_list) < MAX_RESULTS:
    try:
        response = requests.get(
            f'{BASE_URL}/page-{page_number}',
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()  # Check for request errors
    except requests.exceptions.RequestException as e:
        # Retry the same page a bounded number of times. Retrying forever
        # (as a bare `continue` would) never terminates on a persistent
        # 4xx/5xx and hammers the server with back-to-back requests.
        failed_attempts += 1
        print(f"Request for page {page_number} failed ({failed_attempts}/{MAX_RETRIES}): {e}")
        if failed_attempts >= MAX_RETRIES:
            print(f"Giving up on page {page_number}; stopping")
            break
        time.sleep(2 ** failed_attempts)  # exponential back-off: 2s, 4s, ...
        continue
    failed_attempts = 0  # the page loaded, so the next page starts fresh

    soup = BeautifulSoup(response.content, "html.parser")
    # A string with spaces is compared against the exact class attribute
    # value, so this depends on Redfin's current markup.
    containers = soup.find_all('div', {'class': 'HomeCardContainer flex justify-center'})

    if not containers:
        print("No more homes found")
        break

    for container in containers:
        # The get_* helpers and parse_address come from functions.py;
        # parse_address is unpacked as a 3-tuple here.
        address = get_home_address(container)
        street, neighborhood, zip_code = parse_address(address)
        property_list.append({
            'price': get_home_price(container),
            'beds': get_beds_num(container),
            'baths': get_baths_num(container),
            'area_value': get_area_value(container),
            'area_label': get_area_label(container),
            'street': street,
            'neighborhood': neighborhood,
            'zip_code': zip_code,
            'listing_by': get_listing_by(container)
        })

        # Stop mid-page so the list never exceeds MAX_RESULTS.
        if len(property_list) >= MAX_RESULTS:
            break

    print(f"Got {len(property_list)}, page: {page_number}")
    page_number += 1

In [None]:
# Build the listings table from the scraped records and remove exact
# duplicate rows.
# reset_index(drop=True) renumbers the rows and discards the old labels;
# without drop=True they are kept as an extra "index" column, which would
# also be written to the CSV.
df = (
    pd.DataFrame(property_list)
    .drop_duplicates()
    .reset_index(drop=True)
)
df.shape

# Saving the Data

In [None]:
# Write the table to data/homes.csv, creating the folder if needed.
os.makedirs('data', exist_ok=True)
df.to_csv('data/homes.csv', index=False)
# Preview up to five random rows. The size is clamped because DataFrame.sample
# raises ValueError when asked for more rows than the frame contains.
df.sample(min(5, len(df)))