In [44]:
import requests
from bs4 import BeautifulSoup
import csv
import logging
import re

In [45]:
def fetch_page(url, timeout=10):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on an unresponsive host,
            which would hang the notebook kernel.

    Returns:
        A ``BeautifulSoup`` object built with the ``"html.parser"`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were listings.
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")

In [46]:
# Fetch the first page of one-bedroom Manhattan rentals and collect the listing cards.
url = "https://streeteasy.com/for-rent/manhattan/beds:1?page=1"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0"
    )
}

# A timeout keeps an unresponsive server from hanging the kernel indefinitely.
resp = requests.get(url, headers=headers, timeout=10)
# Stop on 4xx/5xx responses; otherwise an error page is parsed and silently
# reported as "Found 0 listings".
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# NOTE(review): "sc-541ed69f-1" looks like a build-generated class name and may
# change between site deployments - confirm the selector if no cards are found.
cards = soup.select('li.sc-541ed69f-1')
print(f"Found {len(cards)} listings")

Found 14 listings


In [47]:
# Use the first listing card as the working sample for the field extractors below.
if not cards:
    # Same exception type as a bare cards[0], but with a message that points at the cause.
    raise IndexError(
        "No listing cards were found; check the response status and the card selector."
    )
card = cards[0]
# Uncomment to inspect the raw HTML of the sample card:
# print(card.prettify())

In [48]:
"""
Parse the title string to extract the unit type and neighborhood.

Challenges with the raw title text:
- Sometimes the word "in" is jammed without spaces, e.g. "Rental unitin Chelsea".
  This causes simple splits on " in " to fail.
- Other times, "in" appears correctly as a separate word, e.g. "Mixed-use Building in Midtown".
- We need to fix the jammed "in" case without breaking correctly spaced titles.
- Naively replacing all occurrences of "in" can incorrectly split words like "Building".

How this function solves the issues:
- First, normalize all whitespace to single spaces.
- Then, fix the jammed "in" only where a word character is immediately followed by "in" and a space,
  e.g. turning "unitin " into "unit in ".
- Finally, split on the first correctly spaced " in " to separate unit type and neighborhood,
  preserving the integrity of words like "Building".

This approach balances correcting bad formatting while preserving correct ones.
"""

def parse_title(title_text):
    """Extracts unit type and neighborhood from a title string.

    Titles look like "<unit type> in <neighborhood>", but the separator is
    sometimes glued to the preceding word ("Rental unitin Chelsea").

    Args:
        title_text: Raw title text; may be None or empty.

    Returns:
        A ``(unit_type, neighborhood)`` tuple. ``(None, None)`` for empty
        input, and ``(cleaned_title, None)`` when no " in " separator can be
        found or reconstructed.
    """
    if not title_text:
        return None, None

    # Normalize all whitespace
    cleaned = re.sub(r'\s+', ' ', title_text).strip()

    # Fix jammed 'in' (e.g., "unitin Chelsea" -> "unit in Chelsea"), but only
    # when the title has no correctly spaced ' in ' already. Otherwise the
    # repair would cut into ordinary words that end in "in": a unit type such
    # as "Cabin", or a neighborhood such as "Dublin Heights".
    if " in " not in cleaned:
        cleaned = re.sub(r'(\w)in ', r'\1 in ', cleaned, count=1)

    # Split on the first ' in '; an empty separator means there was none.
    unit_type, separator, neighborhood = cleaned.partition(" in ")
    if not separator:
        return cleaned, None
    return unit_type.strip(), neighborhood.strip()

In [49]:
# Read the headline of the sample card and split it into unit type / neighborhood.
title_tag = card.select_one("p.ListingDescription-module__title___B9n4Z")
if title_tag is not None:
    full_title = title_tag.get_text(strip=True)
    unit_type, neighborhood = parse_title(full_title)
    for label, value in (("Unit Type:", unit_type), ("Neighborhood:", neighborhood)):
        print(label, value)

Unit Type: Rental unit
Neighborhood: East Village


In [50]:
# Spot-check parse_title on both jammed ("unitin") and correctly spaced titles.
examples = [
    "Rental unitin Chelsea", "Co-op in Kips Bay", "Condoin Hell's Kitchen",
    "Studio in Midtown", "Mixed-use Building in Midtown",
]

for example in examples:
    parsed = parse_title(example)
    t, n = parsed
    print(f"Type: {t} | Neighborhood: {n}")

Type: Rental unit | Neighborhood: Chelsea
Type: Co-op | Neighborhood: Kips Bay
Type: Condo | Neighborhood: Hell's Kitchen
Type: Studio | Neighborhood: Midtown
Type: Mixed-use Building | Neighborhood: Midtown


In [51]:
# The address link carries both the street address (its text) and the listing URL (its href).
address_tag = card.select_one("a.ListingDescription-module__addressTextAction___xAFZJ")
if address_tag is not None:
    address = address_tag.get_text(strip=True)
    url = address_tag["href"]
    for label, value in (("Address:", address), ("URL:", url)):
        print(label, value)

Address: 153 Avenue A #12J
URL: https://streeteasy.com/building/153-avenue-a-new_york/12j?featured=1


In [54]:
# Extract price, bedroom count and bathroom count from the sample card.
# Each value falls back to None when its element or its number is missing.
price_tag = card.select_one('span[class*="PriceInfo-module__price"]')
price_text = price_tag.get_text(strip=True) if price_tag else None
# Only convert when a price was found; None has no .replace().
price = int(price_text.replace('$', '').replace(',', '')) if price_text else None

bed_tag = card.select_one('ul[class*="BedsBathsSqft"] li:nth-of-type(1) span')
# Test bed_tag itself: the previous condition referenced an undefined name `bed`.
beds_text = bed_tag.get_text(strip=True) if bed_tag else None
# re.search returns None when the text holds no number, so check before .group().
beds_match = re.search(r'-?\d+\.?\d*', beds_text) if beds_text else None
beds = float(beds_match.group()) if beds_match else None

bath_tag = card.select_one('ul[class*="BedsBathsSqft"] li:nth-of-type(2) span')
# Test bath_tag itself: the previous condition referenced an undefined name `bath`.
baths_text = bath_tag.get_text(strip=True) if bath_tag else None
baths_match = re.search(r'-?\d+\.?\d*', baths_text) if baths_text else None
baths = float(baths_match.group()) if baths_match else None

# print(f"Price text: {price_text}")
print(f"Price: {price}")
# print(f"Beds text: {beds_text}")
print(f"Beds: {beds}")
# print(f"Baths text: {baths_text}")
print(f"Baths: {baths}")

Price: 3495
Beds: 1.0
Baths: 1.0


In [56]:
# Extract the square footage from the third item of the beds/baths/sqft list.
sqft_tag = card.select_one('ul[class*="BedsBathsSqft"] li:nth-of-type(3) span')
sqft_text = sqft_tag.get_text(strip=True) if sqft_tag else ""

sqft_clean = None
if sqft_text and sqft_text != "-ft²":
    # Pull the number with a regex instead of filtering on str.isdigit():
    # isdigit() is True for the superscript "²", so "574 ft²" would become
    # "574²" and int() would raise ValueError.
    sqft_match = re.search(r'\d[\d,]*', sqft_text)
    if sqft_match:
        sqft_clean = int(sqft_match.group(0).replace(",", ""))

print(f"Square footage: {sqft_clean}")

Square footage: None


In [60]:
def parse_sqft(sqft_text):
    """Convert a square-footage label such as '574 ft²' into an integer.

    Only the leading run of digits (with optional thousands commas) is used,
    so the superscript 2 in "ft²" is ignored. Returns None for empty or None
    input, for the "-ft²" placeholder, and for text that contains no number.
    """
    # Nothing to parse: missing text or the site's "no value" placeholder.
    if not sqft_text or sqft_text.strip() == "-ft²":
        return None

    found = re.search(r'\d[\d,]*', sqft_text)  # e.g. "1,200" or "574"
    if found is None:
        return None

    digits = found.group(0).replace(",", "")
    return int(digits)

# Spot-check parse_sqft: plain number, thousands comma, placeholder, None, single digit.
examples = ["574 ft²", "1,200 ft²", "-ft²", None, "9 ft²"]

for example in examples:
    sqft = parse_sqft(example)
    print(f"Square Footage: {sqft}")

Square Footage: 574
Square Footage: 1200
Square Footage: None
Square Footage: None
Square Footage: 9
