# Web Scraping Rental Apartment Data with Python and BeautifulSoup

Data from: 
- https://www.immoscout24.ch/de/immobilien/mieten/ort-zuerich?pn=1  
- https://www.immoscout24.ch/de/immobilien/mieten/ort-zuerich?pn=2  

## Libraries and settings

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from pathlib import Path

## Example of scraping data from websites with BeautifulSoup

In [None]:
# Load HTML from local files (offline) instead of fetching the website
html_paths = [
    Path('./data/immo_listings_pn_01.html'),
    Path('./data/immo_listings_pn_02.html'),
]

# Read every saved page once and parse it; the three lists stay index-aligned
html_texts = [p.read_text(encoding='utf-8') for p in html_paths]
soups = [BeautifulSoup(t, 'html.parser') for t in html_texts]

# Extract listing blocks from each file (hashed class names -> match by substring)
listings = []
for p, s in zip(html_paths, soups):
    # Keep the per-page result under its own name: assigning it to `listings`
    # would overwrite the accumulator, and `listings.extend(listings)` would
    # then just double the current page instead of combining all pages.
    page_listings = s.select('div[class*="HgListingRoomsLivingSpacePrice_roomsLivingSpacePrice_"]')
    print(f"{p.name}: {len(page_listings)} listing blocks")
    listings.extend(page_listings)

print('Total listing blocks:', len(listings))

# Show first 10 listing blocks (from combined pages)
listings[:10]

In [None]:
# Debug: print basic facts about every loaded HTML file (resolved path, size,
# page title, the first 400 characters, and whether the listing marker occurs)
for path, raw_html, parsed in zip(html_paths, html_texts, soups):
    print('Source:', path.resolve())
    print('Length HTML:', len(raw_html))

    # A page without a <title> element is reported as None
    title_tag = parsed.title
    page_title = title_tag.get_text(strip=True) if title_tag else None
    print('Title:', page_title)

    # Flatten newlines so the preview stays on one output line
    sample = raw_html[:400].replace('\n', ' ')
    print('HTML sample:', sample)

    has_marker = 'HgListingRoomsLivingSpacePrice' in raw_html
    print('Contains HgListingRoomsLivingSpacePrice:', has_marker)
    print('---')

In [None]:
# Parse each saved HTML text a second time and print file name, page title and
# text length, so the output can be compared with the first parse
for html_file, html_text in zip(html_paths, html_texts):
    soup_check = BeautifulSoup(html_text, 'html.parser')
    title_tag = soup_check.title
    page_title = title_tag.get_text(strip=True) if title_tag else None
    print(html_file.name, '| title:', page_title, '| length:', len(html_text))

In [None]:
# Optional note: If you scrape the live site, you may need a real browser engine (JS rendering / bot checks).
# In this exercise we run fully offline from the saved pages
# ./data/immo_listings_pn_01.html and ./data/immo_listings_pn_02.html,
# so no browser rendering step is performed.
print('Offline mode: using local HTML file; browser rendering step skipped.')

## Web Scraper function

In [None]:
# Function to extract the listings from a given URL OR a local HTML file
def _text_or_na(tag):
    """Return the stripped text of *tag*, or 'N/A' when no tag was found."""
    return tag.get_text(strip=True) if tag is not None else 'N/A'


def extract_listing_info(source, timeout=30):
    """Extract one record per listing card from a local HTML file or a URL.

    Args:
        source: A ``pathlib.Path`` (always read as a local UTF-8 file), or a
            string. A string ending in ``.html``/``.htm`` that names an
            existing file is read from disk; anything else is fetched with
            ``requests.get``.
        timeout: Seconds to wait for the server when fetching a URL, so a
            stalled connection cannot block the notebook forever.

    Returns:
        A list of dicts with the keys ``address_raw``, ``rooms_raw``,
        ``area_raw``, ``price_raw``, ``title_raw`` and ``description_raw``.
        Every value is a string; elements missing from a card are ``'N/A'``.

    Raises:
        FileNotFoundError: If ``source`` is a ``Path`` that does not exist.
        requests.HTTPError: If the server answers with an error status.
    """
    if isinstance(source, Path):
        # A Path object can only mean a local file; reading it directly gives
        # a clear FileNotFoundError instead of an "invalid URL" error.
        html = source.read_text(encoding='utf-8')
    else:
        candidate = Path(str(source))
        if candidate.suffix.lower() in ('.html', '.htm') and candidate.exists():
            html = candidate.read_text(encoding='utf-8')
        else:
            response = requests.get(source, timeout=timeout)
            response.raise_for_status()
            html = response.text

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    extracted_data = []
    # Class names carry a hashed suffix that changes, so match by substring
    for listing in soup.select('div[class*="HgListingCard_info_"]'):
        rooms, living_area, price = 'N/A', 'N/A', 'N/A'

        info_section = listing.select_one(
            'div[class*="HgListingRoomsLivingSpacePrice_roomsLivingSpacePrice_"]'
        )
        if info_section is not None:
            # NOTE(review): rooms and area are taken by position (1st and 2nd
            # <strong>); if a card omits the rooms value, the area would end
            # up in rooms_raw -- confirm against the saved HTML.
            strong_texts = [
                strong.get_text(strip=True)
                for strong in info_section.find_all('strong')
            ]
            rooms, living_area = (strong_texts + ['N/A', 'N/A'])[:2]
            price = _text_or_na(
                info_section.select_one('span[class*="HgListingRoomsLivingSpacePrice_price_"]')
            )

        extracted_data.append({
            'address_raw': _text_or_na(listing.find('address')),
            'rooms_raw': rooms,
            'area_raw': living_area,
            'price_raw': price,
            'title_raw': _text_or_na(
                listing.select_one('p[class*="HgListingDescription_title_"]')
            ),
            'description_raw': _text_or_na(
                listing.select_one(
                    'p[class*="HgListingDescription_extra-large_"], p[class*="HgListingDescription_large_"]'
                )
            ),
        })

    return extracted_data

## Call the Web Scraper function to get apartment data from the saved pages

In [None]:
# Parse multiple saved HTML pages (offline)
sources = [
    Path('./data/immo_listings_pn_01.html'),
    Path('./data/immo_listings_pn_02.html'),
]

# Append the records of each page, in page order, to one combined list
data = []
for src in sources:
    data += extract_listing_info(src)

# One row per listing, one column per extracted field
df = pd.DataFrame(data)

# Preview the first rows
df.head()

## Call the Web Scraper function to get apartment data

In [None]:
# Offline version: parse multiple saved HTML pages
sources = [
    Path('./data/immo_listings_pn_01.html'),
    Path('./data/immo_listings_pn_02.html'),
]

# Scrape every page and flatten the per-page record lists into one list
data = [
    record
    for page_records in map(extract_listing_info, sources)
    for record in page_records
]

# Build the table and persist it without the index column
df = pd.DataFrame(data)
df.to_csv('./data/rental_apartments_zuerich.csv', index=False)

print("\nData saved to rental_apartments_zuerich.csv")

# Preview the first rows
df.head()

## Extract prices from strings using a regular expression (regex)

In [None]:
# Build the numeric 'price' column from 'price_raw' in one chained expression:
#   1. pull out the first run of digits (optionally containing the ’ thousands
#      separator) that starts and ends with a digit,
#   2. strip the separator,
#   3. convert to float (rows without a match stay NaN).
df['price'] = (
    df['price_raw']
    .str.extract(r'(\d[\d’]*\d)', expand=False)
    .str.replace('’', '', regex=False)
    .astype(float)
)

# Show the first few rows of the DataFrame (selected columns)
df[['address_raw', 'rooms_raw', 'area_raw', 'price_raw', 'price']].head()

## Create histogram of prices

In [None]:
# Draw the distribution of prices as a 30-bin histogram on a 7x4 inch figure
fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(df['price'], bins=30, color='greenyellow', edgecolor='gray')
ax.set_title('Histogram of prices')
ax.set_xlabel('Price (CHF)')
ax.set_ylabel('Frequency')
plt.show()


### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Print a short environment summary framed by two rule lines:
# OS family, platform name and release, current local time, Python version
rule = '-----------------------------------'
print(rule)
print(os.name.upper())
print(platform.system(), platform.release(), sep=' | ')
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print(rule)