In [1]:
pip install geopy folium

Collecting folium
  Downloading folium-0.17.0-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting branca>=0.6.0 (from folium)
  Downloading branca-0.7.2-py3-none-any.whl.metadata (1.5 kB)
Collecting xyzservices (from folium)
  Downloading xyzservices-2024.6.0-py3-none-any.whl.metadata (4.0 kB)
Downloading folium-0.17.0-py2.py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.4/108.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading branca-0.7.2-py3-none-any.whl (25 kB)
Downloading xyzservices-2024.6.0-py3-none-any.whl (83 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.9/83.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xyzservices, branca, folium
Successfully installed branca-0.7.2 folium-0.17.0 xyzservices-2024.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnoti

In [4]:
import html
import time

import branca.colormap as cm
import folium
import pandas as pd
import requests
from bs4 import BeautifulSoup
from folium.plugins import MarkerCluster
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
from IPython.display import display

# Geopy setup.
# Nominatim needs an identifying user agent. geopy's default request timeout
# is 1 second, which the public Nominatim server frequently exceeds; a longer
# timeout avoids losing coordinates to spurious GeocoderTimedOut errors.
geolocator = Nominatim(user_agent="wg_scraper", timeout=10)

def get_wg_listings(url, timeout=10):
    """Download a page and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on an unresponsive server.

    Returns:
        BeautifulSoup tree built from the response body with 'html.parser'.
        The HTTP status code is not checked, so error pages are parsed too.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    # Browser-like User-Agent sent along with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

def geocode_address(address):
    """Look up the coordinates of an address with the module-level geolocator.

    Args:
        address: Free-form address string to geocode.

    Returns:
        A (latitude, longitude) tuple for the match returned by the
        geolocator, or (None, None) when the address is empty, nothing is
        found, or the geocoding service reports an error.
    """
    # Nothing to look up: skip the network round trip entirely.
    if not address or not str(address).strip():
        return None, None

    # GeocoderServiceError is the base class of GeocoderTimedOut and of the
    # "service unavailable" / "rate limited" errors, so a single flaky lookup
    # no longer aborts the whole scrape.
    from geopy.exc import GeocoderServiceError

    try:
        location = geolocator.geocode(address)
    except GeocoderServiceError:
        return None, None

    if location:
        return location.latitude, location.longitude
    return None, None

def _element_text(element):
    """Return the stripped text of a parsed element, or None if it is missing."""
    return element.text.strip() if element else None


def _parse_number(element, unit):
    """Convert element text such as '450 €' to a float; None if missing or not numeric."""
    if not element:
        return None
    try:
        return float(element.text.strip().replace(unit, '').replace(' ', ''))
    except ValueError:
        return None


def parse_listing(listing):
    """Extract the fields of a single listing element into a flat dict.

    Args:
        listing: Parsed listing element offering a BeautifulSoup-style
            ``find(name, class_=..., style=...)`` method.

    Returns:
        dict with the keys 'Title', 'WG Type', 'City/District', 'Street',
        'Price', 'Availability', 'Size', 'Owner', 'Online Duration',
        'Latitude' and 'Longitude'. A field that is missing or cannot be
        parsed is None.
    """
    title = listing.find('h3', class_='truncate_title noprint')
    location = listing.find('div', class_='col-xs-11')
    price = listing.find('div', class_='col-xs-3')
    availability = listing.find('div', class_='col-xs-5 text-center')
    size = listing.find('div', class_='col-xs-3 text-right')

    # Owner name and the "online since" label.
    owner = listing.find('span', class_='ml5')
    online_duration = listing.find('span', style='color: #218700;')

    # The location text is read as "<WG type> | <city district> | <street>".
    wg_type, city_district, street = None, None, None
    district = None

    location_text = _element_text(location)
    if location_text:
        location_parts = [part.strip() for part in location_text.split("|")]
        # Pad with None so every name is always bound: previously a text with
        # one part (or more than three) left city_district unassigned and the
        # check below raised UnboundLocalError. Extra parts are ignored.
        wg_type, city_district, street = (location_parts + [None, None])[:3]

    # The district is only filled in when the city part mentions Berlin.
    if city_district and "Berlin" in city_district:
        district = city_district.replace("Berlin", "").strip()

    # Remove the spaces between the dates in the availability text.
    availability_text = availability.text.strip().replace(' ', '') if availability else None

    # Geocode the street; listings without a street keep empty coordinates.
    latitude, longitude = None, None
    if street:
        query = f"{street}, Berlin, Germany"
        latitude, longitude = geocode_address(query)

    return {
        'Title': _element_text(title),
        'WG Type': wg_type,
        'City/District': district,
        'Street': street,
        'Price': _parse_number(price, '€'),
        'Availability': availability_text,
        'Size': _parse_number(size, 'm²'),
        'Owner': _element_text(owner),
        'Online Duration': _element_text(online_duration),
        'Latitude': latitude,
        'Longitude': longitude
    }

def scrape_wg_data(base_url, num_listings, delay=1):
    """Collect up to ``num_listings`` parsed listings from consecutive pages.

    Pages are requested as ``page=0, 1, 2, ...`` until enough listings have
    been gathered, a page contains no listing elements, or a request fails.

    Args:
        base_url: URL of the listing overview page.
        num_listings: Maximum number of listings to return.
        delay: Seconds to pause between two page requests.

    Returns:
        pandas.DataFrame with one row per parsed listing (possibly empty).
    """
    listings_data = []
    page = 0
    # Append the page parameter correctly even if the URL already has a query.
    separator = '&' if '?' in base_url else '?'

    # NOTE(review): this assumes the site honours a `page` query parameter.
    # If it ignores it, every request returns the same page and the rows are
    # duplicates - confirm against the site's real pagination scheme.
    while len(listings_data) < num_listings:
        url = f"{base_url}{separator}page={page}"
        try:
            soup = get_wg_listings(url)
        except requests.RequestException as exc:
            # Keep what has been collected so far instead of losing it all.
            print(f"Stopping at page {page}: request failed ({exc})")
            break

        listings = soup.find_all('div', class_='offer_list_item')
        if not listings:
            break

        # Only parse (and geocode) as many listings as are still needed.
        remaining = num_listings - len(listings_data)
        for listing in listings[:remaining]:
            listings_data.append(parse_listing(listing))

        page += 1
        # Pause only when another page is actually going to be requested.
        if len(listings_data) < num_listings:
            time.sleep(delay)

    return pd.DataFrame(listings_data)

# Scrape data
BASE_URL = 'https://www.wg-gesucht.de/wg-zimmer-in-Berlin.8.0.1.0.html'
NUM_LISTINGS = 500
wg_data_df = scrape_wg_data(BASE_URL, NUM_LISTINGS)

# Keep only rows that can be drawn: coordinates for the position, price for
# the colour scale and size for the popup. An empty scrape result has no
# columns at all, so dropna(subset=...) is skipped to avoid a KeyError.
if wg_data_df.empty:
    print("No listings were scraped; the map will have no markers.")
else:
    wg_data_df = wg_data_df.dropna(subset=['Latitude', 'Longitude', 'Size', 'Price'])

# Create a Folium map centred on Berlin
m = folium.Map(location=[52.5200, 13.4050], zoom_start=11, tiles='cartodbpositron')

# Markers are grouped in a cluster layer
marker_cluster = MarkerCluster().add_to(m)

if wg_data_df.empty:
    # Nothing to colour: no price range and no legend.
    min_price = max_price = colormap = None
else:
    # Create a colormap for price
    min_price = wg_data_df['Price'].min()
    max_price = wg_data_df['Price'].max()
    colormap = cm.LinearColormap(colors=['green', 'yellow', 'red'], vmin=min_price, vmax=max_price, caption='Price (€)')

    # Adding markers to the map
    for _, row in wg_data_df.iterrows():
        color = colormap(row['Price'])
        # Title and street come from the scraped page; escape them so markup
        # in a listing cannot inject HTML or scripts into the saved map.
        safe_title = html.escape(str(row['Title']))
        safe_street = html.escape(str(row['Street']))
        folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            radius=5,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            popup=folium.Popup(f"{safe_title}<br>Price: {row['Price']}€<br>Size: {row['Size']}m²<br>Street: {safe_street}", max_width=300),
            tooltip=safe_title if pd.notna(row['Title']) else None
        ).add_to(marker_cluster)

    # Add colormap legend to the map
    colormap.add_to(m)

# Display the map
display(m)

# Save the map to an HTML file
m.save('map.html')