In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_wg_listings(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

def parse_listing(listing):
    title = listing.find('h3', class_='truncate_title noprint')
    location = listing.find('div', class_='col-xs-11')
    price = listing.find('div', class_='col-xs-3')
    availability = listing.find('div', class_='col-xs-5 text-center')
    size = listing.find('div', class_='col-xs-3 text-right')

    # Using BeautifulSoup to find the owner and amount of time spent online
    owner = listing.find('span', class_='ml5')
    online_duration = listing.find('span', style='color: #218700;')

    # Extracting location information
    location_text = location.text.strip() if location else None
    wg_type, district, street = None, None, None

    if location_text:
        location_parts = [part.strip() for part in location_text.split("|")]
        if len(location_parts) == 3:
            wg_type, city_district, street = location_parts
        elif len(location_parts) == 2:
            wg_type, city_district = location_parts
        elif len(location_parts) == 1:
            wg_type = location_parts[0]

        if city_district and "Berlin" in city_district:
            district = city_district.replace("Berlin", "").strip()

    # Remove extra spaces between dates in availability
    availability_text = availability.text.strip().replace(' ', '') if availability else None

    data = {
        'Title': title.text.strip() if title else None,
        'WG Type': wg_type,
        'City/District': district,
        'Street': street,
        'Price': f"{price.text.strip().replace('€', '').replace(' ', '')} €" if price else None,
        'Availability': availability_text,
        'Size': f"{size.text.strip().replace('m²', '').replace(' ', '')} m²" if size else None,
        'Owner': owner.text.strip() if owner else None,
        'Online Duration': online_duration.text.strip() if online_duration else None
    }

    return data

def scrape_wg_data(base_url, num_listings):
    listings_data = []
    page = 0

    while len(listings_data) < num_listings:
        url = f"{base_url}?page={page}"
        soup = get_wg_listings(url)
        listings = soup.find_all('div', class_='offer_list_item')
        
        if not listings:
            break

        for listing in listings:
            data = parse_listing(listing)
            listings_data.append(data)
            if len(listings_data) >= num_listings:
                break

        page += 1
        time.sleep(1)  # In order to prevent sending too many requests at once,

    df = pd.DataFrame(listings_data)
    return df

# Scrape data
BASE_URL = 'https://www.wg-gesucht.de/wg-zimmer-in-Berlin.8.0.1.0.html'
NUM_LISTINGS = 200
wg_data_df = scrape_wg_data(BASE_URL, NUM_LISTINGS)
wg_data_df

Unnamed: 0,Title,WG Type,City/District,Street,Price,Availability,Size,Owner,Online Duration
0,BEST AREA OF KREUZBERG - SUNNY & CHARMING ROOM...,2er WG,Kreuzberg,Nahe Mehringdamm,35 €,01.08.2024\n\n-30.09.2024,20 m²,Kreuzberg Privat,Online: 5 Minuten
1,Haus Wg sucht Zuwachs,6er WG,Hermsdorf,Auguste-Viktoria-Str.14,598 €,01.08.2024,17 m²,Flora Rost,Online: 5 Minuten
2,* SONNIGES CHARMANTES ZIMMER IN BESTER LAGE KR...,2er WG,Kreuzberg,Nahe Mehringdamm,725 €,01.08.2024\n\n-31.08.2024,20 m²,Kreuzberg Privat,Online: 5 Minuten
3,★ 27m2 Bedroom ★ Bright and Clean ★,2er WG,Wedding,Exerziertrasse 21,750 €,20.07.2024,27 m²,Laura,Online: 8 Minuten
4,"Super central, registration, S-Bahncycle, Pren...",3er WG,Prenzlauer Berg,Wichertstraße,798 €,01.08.2024\n\n-31.01.2025,26 m²,Frau Klinck,Online: 8 Minuten
...,...,...,...,...,...,...,...,...,...
195,SUBLET 19th July - 24th August 2024 (37 nights...,2er WG,Neukölln,Sonnenallee,600 €,19.08.2024\n\n-24.08.2024,21 m²,Levke,Online: 9 Minuten
196,♥️ Großes 24qm Altbau Zimmer☀️ 96qm 3er Wg- Mö...,3er WG,Pankow,Kreuzstraße 15,670 €,01.08.2024,24 m²,Leonie S.,Online: 10 Minuten
197,Appartement in Friedrichshain for short time,2er WG,,Fr.,42 €,28.07.2024\n\n-01.12.2024,20 m²,Eric S,Online: 11 Minuten
198,Furnished comfortable room for female in 2 bed...,2er WG,Tempelhof,Steinhellenweg 18,729 €,31.07.2024\n\n-31.12.2026,16 m²,Gaya,Online: 11 Minuten
