# Getting Real Estate Data from Homegate - per WebScraping

In [105]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd
import csv as csv
import io
import datetime

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Get current working directory
print(os.getcwd())

/workspaces/applied_research_methods/Assignment/getting_basics


# Import Data

In [70]:
# Show .csv - files in the directory
flist = fnmatch.filter(os.listdir('.'), '*.csv')
for i in flist:
    print(i)

# 1. Define your desired column names
column_names = [
    'web_scraper_order', 'web_scraper_start_url', 'rent_raw', 
    'rooms_raw', 'area_raw', 'address_raw', 'description_raw', 
    'text_raw', 'walkingtime_raw', 'nearest_statioon_raw'
]

# 2. "Unwrap" the file: Read lines, strip garbage, and parse the outer CSV layer
inner_csv_lines = []

with open('properties_zuerich_data.csv', 'r', encoding='utf-8') as f:
    # Skip the ~844 lines of metadata/garbage at the top
    # Adjust this number if your file changes
    for i in range(844):
        next(f)
        
    # Read the valid lines
    reader_lines = []
    for line in f:
        # Remove the trailing ';;;;;;' and newlines
        clean_line = line.strip().rstrip(';')
        if clean_line:
            reader_lines.append(clean_line)
            
    # Use csv.reader to automatically handle the outer quotes
    # This converts '"123","URL","Rent"' -> '123,URL,Rent'
    wrapper_reader = csv.reader(reader_lines, quotechar='"', delimiter=';')
    
    for row in wrapper_reader:
        if row:
            # The first element (row[0]) is the actual inner CSV string
            inner_csv_lines.append(row[0])

# 3. Load the unwrapped data into Pandas
# We create a virtual file buffer from the clean strings
inner_csv_buffer = io.StringIO('\n'.join(inner_csv_lines))

df = pd.read_csv(
    inner_csv_buffer,
    sep=',',             # Now it is a standard comma-separated file
    names=column_names,  # Force your column names
    index_col=False,     # Ensure Pandas doesn't use columns as an index
    on_bad_lines='skip'  # Skip any corrupted lines that might remain
)

# 4. Remove the header row if it ended up in the data
if str(df.iloc[0]['web_scraper_order']).startswith('web_scraper_order'):
    df = df.iloc[1:].reset_index(drop=True)

# Check the result
print("Shape:", df.shape)
print(df.head())


apartments_data_prepared_zuerich.csv
properties_zuerich_data.csv
Shape: (1055, 10)
  web_scraper_order                              web_scraper_start_url  \
0    1763742793-844  https://www.homegate.ch/mieten/immobilien/ort-...   
1    1763742793-846  https://www.homegate.ch/mieten/immobilien/ort-...   
2    1763742793-847  https://www.homegate.ch/mieten/immobilien/ort-...   
3    1763742793-849  https://www.homegate.ch/mieten/immobilien/ort-...   
4    1763742793-850  https://www.homegate.ch/mieten/immobilien/ort-...   

              rent_raw   rooms_raw area_raw  \
0    CHF 990.– / Woche    1 Zimmer     23m²   
1  CHF 1’850.– / Monat  2.5 Zimmer     74m²   
2  CHF 5’490.– / Monat  3.5 Zimmer     94m²   
3  CHF 4’075.– / Monat  3.5 Zimmer    101m²   
4  CHF 4’095.– / Monat  3.5 Zimmer    101m²   

                         address_raw  \
0        Baurstrasse 29, 8008 Zürich   
1  Imbisbühlstrasse 120, 8049 Zürich   
2  Mühlebachstrasse 121, 8008 Zürich   
3       Josefstrasse 23, 8005

## Count number of rows and columns in the data frame

In [71]:
# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows
print('Number of rows:', df.shape[0])

# Number of columns
print('Number of columns:', df.shape[1])

Dimension: (1055, 10)
Number of rows: 1055
Number of columns: 10


## Get data types (raw-format from web scraping)

In [72]:
# Get data types (note that in pandas, a string is referred to as 'object')
df.dtypes

web_scraper_order        object
web_scraper_start_url    object
rent_raw                 object
rooms_raw                object
area_raw                 object
address_raw              object
description_raw          object
text_raw                 object
walkingtime_raw          object
nearest_statioon_raw     object
dtype: object

## Count and identify missing values

In [73]:
# Count missing values
print(pd.isna(df).sum())

# Identify rows with missing values
df[df.isna().any(axis=1)].head(15)

web_scraper_order          0
web_scraper_start_url      0
rent_raw                 557
rooms_raw                562
area_raw                 615
address_raw              557
description_raw          684
text_raw                 873
walkingtime_raw          468
nearest_statioon_raw     527
dtype: int64


Unnamed: 0,web_scraper_order,web_scraper_start_url,rent_raw,rooms_raw,area_raw,address_raw,description_raw,text_raw,walkingtime_raw,nearest_statioon_raw
1,1763742793-846,https://www.homegate.ch/mieten/immobilien/ort-...,CHF 1’850.– / Monat,2.5 Zimmer,74m²,"Imbisbühlstrasse 120, 8049 Zürich",,,,
3,1763742793-849,https://www.homegate.ch/mieten/immobilien/ort-...,CHF 4’075.– / Monat,3.5 Zimmer,101m²,"Josefstrasse 23, 8005 Zürich",Modernes Wohnen direkt am Hauptbahnhof,Per 01.10. oder nach Vereinbarung vermieten wi...,,
4,1763742793-850,https://www.homegate.ch/mieten/immobilien/ort-...,CHF 4’095.– / Monat,3.5 Zimmer,101m²,"Josefstrasse 23, 8005 Zürich",Zentraler Wohntraum - Charmantes Leben in der ...,Per 01.10. oder nach Vereinbarung vermieten wi...,,
5,1763742793-851,https://www.homegate.ch/mieten/immobilien/ort-...,CHF 3’060.– / Monat,2.5 Zimmer,63m²,"Spiserstrasse 4, 8047 Zürich",Willkommen im neuen Hotspot für urbanes Leben ...,URBAN. INTERNATIONAL. INDIVIDUELL. Willkommen ...,,
6,1763742793-852,https://www.homegate.ch/mieten/immobilien/ort-...,CHF 900.– / Monat,35m² Wohnfläche,,"Erchenbühlstrasse, 8046 Zürich",,,,
7,1763742793-853,https://www.homegate.ch/mieten/immobilien/ort-...,CHF 900.– / Monat,250m² Wohnfläche,,"Erchenbühlstrasse, 8046 Zürich",,,,
8,1763742793-854,https://www.homegate.ch/mieten/immobilien/ort-...,CHF 3’510.– / Monat,2.5 Zimmer,87m²,"Ottikerstrasse, 8006 Zürich",,,,
9,1763742793-855,https://www.homegate.ch/mieten/immobilien/ort-...,CHF 2’600.– / Monat,2 Zimmer,47m²,"Forsterstrasse, 8044 Zürich",2 Zimmerwohnung mit Schwimmbad- und Saunazugan...,,,
10,1763742793-856,https://www.homegate.ch/mieten/immobilien/ort-...,CHF 3’225.– / Monat,2.5 Zimmer,72m²,"Akazienstr. 2, 8008 Zürich",Attikawohnung im Seefeld!,,,
11,1763742793-857,https://www.homegate.ch/mieten/immobilien/ort-...,CHF 3’536.– / Monat,3.5 Zimmer,88m²,"Seefeldstr. 96, 8008 Zürich",Moderne Wohnung im Seefeld mit Balkon!,,,


## Extract and save relevant information from raw data using regular expressions (regex)

In [78]:
# --- Data Extraction / Cleaning ---
# This code block must be run AFTER the df DataFrame is successfully loaded.

# 1. Rooms: Extract number before "Zimmer" (using the more robust regex)
df['rooms'] = df['rooms_raw'].astype(str).str.extract(r'([\d\.,]+)\s*Zimmer')
df['rooms'] = df['rooms'].str.replace(',', '.').astype(float)

# 2. Rent: Extract digits after "CHF"
df['rent'] = df['rent_raw'].astype(str).str.extract(r'CHF\s+([0-9\’\']+)')
df['rent'] = df['rent'].str.replace(r"[’']", "", regex=True).astype(float)

# 3. Area: Extract digits before "m²"
df['area'] = df['area_raw'].astype(str).str.extract(r'(\d+)\s*m²').astype(float)


# Show the cleaned DataFrame with new columns
print("Success! Data loaded and variables extracted.")
print(df[['rooms', 'rent', 'area', 'address_raw']].head())

Success! Data loaded and variables extracted.
   rooms    rent   area                        address_raw
0    1.0   990.0   23.0        Baurstrasse 29, 8008 Zürich
1    2.5  1850.0   74.0  Imbisbühlstrasse 120, 8049 Zürich
2    3.5  5490.0   94.0  Mühlebachstrasse 121, 8008 Zürich
3    3.5  4075.0  101.0       Josefstrasse 23, 8005 Zürich
4    3.5  4095.0  101.0       Josefstrasse 23, 8005 Zürich


### Extract the data with missing values

In [87]:
# Define the list of mandatory columns (the subset)
mandatory_columns = ['rent', 'rooms', 'area', 'address_raw']

# Drop rows where ANY value in the mandatory_columns is missing (NaN/None)
df_final = df.dropna(subset=mandatory_columns)

# Optional: Print a summary of the action
print(f"Original rows: {len(df)}")
print(f"Rows retained after dropping missing values: {len(df_final)}")

Original rows: 1055
Rows retained after dropping missing values: 439


## Create additional variables from the apartment's descriptions

## Change strings in to uppercase

In [98]:
# Change strings to uppercase 
df_final['description_raw'] = df_final['description_raw'].str.upper()
print(df_final['description_raw'].head(10), '\n')

df_final['text_raw'] = df_final['text_raw'].str.upper()
print(df_final['text_raw'].head(10))

0       COCOON COMFORT – COSY STUDIO SERVICED APARTMENT
1                                                   NaN
2                           SIENA - NATÜRLICH. ZUHAUSE.
3                MODERNES WOHNEN DIREKT AM HAUPTBAHNHOF
4     ZENTRALER WOHNTRAUM - CHARMANTES LEBEN IN DER ...
5     WILLKOMMEN IM NEUEN HOTSPOT FÜR URBANES LEBEN ...
8                                                   NaN
9     2 ZIMMERWOHNUNG MIT SCHWIMMBAD- UND SAUNAZUGAN...
10                            ATTIKAWOHNUNG IM SEEFELD!
11               MODERNE WOHNUNG IM SEEFELD MIT BALKON!
Name: description_raw, dtype: object 

0     COSY AND SMART! – KOMPAKT, ABER NICHT KLEINLIC...
1                                                   NaN
2     JETZT EINZIEHEN &AMP\n1763742793-848,HTTPS://W...
3     PER 01.10. ODER NACH VEREINBARUNG VERMIETEN WI...
4     PER 01.10. ODER NACH VEREINBARUNG VERMIETEN WI...
5     URBAN. INTERNATIONAL. INDIVIDUELL. WILLKOMMEN ...
8                                                   NaN
9        

### Create new variables

In [100]:
# Define classes (labels)
labels = ['0 - 49', '50 - 99', '100 - 9999']

# Use the .cut method from pandas to divide the numeric values in classes
df_final["area_cat"] = pd.cut(df_final['area'], bins=[0, 50, 100, 10000], labels=labels)

# Show original data and classes
df_final[['area', 'area_cat']]

Unnamed: 0,area,area_cat
0,23.0,0 - 49
1,74.0,50 - 99
2,94.0,50 - 99
3,101.0,100 - 9999
4,101.0,100 - 9999
...,...,...
1030,90.0,50 - 99
1031,100.0,50 - 99
1032,100.0,50 - 99
1033,82.0,50 - 99


In [101]:
# Create the new variable
df_final['price_per_m2'] = round(df_final['rent'] / df_final['area'], 2)

# Show values
df_final[['description_raw','rooms','area','rent','price_per_m2']]

# The variable should contain three levels ‘low’, ‘medium’, ‘high’
labels = ['low', 'medium', 'high']

df_final["price_per_m2_cat"] = pd.cut(df_final['price_per_m2'], bins=[0, 30, 60, 10000], labels=labels)
# Show values
df_final[['description_raw','rooms','area','rent','price_per_m2','price_per_m2_cat']]

Unnamed: 0,description_raw,rooms,area,rent,price_per_m2,price_per_m2_cat
0,COCOON COMFORT – COSY STUDIO SERVICED APARTMENT,1.0,23.0,990.0,43.04,medium
1,,2.5,74.0,1850.0,25.00,low
2,SIENA - NATÜRLICH. ZUHAUSE.,3.5,94.0,5490.0,58.40,medium
3,MODERNES WOHNEN DIREKT AM HAUPTBAHNHOF,3.5,101.0,4075.0,40.35,medium
4,ZENTRALER WOHNTRAUM - CHARMANTES LEBEN IN DER ...,3.5,101.0,4095.0,40.54,medium
...,...,...,...,...,...,...
1030,,3.5,90.0,2700.0,30.00,low
1031,URBAN WOHNEN - NACHHALTIG LEBEN: IHR NEUES ZUH...,4.5,100.0,4650.0,46.50,medium
1032,URBAN WOHNEN - NACHHALTIG LEBEN: IHR NEUES ZUH...,4.5,100.0,4690.0,46.90,medium
1033,URBAN WOHNEN - NACHHALTIG LEBEN: IHR NEUES ZUH...,1.5,82.0,3830.0,46.71,medium


## Including current datetime

In [106]:
# Get and format datetime
df_final['datetime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Show values
df_final[['description_raw','rooms','area','rent','price_per_m2', 'datetime']]

Unnamed: 0,description_raw,rooms,area,rent,price_per_m2,datetime
0,COCOON COMFORT – COSY STUDIO SERVICED APARTMENT,1.0,23.0,990.0,43.04,2025-11-28 11:21:17
1,,2.5,74.0,1850.0,25.00,2025-11-28 11:21:17
2,SIENA - NATÜRLICH. ZUHAUSE.,3.5,94.0,5490.0,58.40,2025-11-28 11:21:17
3,MODERNES WOHNEN DIREKT AM HAUPTBAHNHOF,3.5,101.0,4075.0,40.35,2025-11-28 11:21:17
4,ZENTRALER WOHNTRAUM - CHARMANTES LEBEN IN DER ...,3.5,101.0,4095.0,40.54,2025-11-28 11:21:17
...,...,...,...,...,...,...
1030,,3.5,90.0,2700.0,30.00,2025-11-28 11:21:17
1031,URBAN WOHNEN - NACHHALTIG LEBEN: IHR NEUES ZUH...,4.5,100.0,4650.0,46.50,2025-11-28 11:21:17
1032,URBAN WOHNEN - NACHHALTIG LEBEN: IHR NEUES ZUH...,4.5,100.0,4690.0,46.90,2025-11-28 11:21:17
1033,URBAN WOHNEN - NACHHALTIG LEBEN: IHR NEUES ZUH...,1.5,82.0,3830.0,46.71,2025-11-28 11:21:17


### Get data types

In [107]:
df.dtypes

web_scraper_order          object
web_scraper_start_url      object
rent_raw                   object
rooms_raw                  object
area_raw                   object
address_raw                object
description_raw            object
text_raw                   object
walkingtime_raw            object
nearest_statioon_raw       object
rooms                     float64
rent                      float64
area                      float64
central_location          float64
area_cat                 category
price_per_m2              float64
price_per_m2_cat         category
dtype: object

### Save data to file

In [108]:
df.to_csv('./properties_data_prepared_zuerich.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [109]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.8.0-1030-azure
Datetime: 2025-11-28 12:00:29
Python Version: 3.11.14
-----------------------------------
