# Importing and preparing rental apartments data

## Libraries and settings

In [1]:
# Libraries
import os
import re
import fnmatch
import datetime
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Show current working directory
print(os.getcwd())

/workspaces/data_analytics/Week_03


## Importing data

In [2]:
# Get current working directory
print(os.getcwd())

# Show all files in the directory
flist = fnmatch.filter(os.listdir('.'), '*.csv')
for i in flist:
    print(i)

# Read the data to a pandas data frame
df = pd.read_csv('apartments_data_zuerich.csv', sep=',', encoding='utf-8')

# Get number of rows and columns
df.shape

/workspaces/data_analytics/Week_03
apartments_data_zuerich.csv
apartments_data_geocoded.csv
apartments_data_prepared.csv
apartments_data_enriched.csv


(1008, 7)

## Count number of rows and columns in the data frame

In [3]:
# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows
print('Number of rows:', df.shape[0])

# Number of columns
print('Number of columns:', df.shape[1])

Dimension: (1008, 7)
Number of rows: 1008
Number of columns: 7


## Get data types (raw-format from web scraping)

In [4]:
# Get data types (note that in pandas, a string is referred to as 'object')
df.dtypes

web-scraper-order        object
web-scraper-start-url    object
rooms_area_price_raw     object
address_raw              object
price_raw                object
description_raw          object
text_raw                 object
dtype: object

## Extract and save relevant information from raw data using regular expressions (regex)

### Extract number of rooms

In [5]:
# Extract values from 'rooms_area_price_raw' strings
rooms = []
for i in df['rooms_area_price_raw']:
    d1 = re.findall('(.*)Zimmer', i)
    try:
        d2 = d1[0].strip().replace(',', '.')
    except:
        d2 = None
    rooms.append(d2)

# Save as new variable in the pandas data frame
df['rooms'] = pd.Series(rooms, dtype="float64")
    
# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['rooms'].head(5), '\n')

0    3,5 Zimmer, 122 m², CHF 3180.—
1     2,5 Zimmer, 78 m², CHF 3760.—
2    5,5 Zimmer, 115 m², CHF 2860.—
3     3,5 Zimmer, 74 m², CHF 2165.—
4    5,5 Zimmer, 195 m², CHF 6900.—
Name: rooms_area_price_raw, dtype: object 

0    3.5
1    2.5
2    5.5
3    3.5
4    5.5
Name: rooms, dtype: float64 



### Extract living area

In [6]:
# Extract values from 'rooms_area_price_raw' strings
area = []
for i in df['rooms_area_price_raw']:
    d1 = re.findall('Zimmer, (.*)m²', i)
    try:
        d2 = d1[0].strip()
    except:
        d2 = None
    area.append(d2)

# Save as new variable in the pandas data frame
df['area'] = pd.Series(area, dtype="float64")

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['area'].head(5), '\n')

0    3,5 Zimmer, 122 m², CHF 3180.—
1     2,5 Zimmer, 78 m², CHF 3760.—
2    5,5 Zimmer, 115 m², CHF 2860.—
3     3,5 Zimmer, 74 m², CHF 2165.—
4    5,5 Zimmer, 195 m², CHF 6900.—
Name: rooms_area_price_raw, dtype: object 

0    122.0
1     78.0
2    115.0
3     74.0
4    195.0
Name: area, dtype: float64 



### Extract rental price

In [7]:
# Extract values from 'price_raw' strings
price = []
for i in df['price_raw']:
    d1 = re.findall('[0-9]+', i)
    try:
        d2 = d1[0].strip()
    except:
        d2 = None
    price.append(d2)

# Save as new variable in the pandas data frame
df['price'] = pd.Series(price, dtype="float64")

# Print first 5 values
print(df['price_raw'].head(), '\n')
print(df['price'].head())

0    CHF 3180.—
1    CHF 3760.—
2    CHF 2860.—
3    CHF 2165.—
4    CHF 6900.—
Name: price_raw, dtype: object 

0    3180.0
1    3760.0
2    2860.0
3    2165.0
4    6900.0
Name: price, dtype: float64


## Create additional variables from the apartment's descriptions

### Change strings in 'description_raw' ad 'text_raw' to uppercase 

In [8]:
# Change strings to uppercase 
df['description_raw'] = df['description_raw'].str.upper()
print(df['description_raw'].head(10), '\n')

df['text_raw'] = df['text_raw'].str.upper()
print(df['text_raw'].head(10))

0    «GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- U...
1              «WUNDERSCHÖNE WOHNUNG IM ENGE-QUARTIER»
2                         «WOHNMOMENTE ZUM FESTHALTEN»
3                                  «3,5 PIÈCES, 74 M²»
4    «WOHNANLAGE IM PARKRING - EXKLUSIVE WOHNUNG ZU...
5     «OHNE MIETKAUTION - SUPER RUHIGE LÄNDLICHE LAGE»
6    «ERSTVERMIETUNG AM ZÜRICHBERG: CHARMANTE 2.5-Z...
7    «ERSTVERMIETUNG NACH TOTAL-SANIERUNG: 3-ZIMMER...
8              «NEUES ZUHAUSE FÜR SIE UND IHRE KINDER»
9       «MODERN, HELL MIT PANORAMASICHT ÜBERS GLATTAL»
Name: description_raw, dtype: object 

0    3,5 ZIMMER, 122 M², CHF 3180.—SUNNENBERGSTRASS...
1    2,5 ZIMMER, 78 M², CHF 3760.—LAVATERSTR. 63, 8...
2    5,5 ZIMMER, 115 M², CHF 2860.—LANGFURRENSTRASS...
3    TOP3,5 ZIMMER, 74 M², CHF 2165.—SANDBUCKWEG 5A...
4    5,5 ZIMMER, 195 M², CHF 6900.—PARKRING 59, 800...
5    2 ZIMMER, 47 M², CHF 1400.—IFANGWEG 1, 8610 US...
6    2,5 ZIMMER, 59 M², CHF 2920.—FLOBOTSTRASSE 2, ...
7    3 ZIMMER, 75 M², CHF 

### Calculate lenght of strings in 'description_raw' and 'text_raw'

In [9]:
# Show first item of variable 'description_raw'
print(df['description_raw'][0])

# Lenght of the strings in 'description_raw'
df['description_raw_len'] = df['description_raw'].str.len()
print(df['description_raw_len'], '\n')

# Show first item of variable 'text_raw'
print(df['text_raw'][0])

# Lenght of the strings in 'text_raw'
df['text_raw_len'] = df['text_raw'].str.len()
print(df['text_raw_len'])

«GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- UND SEESICHT»
0       58
1       39
2       28
3       19
4       57
        ..
1003    33
1004    33
1005    51
1006    41
1007    58
Name: description_raw_len, Length: 1008, dtype: int64 

3,5 ZIMMER, 122 M², CHF 3180.—SUNNENBERGSTRASSE 15, 8633 WOLFHAUSEN, ZH«GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- UND SEESICHT»IN WOLFHAUSEN, EINGEBETTET IN DIE SANFTEN HÜGEL ÜBER DEM ZÜRICHSEE VERMIETEN WIR PER 1. OKTOBER 2022 DIESE MAISONETTE-DACHWOHNUNG, WELCHE MIT FOLGENDER AUSSTATTUNG ÜBERZEUGT:GROSSE TERRASSE MIT PERGOLA, SEE-/WEITSICHT UND VIEL SONNENSCHEIN GARANTIERTLICHTDURCHFLUTETE GALERIESCHLAFZIMMER MIT ZUGANG INS BAD MIT DUSCHE UND WCEIN WEITERES SCHLAFZIMMER MIT EINEM EINBAUSCHRANKSEPARATE NASSZELLE MIT ECKBADEWANNE UND WCTOPMODERNE KÜCHE MIT SÄMTLICHEN KOMFORTOFFENES WOHN- UND ESSZIMMERDIE NASSZELLEN UND DIE KÜCHE SIND MIT FEINSTEINZEUGPLATTEN UND DER RE
0       679
1       661
2       650
3       635
4       665
       ... 
1003    645


### Create new binary (0/1) variable 'luxurious'

In [10]:
# Create a pattern which can be used to search the variable 'description_raw'
pattern = '(LOFT) |(SEESICHT)'

# Create new variable 'luxurious' as binary dummy (0/1) variable
df['luxurious'] = df['description_raw'].str.contains(pat = pattern).astype(int)
print(df['luxurious'].sum())

# Show values
df[['description_raw','rooms','area','price','luxurious']]

19


Unnamed: 0,description_raw,rooms,area,price,luxurious
0,"«GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- U...",3.5,122.0,3180.0,1
1,«WUNDERSCHÖNE WOHNUNG IM ENGE-QUARTIER»,2.5,78.0,3760.0,0
2,«WOHNMOMENTE ZUM FESTHALTEN»,5.5,115.0,2860.0,0
3,"«3,5 PIÈCES, 74 M²»",3.5,74.0,2165.0,0
4,«WOHNANLAGE IM PARKRING - EXKLUSIVE WOHNUNG ZU...,5.5,195.0,6900.0,0
...,...,...,...,...,...
1003,«TRAUMHAFTE LOFTWOHNUNG GESUCHT?»,1.5,65.0,2470.0,0
1004,"«AN SONNIGER LAGE, RICHTUNG WALD»",3.5,70.0,1465.0,0
1005,«IHRE WOHNUNG AN RUHIGER LAGE - BEFRISTETES WO...,2.5,56.0,1870.0,0
1006,«MODERNE 3.5-ZIMMERWOHNUNG IN ALTSTETTEN»,3.5,70.0,2190.0,0


In [11]:
# Create a pattern which can be used to search the variable 'description_raw'
pattern = 'MÖBLIERT'

# Create new variable 'luxurious' as binary dummy (0/1) variable
df['moebliert'] = df['description_raw'].str.contains(pat = pattern).astype(int)
print(df['moebliert'].sum())

# Show values
df[['description_raw','rooms','area','price','luxurious', 'moebliert']]

36


Unnamed: 0,description_raw,rooms,area,price,luxurious,moebliert
0,"«GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- U...",3.5,122.0,3180.0,1,0
1,«WUNDERSCHÖNE WOHNUNG IM ENGE-QUARTIER»,2.5,78.0,3760.0,0,0
2,«WOHNMOMENTE ZUM FESTHALTEN»,5.5,115.0,2860.0,0,0
3,"«3,5 PIÈCES, 74 M²»",3.5,74.0,2165.0,0,0
4,«WOHNANLAGE IM PARKRING - EXKLUSIVE WOHNUNG ZU...,5.5,195.0,6900.0,0,0
...,...,...,...,...,...,...
1003,«TRAUMHAFTE LOFTWOHNUNG GESUCHT?»,1.5,65.0,2470.0,0,0
1004,"«AN SONNIGER LAGE, RICHTUNG WALD»",3.5,70.0,1465.0,0,0
1005,«IHRE WOHNUNG AN RUHIGER LAGE - BEFRISTETES WO...,2.5,56.0,1870.0,0,0
1006,«MODERNE 3.5-ZIMMERWOHNUNG IN ALTSTETTEN»,3.5,70.0,2190.0,0,0


### Create price_per_m2

In [12]:
df['price_per_m2'] = df['price'] / df['area']
df['price_per_m2']

0       26.065574
1       48.205128
2       24.869565
3       29.256757
4       35.384615
          ...    
1003    38.000000
1004    20.928571
1005    33.392857
1006    31.285714
1007    23.541667
Name: price_per_m2, Length: 1008, dtype: float64

### Create new categorical variable based on apartment area

In [13]:
labels = ['0 - 49', '50 - 99', '100 - 500']

df["area_cat"] = pd.cut(df['area'], bins=[0, 50, 100, 10000], labels=labels)


df[['area', 'area_cat']].head(10)

Unnamed: 0,area,area_cat
0,122.0,100 - 500
1,78.0,50 - 99
2,115.0,100 - 500
3,74.0,50 - 99
4,195.0,100 - 500
5,47.0,0 - 49
6,59.0,50 - 99
7,75.0,50 - 99
8,97.0,50 - 99
9,124.0,100 - 500


In [14]:
labels = ['0 - 19', '20 - 29', '>30']

df["price_cat"] = pd.cut(df['price_per_m2'], bins=[0, 20, 30, 100], labels=labels)
df[['price_per_m2', 'price_cat']].head(10)

Unnamed: 0,price_per_m2,price_cat
0,26.065574,20 - 29
1,48.205128,>30
2,24.869565,20 - 29
3,29.256757,20 - 29
4,35.384615,>30
5,29.787234,20 - 29
6,49.491525,>30
7,40.04,>30
8,18.247423,0 - 19
9,35.967742,>30


### Create new numeric variable 'price_per_m2'

In [15]:
# Create the new variable
df['price_per_m2'] = round(df['price'] / df['area'], 2)

# Show values
df[['description_raw','rooms','area','price','luxurious','price_per_m2']]

Unnamed: 0,description_raw,rooms,area,price,luxurious,price_per_m2
0,"«GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- U...",3.5,122.0,3180.0,1,26.07
1,«WUNDERSCHÖNE WOHNUNG IM ENGE-QUARTIER»,2.5,78.0,3760.0,0,48.21
2,«WOHNMOMENTE ZUM FESTHALTEN»,5.5,115.0,2860.0,0,24.87
3,"«3,5 PIÈCES, 74 M²»",3.5,74.0,2165.0,0,29.26
4,«WOHNANLAGE IM PARKRING - EXKLUSIVE WOHNUNG ZU...,5.5,195.0,6900.0,0,35.38
...,...,...,...,...,...,...
1003,«TRAUMHAFTE LOFTWOHNUNG GESUCHT?»,1.5,65.0,2470.0,0,38.00
1004,"«AN SONNIGER LAGE, RICHTUNG WALD»",3.5,70.0,1465.0,0,20.93
1005,«IHRE WOHNUNG AN RUHIGER LAGE - BEFRISTETES WO...,2.5,56.0,1870.0,0,33.39
1006,«MODERNE 3.5-ZIMMERWOHNUNG IN ALTSTETTEN»,3.5,70.0,2190.0,0,31.29


### Including current datetime

In [16]:
# Get and format datetime
df['datetime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Show values
df[['description_raw','rooms','area','price','luxurious','price_per_m2', 'datetime']]

Unnamed: 0,description_raw,rooms,area,price,luxurious,price_per_m2,datetime
0,"«GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- U...",3.5,122.0,3180.0,1,26.07,2023-08-21 09:03:47
1,«WUNDERSCHÖNE WOHNUNG IM ENGE-QUARTIER»,2.5,78.0,3760.0,0,48.21,2023-08-21 09:03:47
2,«WOHNMOMENTE ZUM FESTHALTEN»,5.5,115.0,2860.0,0,24.87,2023-08-21 09:03:47
3,"«3,5 PIÈCES, 74 M²»",3.5,74.0,2165.0,0,29.26,2023-08-21 09:03:47
4,«WOHNANLAGE IM PARKRING - EXKLUSIVE WOHNUNG ZU...,5.5,195.0,6900.0,0,35.38,2023-08-21 09:03:47
...,...,...,...,...,...,...,...
1003,«TRAUMHAFTE LOFTWOHNUNG GESUCHT?»,1.5,65.0,2470.0,0,38.00,2023-08-21 09:03:47
1004,"«AN SONNIGER LAGE, RICHTUNG WALD»",3.5,70.0,1465.0,0,20.93,2023-08-21 09:03:47
1005,«IHRE WOHNUNG AN RUHIGER LAGE - BEFRISTETES WO...,2.5,56.0,1870.0,0,33.39,2023-08-21 09:03:47
1006,«MODERNE 3.5-ZIMMERWOHNUNG IN ALTSTETTEN»,3.5,70.0,2190.0,0,31.29,2023-08-21 09:03:47


## Count, identify and remove missing values

In [17]:
# Count missing values
print('Count missing values per variable')
print(pd.isna(df).sum(), '\n')

# Identify rows with missing values
print('Identify rows with missing values')
print(df.loc[df.isna().any(axis=1)][['rooms', 'area', 'price']], '\n')

# Drop rows where at least one element is missing.
df2 = df.dropna()
df2.head()

Count missing values per variable
web-scraper-order         0
web-scraper-start-url     0
rooms_area_price_raw      0
address_raw               0
price_raw                 0
description_raw           0
text_raw                  0
rooms                    10
area                     86
price                     8
description_raw_len       0
text_raw_len              0
luxurious                 0
moebliert                 0
price_per_m2             92
area_cat                 86
price_cat                94
datetime                  0
dtype: int64 

Identify rows with missing values
     rooms  area   price
36     5.0   NaN  2495.0
45     3.0   NaN  1250.0
50     3.0   NaN  1850.0
58     2.0   NaN  1700.0
67     NaN   NaN  1500.0
..     ...   ...     ...
950    5.5   NaN  2100.0
952    4.5   NaN  1900.0
958    2.5  67.0     NaN
979    3.5  86.0     NaN
991    4.5   NaN  1650.0

[94 rows x 3 columns] 



Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw,rooms,area,price,description_raw_len,text_raw_len,luxurious,moebliert,price_per_m2,area_cat,price_cat,datetime
0,1662023695-433,https://www.immoscout24.ch/de/wohnung/mieten/k...,"3,5 Zimmer, 122 m², CHF 3180.—","Sunnenbergstrasse 15, 8633 Wolfhausen, ZH",CHF 3180.—,"«GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- U...","3,5 ZIMMER, 122 M², CHF 3180.—SUNNENBERGSTRASS...",3.5,122.0,3180.0,58,679,1,0,26.07,100 - 500,20 - 29,2023-08-21 09:03:47
1,1662023745-820,https://www.immoscout24.ch/de/wohnung/mieten/k...,"2,5 Zimmer, 78 m², CHF 3760.—","Lavaterstr. 63, 8002 Zürich, ZH",CHF 3760.—,«WUNDERSCHÖNE WOHNUNG IM ENGE-QUARTIER»,"2,5 ZIMMER, 78 M², CHF 3760.—LAVATERSTR. 63, 8...",2.5,78.0,3760.0,39,661,0,0,48.21,50 - 99,>30,2023-08-21 09:03:47
2,1662023742-807,https://www.immoscout24.ch/de/wohnung/mieten/k...,"5,5 Zimmer, 115 m², CHF 2860.—","Langfurrenstrasse 5c, 8623 Wetzikon ZH, ZH",CHF 2860.—,«WOHNMOMENTE ZUM FESTHALTEN»,"5,5 ZIMMER, 115 M², CHF 2860.—LANGFURRENSTRASS...",5.5,115.0,2860.0,28,650,0,0,24.87,100 - 500,20 - 29,2023-08-21 09:03:47
3,1662023804-1290,https://www.immoscout24.ch/de/wohnung/mieten/k...,"3,5 Zimmer, 74 m², CHF 2165.—","Sandbuckweg 5A, 8157 Dielsdorf, ZH",CHF 2165.—,"«3,5 PIÈCES, 74 M²»","TOP3,5 ZIMMER, 74 M², CHF 2165.—SANDBUCKWEG 5A...",3.5,74.0,2165.0,19,635,0,0,29.26,50 - 99,20 - 29,2023-08-21 09:03:47
4,1662023739-771,https://www.immoscout24.ch/de/wohnung/mieten/k...,"5,5 Zimmer, 195 m², CHF 6900.—","Parkring 59, 8002 Zürich, ZH",CHF 6900.—,«WOHNANLAGE IM PARKRING - EXKLUSIVE WOHNUNG ZU...,"5,5 ZIMMER, 195 M², CHF 6900.—PARKRING 59, 800...",5.5,195.0,6900.0,57,665,0,0,35.38,100 - 500,>30,2023-08-21 09:03:47


## Count, identify & remove duplicated values

In [18]:
# Count duplicated values in the whole data set
print('Sum of missing values:', df.duplicated().sum(), '\n')

# Identify duplicated values in 'rooms', 'area', 'price'
print('Duplicated values')
print(df.loc[df.duplicated(keep = 'last')])

# Drop the rows with duplicated values
df3 = df2.drop_duplicates()

Sum of missing values: 0 

Duplicated values
Empty DataFrame
Columns: [web-scraper-order, web-scraper-start-url, rooms_area_price_raw, address_raw, price_raw, description_raw, text_raw, rooms, area, price, description_raw_len, text_raw_len, luxurious, moebliert, price_per_m2, area_cat, price_cat, datetime]
Index: []


### Save data to file

In [19]:
df3.to_csv('apartments_data_prepared.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [20]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 5.15.0-1041-azure
Datetime: 2023-08-21 09:03:47
Python Version: 3.10.8
-----------------------------------
