# Advanced Data Preparation with Python (Apartment Data)

## Libraries and settings

In [1]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt

# Suppress all warnings so they do not clutter the cell outputs
import warnings
warnings.filterwarnings(action="ignore")

# Show the directory that relative paths (e.g. './Data/...') are resolved against
current_dir = os.getcwd()
print(current_dir)

/workspaces/python_data_preparation


## Importing data

In [2]:
# Load the scraped apartment listings (comma-separated, UTF-8 encoded)
# into a pandas data frame
csv_options = {'sep': ',', 'encoding': 'utf-8'}
df = pd.read_csv('./Data/apartments_data_zuerich.csv', **csv_options)

# Preview the first five records
df.head()

Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw
0,1693998201-1,https://www.immoscout24.ch/de/immobilien/miete...,"3 Zimmer, 49 m², CHF 1441.—","Neuhusstrasse 6, 8630 Rüti ZH, ZH",CHF 1441.—,«Gemütliche Wohnung im Grünen»,"3 Zimmer, 49 m², CHF 1441.—Neuhusstrasse 6, 86..."
1,1693998201-2,https://www.immoscout24.ch/de/immobilien/miete...,"3,5 Zimmer, 65 m², CHF 1850.—","Zürcherstrasse 1, 8173 Neerach, ZH",CHF 1850.—,«Attraktive 3.5-Zimmer-EG-Wohnung in Neerach»,"3,5 Zimmer, 65 m², CHF 1850.—Zürcherstrasse 1,..."
2,1693998201-3,https://www.immoscout24.ch/de/immobilien/miete...,"19 m², CHF 2686.—","Cramerstrasse 8-12, 8004 Zürich, ZH",CHF 2686.—,«Studio Apartment Junior Balcony»,"19 m², CHF 2686.—Cramerstrasse 8-12, 8004 Züri..."
3,1693998201-4,https://www.immoscout24.ch/de/immobilien/miete...,"2 Zimmer, 54 m², CHF 4853.—","Cramerstrasse 8-12, 8004 Zürich, ZH",CHF 4853.—,«2 Bedroom Apartment Senior Balcony»,"2 Zimmer, 54 m², CHF 4853.—Cramerstrasse 8-12,..."
4,1693998201-5,https://www.immoscout24.ch/de/immobilien/miete...,"2 Zimmer, 49 m², CHF 4335.—","Rotachstrasse 33, 8003 Zürich, ZH",CHF 4335.—,«2 Bedroom Apartment Junior Terrace»,"2 Zimmer, 49 m², CHF 4335.—Rotachstrasse 33, 8..."


## Count number of rows and columns in the data frame

In [3]:
# df.shape is a tuple of length 2: (number of rows, number of columns)
n_rows, n_cols = df.shape

# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows and number of columns, taken from the unpacked tuple
print('Number of rows:', n_rows)
print('Number of columns:', n_cols)

Dimension: (1008, 7)
Number of rows: 1008
Number of columns: 7


## Get data types (raw-format from web scraper)

In [4]:
# Show the data type of every column of the freshly imported data frame
# (note that in pandas, a string is referred to as 'object')
df.dtypes

web-scraper-order        object
web-scraper-start-url    object
rooms_area_price_raw     object
address_raw              object
price_raw                object
description_raw          object
text_raw                 object
dtype: object

## Extract information from raw data using regular expressions (regex)

### Extract number of rooms

In [5]:
# Extract the number of rooms from the 'rooms_area_price_raw' strings.
# The pattern is written as a raw string so that '\d' and '\s' reach the regex
# engine unchanged (in a normal string they are invalid escape sequences).
# It captures an integer ('3') or a comma-decimal number ('3,5') that is
# followed by the word 'Zimmer'.
rooms_pattern = re.compile(r"(\d+,\d+|\d+)\s*Zimmer")

rooms = []
for raw_text in df['rooms_area_price_raw']:
    # Non-string cells (e.g. NaN) and listings without a room count
    # (e.g. '19 m², CHF 2686.—') result in a missing value
    match = rooms_pattern.search(raw_text) if isinstance(raw_text, str) else None
    # Replace the decimal comma by a decimal point before converting to float
    rooms.append(float(match.group(1).replace(',', '.')) if match else None)

# Save as new variable in the pandas data frame
df['rooms'] = pd.Series(rooms, dtype="float64")

# Show first records of data frame
df[['rooms_area_price_raw', 'rooms']].head()


Unnamed: 0,rooms_area_price_raw,rooms
0,"3 Zimmer, 49 m², CHF 1441.—",3.0
1,"3,5 Zimmer, 65 m², CHF 1850.—",3.5
2,"19 m², CHF 2686.—",
3,"2 Zimmer, 54 m², CHF 4853.—",2.0
4,"2 Zimmer, 49 m², CHF 4335.—",2.0


### Extract living area

In [6]:
# Extract the living area from the 'rooms_area_price_raw' strings.
# The raw-string pattern captures the digits that directly precede 'm²'
# (optionally separated by whitespace), e.g. '49' in '49 m²'.
area_pattern = re.compile(r"(\d+)\s*m²")

area = []
for raw_text in df['rooms_area_price_raw']:
    # Non-string cells (e.g. NaN) and strings without an area yield a missing value
    match = area_pattern.search(raw_text) if isinstance(raw_text, str) else None
    area.append(int(match.group(1)) if match else None)

# Save as new variable in the pandas data frame
# (nullable 'Int64' keeps integers while still allowing missing values)
df['area'] = pd.Series(area, dtype="Int64")

# Show first records of data frame
df[['rooms_area_price_raw', 'area']].head()

Unnamed: 0,rooms_area_price_raw,area
0,"3 Zimmer, 49 m², CHF 1441.—",49
1,"3,5 Zimmer, 65 m², CHF 1850.—",65
2,"19 m², CHF 2686.—",19
3,"2 Zimmer, 54 m², CHF 4853.—",54
4,"2 Zimmer, 49 m², CHF 4335.—",49


### Extract rental price

In [7]:
# Extract the rental price from the 'price_raw' strings.
# The raw-string pattern captures the digits between 'CHF' and the
# trailing '.—', e.g. '1441' in 'CHF 1441.—'.
price_pattern = re.compile(r"CHF\s*(\d+)\.—")

price = []
for raw_text in df['price_raw']:
    # Non-string cells (e.g. NaN) and strings without a matching price
    # yield a missing value
    match = price_pattern.search(raw_text) if isinstance(raw_text, str) else None
    price.append(int(match.group(1)) if match else None)

# Save as new variable in the pandas data frame
# (nullable 'Int64' keeps integers while still allowing missing values)
df['price'] = pd.Series(price, dtype="Int64")

# Show first records of data frame
df[['rooms_area_price_raw', 'price']].head()

Unnamed: 0,rooms_area_price_raw,price
0,"3 Zimmer, 49 m², CHF 1441.—",1441
1,"3,5 Zimmer, 65 m², CHF 1850.—",1850
2,"19 m², CHF 2686.—",2686
3,"2 Zimmer, 54 m², CHF 4853.—",4853
4,"2 Zimmer, 49 m², CHF 4335.—",4335


### Get data types of all variables, including the new once

In [8]:
# List the data types again, now including the new columns 'rooms', 'area' and 'price'
df.dtypes

web-scraper-order         object
web-scraper-start-url     object
rooms_area_price_raw      object
address_raw               object
price_raw                 object
description_raw           object
text_raw                  object
rooms                    float64
area                       Int64
price                      Int64
dtype: object

## Count and remove missing values

In [9]:
# Total number of missing cells over all rows and columns
n_missing = df.isna().sum().sum()
print(f"Number of missing values: {n_missing}")

# Keep only complete rows and renumber the index from 0
df = df.dropna()
df = df.reset_index(drop=True)

Number of missing values: 157


## Count and remove duplicated values

In [None]:
# Count duplicated records, identified by the web scraper's record id.
# duplicated() flags every repeated occurrence of an id (the first one is kept),
# so the sum of the boolean mask is the number of duplicates.
n_duplicates = df.duplicated(subset=['web-scraper-order']).sum()
print(f"Number of duplicated values: {n_duplicates}")

# Drop rows with duplicated values (if any).
# Note: without 'subset', only rows that are identical in ALL columns are removed.
df = df.drop_duplicates().reset_index(drop=True)

# Dimension (rows, columns)
print('\nDimension:', df.shape)

## Use string manipulation methods to create additional variables from the apartment descriptions.

### Change strings in 'description_raw' ad 'text_raw' to uppercase 

In [None]:
# Convert both free-text columns to uppercase so that later keyword
# searches do not depend on the original capitalisation
for text_column in ('description_raw', 'text_raw'):
    df[text_column] = df[text_column].str.upper()

# Show the first five uppercase values of each column
print('Description:\n', df['description_raw'].head(5), '\n')
print('Text:\n', df['text_raw'].head(5))

### Calculate length of strings in 'description_raw' and 'text_raw'

In [None]:
# Store the number of characters of each text column in '<column>_len'
for text_column in ('description_raw', 'text_raw'):
    df[text_column + '_len'] = df[text_column].str.len()

# One histogram per length variable, side by side
length_columns = ['description_raw_len', 'text_raw_len']
ax = df[length_columns].hist(bins=25, figsize=(7, 3), color='steelblue')

# Use the same y-axis range (0 to 300) in every subplot
for subplot in ax.flatten():
    subplot.set_ylim(0, 300)

# Display the plot
plt.show()

### Create binary (0/1) variable 'luxurious'

In [None]:
# Keywords (separated by the regex alternation '|') that mark a listing as luxurious
pattern = 'LOFT|SEESICHT|PENTHOUSE|LUXUS'

# True where the description contains at least one keyword,
# stored as binary dummy (0/1) variable 'luxurious'
has_keyword = df['description_raw'].str.contains(pat=pattern)
df['luxurious'] = has_keyword.astype(int)

# Number of luxurious apartments
print(df['luxurious'].sum())

# Show the luxurious apartments only
shown_columns = ['description_raw', 'rooms', 'area', 'price', 'luxurious']
df.loc[df['luxurious'] == 1, shown_columns]

### Create variable 'price_per_m2'

In [None]:
# Price per square metre, rounded to two decimals
price_per_area = df['price'] / df['area']
df['price_per_m2'] = price_per_area.round(2)

# Show values
shown_columns = ['description_raw', 'rooms', 'area', 'price', 'luxurious', 'price_per_m2']
df[shown_columns]

### Include current datetime as time stamp

In [None]:
# Current date and time as text; the same stamp is written to every row
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
df['datetime'] = timestamp

# Show values
shown_columns = ['description_raw', 'rooms', 'area', 'price',
                 'luxurious', 'price_per_m2', 'datetime']
df[shown_columns]

## Discretization of numerical data

In [None]:
# Category labels, ordered from the lowest to the highest price bin
price_labels = ['very cheap', 'cheap', 'normal', 'expensive', 'very expensive']

# Cut 'price_per_m2' into as many equal-width bins as there are labels (5)
df['price_per_m2_cat'] = pd.cut(df['price_per_m2'],
                                bins=len(price_labels),
                                labels=price_labels)

# Show unique values
df['price_per_m2_cat'].unique()

## One Hot Encoding

In [None]:
# One Hot Encoding of the variable 'price_per_m2_cat'.
# get_dummies() replaces the source column by one dummy column per category,
# so on a re-run of this cell the source column no longer exists. Check for it
# explicitly instead of swallowing every possible error with a bare 'except'.
if 'price_per_m2_cat' in df.columns:
    df = pd.get_dummies(df,
                        columns=['price_per_m2_cat'],
                        drop_first=False)
else:
    print("Dummy variables already exist")

# Convert boolean values to integers (0 and 1)
dummy_columns = df.filter(like='price_per_m2_cat').columns
df[dummy_columns] = df[dummy_columns].astype(int)

# Show values of the new dummy variables
df[dummy_columns].head()

## Scaling

### Min-Max Scaling

In [None]:
# Min-Max scaling of 'area': subtract the minimum and divide by the range
area_min = df['area'].min()
area_max = df['area'].max()
df['area_scaled_minmax'] = (df['area'] - area_min) / (area_max - area_min)

# Histograms of 'area' and 'area_scaled_minmax', side by side
ax = df[['area', 'area_scaled_minmax']].hist(bins=25, figsize=(7, 3), color='steelblue')

# Same y-axis range and axis labels for each subplot
x_labels = ['Area', 'Min_Max_Scaled(Area)']
for subplot, x_label in zip(ax.flatten(), x_labels):
    subplot.set_ylim(0, 500)
    subplot.set_xlabel(x_label)
    subplot.set_ylabel('Frequency')

# Display the plot
plt.show()

### Max-Absolute Scaling

In [None]:
# Max-Absolute scaling of 'area': divide by the largest absolute value
area_abs_max = df['area'].abs().max()
df['area_scaled_max_abs'] = df['area'] / area_abs_max

# Histograms of 'area' and 'area_scaled_max_abs', side by side
ax = df[['area', 'area_scaled_max_abs']].hist(bins=25, figsize=(7, 3), color='steelblue')

# Same y-axis range and axis labels for each subplot
x_labels = ['Area', 'Max_Abs_Scaled(Area)']
for subplot, x_label in zip(ax.flatten(), x_labels):
    subplot.set_ylim(0, 500)
    subplot.set_xlabel(x_label)
    subplot.set_ylabel('Frequency')

# Display the plot
plt.show()

### Robust Scaling

In [None]:
# Robust scaling of 'area': centre on the median and divide by the
# interquartile range (75% quantile minus 25% quantile)
area_median = df['area'].median()
area_iqr = df['area'].quantile(0.75) - df['area'].quantile(0.25)
df['area_scaled_robust'] = (df['area'] - area_median) / area_iqr

# Histograms of 'area' and 'area_scaled_robust', side by side
ax = df[['area', 'area_scaled_robust']].hist(bins=25, figsize=(7, 3), color='steelblue')

# Same y-axis range and axis labels for each subplot
x_labels = ['Area', 'Scaled_Robust(Area)']
for subplot, x_label in zip(ax.flatten(), x_labels):
    subplot.set_ylim(0, 500)
    subplot.set_xlabel(x_label)
    subplot.set_ylabel('Frequency')

# Display the plot
plt.show()

## Standardization

### Z-score Normalization

In [None]:
# Z-score normalization of 'area': subtract the mean and divide by the
# standard deviation
area_mean = df['area'].mean()
area_std = df['area'].std()
df['area_scaled_z'] = (df['area'] - area_mean) / area_std

# Histograms of 'area' and 'area_scaled_z', side by side
ax = df[['area', 'area_scaled_z']].hist(bins=25, figsize=(7, 3), color='steelblue')

# Same y-axis range and axis labels for each subplot
x_labels = ['Area', 'Scaled_Z(Area)']
for subplot, x_label in zip(ax.flatten(), x_labels):
    subplot.set_ylim(0, 500)
    subplot.set_xlabel(x_label)
    subplot.set_ylabel('Frequency')

# Display the plot
plt.show()

## Transformation

### Log-Transformation

In [None]:
# Natural logarithm of the variable 'area'
area_log = np.log(df['area'])
df['area_log'] = area_log

# Histograms of 'area' and 'area_log', side by side
ax = df[['area', 'area_log']].hist(bins=25, figsize=(7, 3), color='steelblue')

# Same y-axis range and axis labels for each subplot
x_labels = ['Area', 'Log(Area)']
for subplot, x_label in zip(ax.flatten(), x_labels):
    subplot.set_ylim(0, 500)
    subplot.set_xlabel(x_label)
    subplot.set_ylabel('Frequency')

# Display the plot
plt.show()

### SQRT-Transformation

In [None]:
# Square root of the variable 'area'
area_sqrt = np.sqrt(df['area'])
df['area_sqrt'] = area_sqrt

# Histograms of 'area' and 'area_sqrt', side by side
ax = df[['area', 'area_sqrt']].hist(bins=25, figsize=(7, 3), color='steelblue')

# Same y-axis range and axis labels for each subplot
x_labels = ['Area', 'SQRT(Area)']
for subplot, x_label in zip(ax.flatten(), x_labels):
    subplot.set_ylim(0, 500)
    subplot.set_xlabel(x_label)
    subplot.set_ylabel('Frequency')

# Display the plot
plt.show()

### Box-Cox Transformation

In [None]:
# Box-Cox Transformation of the variable 'area'
from scipy.stats import boxcox

# 'area' was created with the nullable 'Int64' dtype. Depending on the pandas
# version, such a column may be handed to SciPy as an object array, which
# boxcox() cannot process, so cast it to a plain float64 column first.
# boxcox() requires strictly positive values and raises a ValueError otherwise.
area_values = df['area'].astype('float64')

# Apply Box-Cox transformation; the second return value is the lambda that
# boxcox() selected for the transformation
area_boxcox, boxcox_lambda = boxcox(area_values)
df['area_boxcox'] = area_boxcox

# Histogram of the variable 'area' and 'area_boxcox'
ax = df[['area', 'area_boxcox']].hist(bins=25, 
                                      figsize=(7, 3), 
                                      color='steelblue')

# Set the y-axis range for each subplot and add labels
for axis, label in zip(ax.flatten(), ['Area', 'Box-Cox(Area)']):
    axis.set_ylim(0, 500)
    axis.set_xlabel(label)
    axis.set_ylabel('Frequency')

# Display the plot
plt.show()

## Combining & organizing data

### Reading rental apartment data with geocoded addresses

In [None]:
# Variables in the geocoded file:
#   lat        : geographical latitude
#   lon        : geographical longitude
#   bfs_number : official municipality id
#   bfs_name   : official municipality name

# Load the geocoded apartment data (semicolon-separated, UTF-8 encoded)
csv_options = {'sep': ';', 'encoding': 'utf-8'}
df_geo = pd.read_csv('./Data/apartments_data_geocoded.csv', **csv_options)

# Preview the first five records
df_geo.head()

### Join geo-information to rental apartment data using .merge()

In [None]:
# Join the geo-information to the apartments via the shared key
# 'web-scraper-order' (default join type of merge, i.e. an inner join)
geo_columns = ['web-scraper-order', 'lat', 'lon', 'bfs_number', 'bfs_name']
df2 = pd.merge(df, df_geo[geo_columns], on="web-scraper-order")

# Show data
shown_columns = ['web-scraper-order', 'address_raw', 'price', 'price_per_m2',
                 'area', 'rooms', 'lat', 'lon', 'bfs_number', 'bfs_name']
df2[shown_columns].head()

### Reading municipality-level data

In [None]:
# Variables in the municipality file:
#   bfs_number : municipality id
#   bfs_name   : municipality name
#   pop        : number of residents
#   pop_dens   : population density per km2
#   frg_pct    : percentage foreigners
#   emp        : number of employees

# Load the sheet 'data_for_import' of the Excel workbook
excel_path = './Data/municipality_data.xlsx'
df_municip = pd.read_excel(excel_path, sheet_name='data_for_import')

# Preview the first five records
df_municip.head(5)

### Join municipality data to rental apartment data using .merge()

In [None]:
# The key 'bfs_number' must be present and identical in both data sets;
# the default join type of merge (inner join) is used
municip_columns = ['bfs_number', 'pop', 'pop_dens', 'frg_pct', 'emp']
df3 = pd.merge(df2, df_municip[municip_columns], on="bfs_number")

# Show data
shown_columns = ['web-scraper-order', 'address_raw', 'price', 'price_per_m2',
                 'area', 'rooms', 'lat', 'lon', 'bfs_number', 'bfs_name',
                 'pop', 'pop_dens']
df3[shown_columns].head()

## Sorting data

In [None]:
# Sort by 'price', then by 'area', both in descending order
# (the most expensive apartments come first)
sort_keys = ['price', 'area']
df3 = df3.sort_values(by=sort_keys, ascending=[False, False])

# Show data
shown_columns = ['web-scraper-order', 'address_raw', 'price', 'price_per_m2',
                 'area', 'rooms', 'lat', 'lon', 'bfs_number', 'bfs_name',
                 'pop', 'pop_dens']
df3[shown_columns].head()

## Reshaping data

### Reshaping data using .stack() and .unstack()

In [None]:
# Small example: four columns of the first five rows of df3
df_sub = df3[['bfs_name', 'rooms', 'price', 'area']].head(5)
print('Original shape')
print(df_sub, '\n')

# stack() moves the column labels into an inner index level (wide -> long)
df_sub_stacked = df_sub.stack()
print('Stacked')
print(df_sub_stacked, '\n')

# unstack() moves the inner index level back to the columns (long -> wide)
df_sub_unstacked = df_sub_stacked.unstack()
print('Unstacked (= back to original shape)')
print(df_sub_unstacked)

### Reshaping data using .melt()

In [None]:
# Small example: three columns of the first five rows of df3
df_sub = df3[['rooms', 'price', 'area']][:5]
print('Original shape')
print(df_sub, '\n')

# Melt the SAME subset that is printed above (not the full data frame 'df'),
# so that the reshaped output can be compared with the original shape:
# 'rooms' stays an identifier column, while 'price' and 'area' are gathered
# into the columns 'variable' and 'value'
print('Reshaped using .melt()')
print(pd.melt(df_sub, id_vars=['rooms'], value_vars=['price', 'area']))

### Pivoting data using .pivot_table()

In [None]:
# Mean 'price', 'price_per_m2' and 'area' per number of rooms.
# 'values' takes the columns to aggregate and 'aggfunc' the aggregation;
# a {column: function} mapping belongs to 'aggfunc', not to 'values'.
value_columns = ['price', 'price_per_m2', 'area']
pivot_table = pd.pivot_table(df3[['rooms'] + value_columns],
                             index=['rooms'],
                             values=value_columns,
                             aggfunc='mean').round(2)

# Format the values to 2 decimal places with trailing zeros.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated in recent pandas versions.
pivot_table = pivot_table.apply(
    lambda column: column.map(
        lambda x: f"{x:.2f}" if isinstance(x, (int, float)) else x
    )
)

# Display the pivot table
pivot_table

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Collect the footer information first, then print it between two separator lines
separator = '-----------------------------------'
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(separator)
# Name of the operating-system family in upper case
print(os.name.upper())
# Operating system and its release
print(platform.system(), '|', platform.release())
print('Datetime:', current_time)
print('Python Version:', python_version())
print(separator)