# Data Exploration and Cleaning: Transactions

In [1]:
import numpy as np
import pandas as pd
from helpers import DATA_DIR, RAW_DATA_DIR

import plotly.express as px

## Exploration

In [2]:
# Load the raw Kuala Lumpur transaction records from the parquet dump
transactions_path = RAW_DATA_DIR / 'transactions_KL.parquet'
df_transactions = pd.read_parquet(transactions_path)
df_transactions

Unnamed: 0,project_name,spa_date,address,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,09/06/2023,"✕✕✕, JALAN PIKRAMA",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,"2,196 ft²",,342,750000
1,BANDAR BARU SRI PETALING,01/06/2023,"✕✕. ✕✕, JALAN PERLAK 3",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,29/05/2023,"✕✕ ✕, JALAN 12/149L",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2½,,"3,197 ft²",,188,600000
3,BANDAR BARU SRI PETALING,25/05/2023,"✕✕. ✕✕✕, JALAN PASAI",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,22/05/2023,"✕✕, JALAN SRI PETALING 5",SEMI-D,LEASEHOLD,2½,,"4,801 ft²",,250,1200000
...,...,...,...,...,...,...,...,...,...,...,...
294562,HERITAGE STATION HOTEL,13/11/1990,"✕✕✕-✕✕✕, BB WANGSA MAJU",FLAT,LEASEHOLD,1,2,493 ft²,493 ft²,71,35000
294563,IDAMAN PUTERI,10/01/2005,"✕✕-✕, JALAN GOMBAK",CONDOMINIUM,FREEHOLD,1,3,1454 ft²,1454 ft²,150,218025
294564,KELAB LE CHATEAU II,25/02/2008,"✕-✕✕-✕, JALAN KIARA 3",CONDOMINIUM,FREEHOLD,1,3,593 ft²,593 ft²,194,115000
294565,MUTIARA SENTUL CONDOMINIUM,10/08/2009,"✕-✕-✕, OFF JALAN SENTUL",APARTMENT,LEASEHOLD,1,2,1193 ft²,1193 ft²,197,235000


In [3]:
# Summarise column dtypes and non-null counts of the freshly loaded frame
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294567 entries, 0 to 294566
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   project_name   294567 non-null  object
 1   spa_date       294567 non-null  object
 2   address        294546 non-null  object
 3   building_type  294567 non-null  object
 4   tenure         294567 non-null  object
 5   floors         294567 non-null  object
 6   rooms          294567 non-null  object
 7   land_area      294567 non-null  object
 8   built_up       294567 non-null  object
 9   price_psf      294567 non-null  object
 10  price          294567 non-null  object
dtypes: object(11)
memory usage: 24.7+ MB


At a glance:
1. `project_name` column contains strings. No missing values.
2. `spa_date` is detected as strings instead of date, need to change to date type. No missing values. 
3. `address` column contain strings. There are missing values.
4. `building_type` column contains strings. No missing values.
5. `tenure` column contains strings. No missing values.
6. `floors` is detected as strings instead of float, need to change to float type. No missing values.
7. `rooms` is detected as strings instead of float, need to change to float type. There are 'nan' strings which are not detected as missing values.
8. `land_area` is detected as strings instead of float, need to change to float type. No missing values.
9. `built_up` is detected as strings instead of float, need to change to float type. There are missing values.
10. `price_psf` and `price` are detected as strings instead of float, need to change to float type. No missing values.

In [4]:
# Parse the day/month/year strings (e.g. 09/06/2023) into timestamps
spa_date_parsed = pd.to_datetime(df_transactions['spa_date'], format='%d/%m/%Y')
df_transactions['spa_date'] = spa_date_parsed
df_transactions['spa_date']

0        2023-06-09
1        2023-06-01
2        2023-05-29
3        2023-05-25
4        2023-05-22
            ...    
294562   1990-11-13
294563   2005-01-10
294564   2008-02-25
294565   2009-08-10
294566   1995-08-18
Name: spa_date, Length: 294567, dtype: datetime64[ns]

In [5]:
# Order the dates to reveal the earliest and latest transactions
spa_dates_sorted = df_transactions['spa_date'].sort_values()
spa_dates_sorted

56134    1909-05-11
128655   1933-10-25
262130   1955-07-25
64404    1959-07-27
266269   1960-11-14
            ...    
224137   2023-06-09
266662   2023-06-13
87238    2023-06-13
214691   2023-06-14
125442   2023-06-19
Name: spa_date, Length: 294567, dtype: datetime64[ns]

The data type of the `spa_date` column has been changed to datetime type.

In [6]:
# Share of rows without an address, followed by the affected rows
address_missing = df_transactions['address'].isnull()
print(f"% missing values: {address_missing.sum() / len(df_transactions) * 100}")
df_transactions[address_missing]

% missing values: 0.0071291081485706145


Unnamed: 0,project_name,spa_date,address,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
18211,ALAM DAMAI,1999-02-15,,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,3.0,"1,647 ft²","1,087 ft²",107,175750
27824,TAMAN SRI HARTAMAS,1996-01-10,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,2,3.0,"1,650 ft²","1,671 ft²",262,432000
70435,KAMPUNG BATU MUDA,2000-06-06,,BUNGALOW,LEASEHOLD,1,3.0,"4,004 ft²",834 ft²,30,120000
71295,LUCKY GARDEN (JALAN PUCHONG),1997-04-21,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,1,3.0,"1,754 ft²",869 ft²,114,200000
71296,LUCKY GARDEN (JALAN PUCHONG),1997-04-21,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,1,3.0,"1,775 ft²",869 ft²,113,200000
74962,SRI BINTANG HEIGHTS,1998-04-29,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,2,3.0,"1,787 ft²","1,323 ft²",123,220000
75638,LAMAN RESIDENCE,2008-09-15,,SEMI-D,LEASEHOLD,2,4.0,"3,057 ft²","2,485 ft²",212,648000
102998,HAPPY GARDEN FLAT,1999-09-07,,FLAT,FREEHOLD,1,3.0,708 ft²,708 ft²,132,93500
112720,TAMAN BUKIT CHERAS,1998-03-20,,FLAT,FREEHOLD,1,,734 ft²,734 ft²,109,80000
147816,SALAK SOUTH GARDEN,2002-10-29,,FLAT,FREEHOLD,1,2.0,625 ft²,625 ft²,56,35000


We can drop the `address` column based on the following reasons:
1. No detailed locational scope is considered in this study
2. Only high-level geographical scope is considered

In [7]:
# Frequency of each building type
building_type_counts = df_transactions['building_type'].value_counts()
building_type_counts

building_type
CONDOMINIUM                      99778
TERRACE HOUSE - INTERMEDIATE     47865
APARTMENT                        45264
FLAT                             44960
SERVICE RESIDENCE                27787
BUNGALOW                          7871
SEMI-D                            5824
TOWN HOUSE                        4983
TERRACE HOUSE - CORNER LOT        3946
TERRACE HOUSE - END LOT           2966
CLUSTER HOUSE - INTERMEDIATE      2588
CLUSTER HOUSE                      506
CLUSTER HOUSE - CORNER LOT         128
CLUSTER HOUSE - END LOT             99
CONDOMINIUM - CLUB                   1
CONDOMINIUM - RESIDENTIAL USE        1
Name: count, dtype: int64

Findings:
1. The largest share of the transactions are condominiums, followed by intermediate terrace houses and apartments
2. Cluster house has an unknown category which does not fall into intermediate, corner lot or end lot.
3. Condominium has a weird category called "residential use", but only has one transaction

In [8]:
# Count each tenure once, then report the freehold share and the full table
tenure_counts = df_transactions['tenure'].value_counts()
print(f"% Freehold: {tenure_counts['FREEHOLD'] / len(df_transactions) * 100}")
tenure_counts

% Freehold: 61.22987300003055


tenure
FREEHOLD     180363
LEASEHOLD    114204
Name: count, dtype: int64

Findings:
1. Majority of the transactions are freehold (61.23%)

In [9]:
# Frequency of each raw `floors` value (still strings at this point)
floors_counts = df_transactions['floors'].value_counts()
floors_counts

floors
1     232497
2      44132
3      11788
2½      3059
0       1175
4        967
3½       264
1½       256
6        154
5        150
8         41
4½        18
9         17
20        15
10        11
7          7
11         4
13         2
15         2
99         2
24         2
46         1
12         1
35         1
21         1
Name: count, dtype: int64

In [10]:
# Check for both real nulls and the literal text 'nan' in `floors`
floors_raw = df_transactions['floors']
print(floors_raw.isnull().sum())
print((floors_raw == 'nan').sum())

0
0


Findings:
1. Half floors are represented by the "½" character instead of .5
2. There are weird values like:
    - 0 floor
    - 99 floors
3. Majority of the transactions are 1 floor
4. No missing values

In [11]:
# Frequency of each raw `rooms` value
rooms_counts = df_transactions['rooms'].value_counts()
rooms_counts

rooms
3       104800
3.0      56962
2        30944
nan      30670
4.0      20333
         ...  
31           1
17.0         1
24.0         1
18.0         1
148          1
Name: count, Length: 70, dtype: int64

In [12]:
# Percentage of rows whose `rooms` entry is the literal text 'nan'
rooms_is_nan_text = df_transactions['rooms'].eq('nan')
rooms_is_nan_text.sum() / len(df_transactions) * 100

10.411892710317177

Findings:
1. Weird number of rooms:
    - 0 room
    - 46 rooms
2. Majority of the transactions are 3 rooms
3. 10.41% of the transactions have NaN

In [13]:
# Frequency of each raw `land_area` string
land_area_counts = df_transactions['land_area'].value_counts()
land_area_counts

land_area
1,647 ft²     3652
883 ft²       3218
1,539 ft²     2388
592 ft²       2146
1,540 ft²     1882
              ... 
10,807 ft²       1
2,202 ft²        1
25,306 ft²       1
4,690 ft²        1
5593 ft²         1
Name: count, Length: 9147, dtype: int64

In [14]:
# Percentage of null `land_area` entries
land_area_null_count = df_transactions['land_area'].isnull().sum()
land_area_null_count / len(df_transactions) * 100

0.0

Findings:
1. There are "ft²" units and "," thousands separators in `land_area`. Need to remove them.
2. No missing values

In [15]:
# Frequency of each raw `built_up` string
built_up_counts = df_transactions['built_up'].value_counts()
built_up_counts

built_up
nan          5613
861 ft²      1849
1000 ft²     1670
549 ft²      1452
556 ft²      1445
             ... 
1,031 ft²       1
113 ft²         1
5,414 ft²       1
5197 ft²        1
5593 ft²        1
Name: count, Length: 7610, dtype: int64

In [16]:
# Percentage of rows whose `built_up` entry is the literal text 'nan'
built_up_is_nan_text = df_transactions['built_up'].eq('nan')
built_up_is_nan_text.sum() / len(df_transactions) * 100

1.9055087637108028

Findings:
1. 1.91% of the data is missing. Need further investigation
2. There are "ft2" and "," in the data. Need to remove them

In [17]:
# Frequency of each raw `price_psf` string
price_psf_counts = df_transactions['price_psf'].value_counts()
price_psf_counts

price_psf
152      1246
155      1241
71       1123
139      1104
143      1092
         ... 
1,645       1
1,980       1
3,099       1
1,578       1
2,326       1
Name: count, Length: 2144, dtype: int64

In [18]:
# Percentage of null `price_psf` entries
price_psf_null_count = df_transactions['price_psf'].isna().sum()
price_psf_null_count / len(df_transactions) * 100

0.0

Findings:
1. No missing values
2. Have "," in the numbers. Need to remove them

In [19]:
# Frequency of each raw `price` string
price_counts = df_transactions['price'].value_counts()
price_counts

price
200,000    3603
150,000    3587
300,000    3511
250,000    3382
500,000    3181
           ... 
101,800       1
123,620       1
73,360        1
554,338       1
518,296       1
Name: count, Length: 18778, dtype: int64

In [20]:
# Percentage of null `price` entries
price_null_count = df_transactions['price'].isna().sum()
price_null_count / len(df_transactions) * 100

0.0

Findings:
1. No missing data.
2. Have "," in the data. Need to remove.

### Concluding Remarks
1. What does each dataset look like? Is it in row format or does it have merged cells?
    - It is in row format, with no merged cells.
2. What is the start and end of each dataset?
    - From 1909 to 2023
3. Is the dataset from the area of interest, Kuala Lumpur?
    - Yes
4. How is the data quality?
    - There are missing values in some columns (`built_up` and `rooms`). Need to decide to remove rows or impute.
    - Values in some columns (`land_area` and `built_up`) have a unit of measurement like "ft²". Need to remove.
    - Values in the `floors` column use fractions instead of decimals. Need to change.
    - Values in numerical columns (`land_area`, `built_up`, `price_psf` and `price`) have commas. Need to remove.
    - Some values in some columns (`floors` and `rooms`) do not make sense. Need further investigation.
    - Only landed properties are in the dataset. Need another round of scraping for high rise real estate.

## Preparation

Selection:
1. Drop `address` column

Cleaning:
1. Change fraction in `floors` column, i.e. 1/2 to .5
2. Remove comma in `price_psf`
3. Remove comma in `price`
4. Remove comma and ft2 in `land_area` and `built_up`
5. Remove 'acre' in `land_area` and convert into sqft
6. Investigate missing values in `built_up` and remove/impute
7. For `rooms` column:
    - Investigate missing values in `rooms` and remove/impute
    - Investigate weird `rooms` values (0 and 46) and remove/impute
8. Investigate weird `floors` values (0 and 99) and remove/impute

### Remove address column

In [21]:
# Drop `address`: only high-level geography is used in this study
df_transactions = df_transactions.drop('address', axis='columns')
df_transactions

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,"2,196 ft²",,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2½,,"3,197 ft²",,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2½,,"4,801 ft²",,250,1200000
...,...,...,...,...,...,...,...,...,...,...
294562,HERITAGE STATION HOTEL,1990-11-13,FLAT,LEASEHOLD,1,2,493 ft²,493 ft²,71,35000
294563,IDAMAN PUTERI,2005-01-10,CONDOMINIUM,FREEHOLD,1,3,1454 ft²,1454 ft²,150,218025
294564,KELAB LE CHATEAU II,2008-02-25,CONDOMINIUM,FREEHOLD,1,3,593 ft²,593 ft²,194,115000
294565,MUTIARA SENTUL CONDOMINIUM,2009-08-10,APARTMENT,LEASEHOLD,1,2,1193 ft²,1193 ft²,197,235000


### Change fraction to decimal

In [22]:
# Convert mixed-fraction floor counts (e.g. "2½") via the project helper
from helpers import convert_mixed_fraction_to_decimal

floors_converted = df_transactions['floors'].apply(convert_mixed_fraction_to_decimal)
df_transactions['floors'] = floors_converted
df_transactions

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,"2,196 ft²",,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,"3,197 ft²",,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,"4,801 ft²",,250,1200000
...,...,...,...,...,...,...,...,...,...,...
294562,HERITAGE STATION HOTEL,1990-11-13,FLAT,LEASEHOLD,1,2,493 ft²,493 ft²,71,35000
294563,IDAMAN PUTERI,2005-01-10,CONDOMINIUM,FREEHOLD,1,3,1454 ft²,1454 ft²,150,218025
294564,KELAB LE CHATEAU II,2008-02-25,CONDOMINIUM,FREEHOLD,1,3,593 ft²,593 ft²,194,115000
294565,MUTIARA SENTUL CONDOMINIUM,2009-08-10,APARTMENT,LEASEHOLD,1,2,1193 ft²,1193 ft²,197,235000


### Remove commas

In [23]:
# Turn the literal text 'nan' into a real missing value so that
# isna()/pd.isna() recognise it in the following cleaning steps
df_transactions = df_transactions.replace(to_replace='nan', value=np.nan)
df_transactions

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,"2,196 ft²",,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,"3,197 ft²",,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,"4,801 ft²",,250,1200000
...,...,...,...,...,...,...,...,...,...,...
294562,HERITAGE STATION HOTEL,1990-11-13,FLAT,LEASEHOLD,1,2,493 ft²,493 ft²,71,35000
294563,IDAMAN PUTERI,2005-01-10,CONDOMINIUM,FREEHOLD,1,3,1454 ft²,1454 ft²,150,218025
294564,KELAB LE CHATEAU II,2008-02-25,CONDOMINIUM,FREEHOLD,1,3,593 ft²,593 ft²,194,115000
294565,MUTIARA SENTUL CONDOMINIUM,2009-08-10,APARTMENT,LEASEHOLD,1,2,1193 ft²,1193 ft²,197,235000


In [24]:
# Strip thousands separators from the string-typed numeric columns.
# The vectorised `.str` accessor leaves missing values as NaN, so no
# per-element null check is needed; regex=False treats ',' literally.
columns_with_comma = ['price_psf', 'price', 'land_area', 'built_up', 'rooms']

for column in columns_with_comma:
    df_transactions[column] = df_transactions[column].str.replace(',', '', regex=False)

df_transactions

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,2196 ft²,,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,3197 ft²,,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,4801 ft²,,250,1200000
...,...,...,...,...,...,...,...,...,...,...
294562,HERITAGE STATION HOTEL,1990-11-13,FLAT,LEASEHOLD,1,2,493 ft²,493 ft²,71,35000
294563,IDAMAN PUTERI,2005-01-10,CONDOMINIUM,FREEHOLD,1,3,1454 ft²,1454 ft²,150,218025
294564,KELAB LE CHATEAU II,2008-02-25,CONDOMINIUM,FREEHOLD,1,3,593 ft²,593 ft²,194,115000
294565,MUTIARA SENTUL CONDOMINIUM,2009-08-10,APARTMENT,LEASEHOLD,1,2,1193 ft²,1193 ft²,197,235000


### Remove units

In [25]:
# Strip the ' ft²' unit suffix from the area columns.
# The vectorised `.str` accessor leaves missing values as NaN, so no
# per-element null check is needed; regex=False treats the unit literally.
columns_with_ft2 = ['land_area', 'built_up']

for column in columns_with_ft2:
    df_transactions[column] = df_transactions[column].str.replace(' ft²', '', regex=False)

df_transactions

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,2196,,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,3197,,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,4801,,250,1200000
...,...,...,...,...,...,...,...,...,...,...
294562,HERITAGE STATION HOTEL,1990-11-13,FLAT,LEASEHOLD,1,2,493,493,71,35000
294563,IDAMAN PUTERI,2005-01-10,CONDOMINIUM,FREEHOLD,1,3,1454,1454,150,218025
294564,KELAB LE CHATEAU II,2008-02-25,CONDOMINIUM,FREEHOLD,1,3,593,593,194,115000
294565,MUTIARA SENTUL CONDOMINIUM,2009-08-10,APARTMENT,LEASEHOLD,1,2,1193,1193,197,235000


In [26]:
# Convert areas recorded in acres into square feet.
# Rows are selected by their explicit ' acre' suffix rather than by
# "is not all digits", so missing values and decimal-valued square-foot
# entries are never rescaled, and re-running the cell is a no-op.
SQFT_PER_ACRE = 43560  # 1 acre = 43,560 ft²
columns_with_acre = ['land_area', 'built_up']

for column in columns_with_acre:
    # na=False: NaN (and any already-converted numeric entries) are not acre rows
    is_acre = df_transactions[column].str.endswith(' acre', na=False)
    acre_values = df_transactions.loc[is_acre, column].str.replace(' acre', '', regex=False)
    df_transactions.loc[is_acre, column] = acre_values.astype(float) * SQFT_PER_ACRE

df_transactions

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,2196,,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,3197,,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,4801,,250,1200000
...,...,...,...,...,...,...,...,...,...,...
294562,HERITAGE STATION HOTEL,1990-11-13,FLAT,LEASEHOLD,1,2,493,493,71,35000
294563,IDAMAN PUTERI,2005-01-10,CONDOMINIUM,FREEHOLD,1,3,1454,1454,150,218025
294564,KELAB LE CHATEAU II,2008-02-25,CONDOMINIUM,FREEHOLD,1,3,593,593,194,115000
294565,MUTIARA SENTUL CONDOMINIUM,2009-08-10,APARTMENT,LEASEHOLD,1,2,1193,1193,197,235000


### Investigate missing values

In [27]:
# Absolute and relative amount of missing `built_up` values
built_up_missing_count = df_transactions['built_up'].isna().sum()
built_up_missing_pct = built_up_missing_count / len(df_transactions) * 100
print(
    f"Number of rows with missing values: {built_up_missing_count}",
    f"\nPercentage of missing values: {built_up_missing_pct}",
)

Number of rows with missing values: 5613 
Percentage of missing values: 1.9055087637108028


In [28]:
# Rows where `built_up` is missing
built_up_missing = df_transactions['built_up'].isna()
df_transactions.loc[built_up_missing]

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,2196,,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,3197,,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,4801,,250,1200000
...,...,...,...,...,...,...,...,...,...,...
76747,TAMAN NAM FONG,2023-03-06,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,1540,,325,500000
76748,TAMAN NAM FONG,2021-03-03,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,1032,,618,638000
76749,TAMAN NAM FONG,2020-12-02,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,1540,,409,630000
76761,LAMAN BAYU,2023-04-26,BUNGALOW,FREEHOLD,3,,4553,,703,3200000


It seems that transactions with missing `built_up` also have missing values for `rooms`.

In [29]:
# How many of the rows lacking `built_up` also lack `rooms`
built_up_missing = df_transactions['built_up'].isna()
df_transactions.loc[built_up_missing, 'rooms'].isna().sum()

5612

Out of 5613 transactions with missing `built_up`, 5612 also have missing `rooms`.

In [30]:
# Missing `built_up` rows per transaction date, in chronological order
built_up_missing = df_transactions['built_up'].isna()
missing_by_date = df_transactions.loc[built_up_missing, 'spa_date'].value_counts()
missing_by_date.sort_index()

spa_date
2000-05-10    1
2009-09-03    1
2011-09-30    1
2012-05-18    1
2015-05-07    1
             ..
2023-05-31    4
2023-06-01    1
2023-06-06    1
2023-06-08    1
2023-06-09    2
Name: count, Length: 906, dtype: int64

In [31]:
# Bar chart of missing `built_up` rows per transaction date
built_up_missing = df_transactions['built_up'].isna()
px.bar(df_transactions.loc[built_up_missing, 'spa_date'].value_counts())

  v = v.dt.to_pydatetime()


Missing `built_up` values mostly happened from year 2000 onwards.

In [32]:
# Bar chart of missing `built_up` rows per project
built_up_missing = df_transactions['built_up'].isna()
px.bar(df_transactions.loc[built_up_missing, 'project_name'].value_counts())

Missing values scatter around a number of projects, not just specific to one. Hence missing values could be due to human error.

In [33]:
# Building types among the rows with missing `built_up`
built_up_missing = df_transactions['built_up'].isna()
df_transactions.loc[built_up_missing, 'building_type'].value_counts()

building_type
TERRACE HOUSE - INTERMEDIATE    3318
BUNGALOW                         667
SEMI-D                           489
TERRACE HOUSE - CORNER LOT       360
TERRACE HOUSE - END LOT          302
TOWN HOUSE                       290
CLUSTER HOUSE - INTERMEDIATE     151
CLUSTER HOUSE                     23
CLUSTER HOUSE - END LOT           13
Name: count, dtype: int64

The missing values are all from landed properties. Terrace house has the most missing values followed by bungalow and semi-D.

In [34]:
# Bar chart of `land_area` values among rows with missing `built_up`
built_up_missing = df_transactions['built_up'].isna()
px.bar(df_transactions.loc[built_up_missing, 'land_area'].value_counts())

In [35]:
# Bar chart of `tenure` values among rows with missing `built_up`
built_up_missing = df_transactions['built_up'].isna()
px.bar(df_transactions.loc[built_up_missing, 'tenure'].value_counts())

In [36]:
# Bar chart of `floors` values among rows with missing `built_up`
built_up_missing = df_transactions['built_up'].isna()
px.bar(df_transactions.loc[built_up_missing, 'floors'].value_counts())

In [37]:
# Bar chart of `price` values among rows with missing `built_up`
built_up_missing = df_transactions['built_up'].isna()
px.bar(df_transactions.loc[built_up_missing, 'price'].value_counts())

In [38]:
# Bar chart of `price_psf` values among rows with missing `built_up`
built_up_missing = df_transactions['built_up'].isna()
px.bar(df_transactions.loc[built_up_missing, 'price_psf'].value_counts())

Subsequent investigation on the transactions with missing `built_up` values versus other features revealed that the missing values are missing completely at random (MCAR), where the values are missing independently of other features.

Imputation strategy:
1. Random forest imputation (Jager et al., 2021) for MCAR, MAR and MNAR data in various domain
2. Multiple imputation by deterministic regression (Donlen, 2022) for MCAR data in real estate domain
3. MissForest (Waljee et al., 2013) for MCAR data in medical domain
4. Predictive mean matching, PMM (Heidt, 2019) for MAR data in medical domain
5. KNN imputation (Jadhav et al., 2019) for MCAR, MAR and MNAR data in UCI dataset

- Jager et al. (2021): https://www.frontiersin.org/articles/10.3389/fdata.2021.693674/full
- Donlen (2022): https://egrove.olemiss.edu/cgi/viewcontent.cgi?article=3744&context=hon_thesis
- Waljee et al. (2013): https://bmjopen.bmj.com/content/3/8/e002847.citation-tools
- Heidt (2019): https://dc.etsu.edu/cgi/viewcontent.cgi?article=5014&context=etd
- Jadhav et al (2019): https://www.tandfonline.com/doi/full/10.1080/08839514.2019.1637138

Based on the literature, we will explore:
1. Multiple imputation by deterministic regression
2. Random forest imputation
3. KNN imputation

But first, we find the correlation of the data points with respect to the prices.

In [39]:
# Target dtype per column: categoricals for labels, floats for measurements
categorical_columns = ['project_name', 'building_type', 'tenure']
float_columns = ['floors', 'rooms', 'land_area', 'built_up', 'price_psf', 'price']

type_conversion = {
    **{name: 'category' for name in categorical_columns},
    **{name: 'float' for name in float_columns},
}

# Cast every listed column in a single call
df_transactions = df_transactions.astype(type_conversion)

df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294567 entries, 0 to 294566
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   project_name   294567 non-null  category      
 1   spa_date       294567 non-null  datetime64[ns]
 2   building_type  294567 non-null  category      
 3   tenure         294567 non-null  category      
 4   floors         294567 non-null  float64       
 5   rooms          263897 non-null  float64       
 6   land_area      294567 non-null  float64       
 7   built_up       288954 non-null  float64       
 8   price_psf      294567 non-null  float64       
 9   price          294567 non-null  float64       
dtypes: category(3), datetime64[ns](1), float64(6)
memory usage: 16.9 MB


In [None]:
# Derive calendar features from the transaction date
df_transactions = df_transactions.assign(
    year=df_transactions['spa_date'].dt.year,
    month=df_transactions['spa_date'].dt.month,
)

df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294567 entries, 0 to 294566
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   project_name   294567 non-null  category      
 1   spa_date       294567 non-null  datetime64[ns]
 2   building_type  294567 non-null  category      
 3   tenure         294567 non-null  category      
 4   floors         294567 non-null  float64       
 5   rooms          263897 non-null  float64       
 6   land_area      294567 non-null  float64       
 7   built_up       288954 non-null  float64       
 8   price_psf      294567 non-null  float64       
 9   price          294567 non-null  float64       
 10  year           294567 non-null  int32         
 11  month          294567 non-null  int32         
dtypes: category(3), datetime64[ns](1), float64(6), int32(2)
memory usage: 19.2 MB


In [86]:
# Correlation heatmap over the remaining columns after excluding the
# label/date columns and `month`
excluded_columns = ['project_name', 'spa_date', 'building_type', 'tenure', 'month']
correlation_matrix = df_transactions.drop(columns=excluded_columns).corr()
px.imshow(correlation_matrix, text_auto=True)

In [56]:
# Median price per square foot for each project, highest first.
# `project_name` is categorical, so `observed=False` is passed explicitly:
# it keeps the current grouping behaviour and avoids relying on the
# deprecated implicit default for categorical groupers.
median_psf_by_project = (
    df_transactions[['project_name', 'price_psf']]
    .groupby('project_name', observed=False)
    .median()
    .sort_values(by='price_psf', ascending=False)
)
px.bar(median_psf_by_project)





In [90]:
# Median price per square foot for each building type (transactions after
# 1957 only), highest first.
# `building_type` is categorical, so `observed=False` is passed explicitly:
# it keeps the current grouping behaviour and avoids relying on the
# deprecated implicit default for categorical groupers.
median_psf_by_building_type = (
    df_transactions.query('year > 1957')[['building_type', 'price_psf']]
    .groupby('building_type', observed=False)
    .median()
    .sort_values(by='price_psf', ascending=False)
)
px.bar(median_psf_by_building_type, text_auto=True)





In [54]:
# Median price per square foot for each tenure type, highest first.
# `tenure` is categorical, so `observed=False` is passed explicitly:
# it keeps the current grouping behaviour and avoids relying on the
# deprecated implicit default for categorical groupers.
median_psf_by_tenure = (
    df_transactions[['tenure', 'price_psf']]
    .groupby('tenure', observed=False)
    .median()
    .sort_values(by='price_psf', ascending=False)
)
px.bar(median_psf_by_tenure, text_auto=True)





In [83]:
# Median price per square foot per year (transactions after 1957 only)
recent_transactions = df_transactions.query('year > 1957')
median_psf_by_year = (
    recent_transactions[['year', 'price_psf']]
    .groupby('year')
    .median()
    .sort_values(by='year', ascending=True)
)
px.bar(median_psf_by_year, text_auto=True)

In [84]:
# Median price per square foot per calendar month (transactions after 1957 only)
recent_transactions = df_transactions.query('year > 1957')
median_psf_by_month = (
    recent_transactions[['month', 'price_psf']]
    .groupby('month')
    .median()
    .sort_values(by='month', ascending=True)
)
px.bar(median_psf_by_month, text_auto=True)

In [43]:
# Absolute and relative amount of missing `rooms` values
rooms_missing_count = df_transactions['rooms'].isna().sum()
rooms_missing_pct = rooms_missing_count / len(df_transactions) * 100
print(
    f"Number of rows with missing values: {rooms_missing_count}",
    f"\nPercentage of missing values: {rooms_missing_pct}",
)

Number of rows with missing values: 30670 
Percentage of missing values: 10.411892710317177


In [44]:
# Rows where `rooms` is missing
rooms_missing = df_transactions['rooms'].isna()
df_transactions.loc[rooms_missing]

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,2196,,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,3197,,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,4801,,250,1200000
...,...,...,...,...,...,...,...,...,...,...
294507,PANTAI PANORAMA KONDO,2022-12-06,FLAT,LEASEHOLD,1,,657,657,297,195000
294521,WINSOR TOWER,2010-04-16,SERVICE RESIDENCE,FREEHOLD,1,,640,640,609,390000
294530,KAWASAN PERINDUSTRIAN TRISEGI,1999-09-27,FLAT,FREEHOLD,1,,511,511,88,45000
294532,TAMAN SUNGAI BESI (MEDIUM COST FLAT),1999-01-11,FLAT,FREEHOLD,1,,500,500,156,78000


Find literature on what is the best method to impute missing values