# Data Exploration and Cleaning: Transactions

In [5]:
import numpy as np
import pandas as pd
from helpers import DATA_DIR, RAW_DATA_DIR

## Exploration

In [7]:
df_transactions = pd.read_parquet(RAW_DATA_DIR / 'transactions_KL.parquet')
df_transactions

Unnamed: 0,project_name,spa_date,address,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,09/06/2023,"✕✕✕, JALAN PIKRAMA",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,"2,196 ft²",,342,750000
1,BANDAR BARU SRI PETALING,01/06/2023,"✕✕. ✕✕, JALAN PERLAK 3",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,29/05/2023,"✕✕ ✕, JALAN 12/149L",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2½,,"3,197 ft²",,188,600000
3,BANDAR BARU SRI PETALING,25/05/2023,"✕✕. ✕✕✕, JALAN PASAI",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,22/05/2023,"✕✕, JALAN SRI PETALING 5",SEMI-D,LEASEHOLD,2½,,"4,801 ft²",,250,1200000
...,...,...,...,...,...,...,...,...,...,...,...
257297,HERITAGE STATION HOTEL,13/11/1990,"✕✕✕-✕✕✕, BB WANGSA MAJU",FLAT,LEASEHOLD,1,2,493,493,71,35000
257298,IDAMAN PUTERI,10/01/2005,"✕✕-✕, JALAN GOMBAK",CONDOMINIUM,FREEHOLD,1,3,1454,1454,150,218025
257299,KELAB LE CHATEAU II,25/02/2008,"✕-✕✕-✕, JALAN KIARA 3",CONDOMINIUM,FREEHOLD,1,3,593,593,194,115000
257300,MUTIARA SENTUL CONDOMINIUM,10/08/2009,"✕-✕-✕, OFF JALAN SENTUL",APARTMENT,LEASEHOLD,1,2,1193,1193,197,235000


In [8]:
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257302 entries, 0 to 257301
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   project_name   257302 non-null  object
 1   spa_date       257302 non-null  object
 2   address        257283 non-null  object
 3   building_type  257302 non-null  object
 4   tenure         257302 non-null  object
 5   floors         257302 non-null  object
 6   rooms          257302 non-null  object
 7   land_area      257302 non-null  object
 8   built_up       257302 non-null  object
 9   price_psf      257302 non-null  object
 10  price          257302 non-null  object
dtypes: object(11)
memory usage: 21.6+ MB


At a glance:
1. `project_name` column contains strings. No missing values.
2. `spa_date` is detected as strings instead of date, need to change to date type. No missing values. 
3. `address` column contain strings. There are missing values.
4. `building_type` column contains strings. No missing values.
5. `tenure` column contains strings. No missing values.
6. `floors` is detected as strings instead of float, need to change to float type. No missing values.
7. `rooms` is automatically detected as float. There are 'nan' but not detected as missing values.
8. `land_area` is detected as strings instead of float, need to change to float type. No missing values.
9. `built_up` is detected as strings instead of float, need to change to float type. There are missing values.
10. `price` is detected as strings instead of float, need to change to float type. No missing values.

In [9]:
df_transactions['spa_date'] = pd.to_datetime(df_transactions['spa_date'], format='%d/%m/%Y')
df_transactions['spa_date']

0        2023-06-09
1        2023-06-01
2        2023-05-29
3        2023-05-25
4        2023-05-22
            ...    
257297   1990-11-13
257298   2005-01-10
257299   2008-02-25
257300   2009-08-10
257301   1995-08-18
Name: spa_date, Length: 257302, dtype: datetime64[ns]

In [10]:
df_transactions['spa_date'].sort_values()

56134    1909-05-11
91390    1933-10-25
224865   1955-07-25
64404    1959-07-27
229004   1960-11-14
            ...    
186872   2023-06-09
0        2023-06-09
229397   2023-06-13
177426   2023-06-14
88177    2023-06-19
Name: spa_date, Length: 257302, dtype: datetime64[ns]

The data type of the `spa_date` column has been changed to datetime type.

In [11]:
print(f"% missing values: {df_transactions['address'].isnull().sum() / len(df_transactions) * 100}")
df_transactions[df_transactions['address'].isnull()]

% missing values: 0.00738431881602164


Unnamed: 0,project_name,spa_date,address,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
18211,ALAM DAMAI,1999-02-15,,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,3.0,"1,647 ft²","1,087 ft²",107,175750
27824,TAMAN SRI HARTAMAS,1996-01-10,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,2,3.0,"1,650 ft²","1,671 ft²",262,432000
70435,KAMPUNG BATU MUDA,2000-06-06,,BUNGALOW,LEASEHOLD,1,3.0,"4,004 ft²",834 ft²,30,120000
71295,LUCKY GARDEN (JALAN PUCHONG),1997-04-21,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,1,3.0,"1,754 ft²",869 ft²,114,200000
71296,LUCKY GARDEN (JALAN PUCHONG),1997-04-21,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,1,3.0,"1,775 ft²",869 ft²,113,200000
74962,SRI BINTANG HEIGHTS,1998-04-29,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,2,3.0,"1,787 ft²","1,323 ft²",123,220000
75638,LAMAN RESIDENCE,2008-09-15,,SEMI-D,LEASEHOLD,2,4.0,"3,057 ft²","2,485 ft²",212,648000
110551,SALAK SOUTH GARDEN,2002-10-29,,FLAT,FREEHOLD,1,2.0,625,625,56,35000
110578,SALAK SOUTH GARDEN,1999-10-13,,FLAT,FREEHOLD,1,2.0,607,607,92,55800
110580,SALAK SOUTH GARDEN,1999-09-13,,FLAT,FREEHOLD,1,2.0,610,610,109,66500


We can drop the `address` column based on the following reasons:
1. No detailed locational scope is considered in this study
2. Only high-level geographical scope is considered

In [12]:
df_transactions['building_type'].value_counts()

building_type
CONDOMINIUM                      91191
TERRACE HOUSE - INTERMEDIATE     47865
APARTMENT                        34417
FLAT                             30679
SERVICE RESIDENCE                24237
BUNGALOW                          7871
SEMI-D                            5824
TOWN HOUSE                        4983
TERRACE HOUSE - CORNER LOT        3946
TERRACE HOUSE - END LOT           2966
CLUSTER HOUSE - INTERMEDIATE      2588
CLUSTER HOUSE                      506
CLUSTER HOUSE - CORNER LOT         128
CLUSTER HOUSE - END LOT             99
CONDOMINIUM - CLUB                   1
CONDOMINIUM - RESIDENTIAL USE        1
Name: count, dtype: int64

Findings:
1. Majority of the transactions are condominium, followed by intermediate terrace house and apartment
2. Cluster house has an unknown category which does not fall into intermediate, corner lot or end lot.
3. Condominium has a weird category called "residential use"

In [13]:
print(f"% Freehold: {df_transactions['tenure'].value_counts()['FREEHOLD'] / len(df_transactions) * 100}")
df_transactions['tenure'].value_counts()

% Freehold: 61.83278793013657


tenure
FREEHOLD     159097
LEASEHOLD     98205
Name: count, dtype: int64

Findings:
1. Majority of the transactions are freehold (61.83%)

In [14]:
df_transactions['floors'].value_counts()

floors
1     195232
2      44132
3      11788
2½      3059
0       1175
4        967
3½       264
1½       256
6        154
5        150
8         41
4½        18
9         17
20        15
10        11
7          7
11         4
13         2
15         2
99         2
24         2
46         1
12         1
35         1
21         1
Name: count, dtype: int64

In [15]:
df_transactions['floors'].isnull().sum()

0

Findings:
1. Half a floor are represented by 1/2 instead of .5
2. There are weird values like:
    - 0 floor
    - 99 floors
3. Majority of floors are 2 floors
4. No missing values

In [16]:
df_transactions['rooms'].value_counts()

rooms
3       104800
3.0      38041
2        30944
nan      26899
4.0      19459
         ...  
31           1
18           1
102          1
54           1
13.0         1
Name: count, Length: 69, dtype: int64

In [20]:
(df_transactions['rooms'] == 'nan').sum() / len(df_transactions) * 100

10.454252201692952

Findings:
1. Weird number of rooms:
    - 0 room
    - 46 rooms
2. Majority of the transactions are 3 rooms
3. 10.45% of the transactions have NaN

In [21]:
df_transactions['land_area'].value_counts()

land_area
1,647 ft²     3652
883 ft²       2796
1,539 ft²     2388
1,540 ft²     1882
1,650 ft²     1876
              ... 
3,191 ft²        1
37,943 ft²       1
3,909 ft²        1
3,252 ft²        1
511              1
Name: count, Length: 9355, dtype: int64

In [22]:
df_transactions['land_area'].isnull().sum() / len(df_transactions) * 100

0.0

Findings:
1. There are "ft2" and "," in land_area. Need to remove.
2. No missing values

In [23]:
df_transactions['built_up'].value_counts()

built_up
nan          5613
1000         1648
549          1332
926          1240
861 ft²      1163
             ... 
3,075 ft²       1
2,431 ft²       1
4,413 ft²       1
3,357 ft²       1
511             1
Name: count, Length: 7973, dtype: int64

In [25]:
(df_transactions['built_up'] == 'nan').sum() / len(df_transactions) * 100

2.1814832375962876

Findings:
1. 2.18% of the data is missing. Need further investigation
2. There are "ft2" and "," in the data. Need to remove them

In [26]:
df_transactions['price_psf'].value_counts()

price_psf
155      994
159      922
170      898
152      898
143      875
        ... 
2,218      1
1,906      1
2,039      1
2,524      1
2,326      1
Name: count, Length: 2140, dtype: int64

In [27]:
df_transactions['price_psf'].isna().sum() / len(df_transactions) * 100

0.0

Findings:
1. No missing values
2. Have "," in the numbers. Need to remove them

In [28]:
df_transactions['price'].value_counts()

price
300,000    3278
200,000    3051
250,000    3004
150,000    2971
500,000    2901
           ... 
297,900       1
290,900       1
231,400       1
204,400       1
518,296       1
Name: count, Length: 17654, dtype: int64

In [29]:
df_transactions['price'].isna().sum() / len(df_transactions) * 100

0.0

Findings:
1. No missing data.
2. Have "," in the data. Need to remove.

### Concluding Remarks
1. How does each dataset look like? Is it in row format or it has merged cells?
    - It is in row format, with no merged cells.
2. What is the start and end of each dataset?
    - From 1909 to 2023
3. Is the dataset from the area of interest, Kuala Lumpur?
    - Yes
4. How is the data quality?
    - There are missing values in some columns (`built_up` and `rooms`). Need to decide to remove rows or impute.
    - Values in some columns (`land_area` and `built_up`) has unit of measurement like "ft2". Need to remove.
    - Values in `floors` column uses fraction instead of decimal. Need to change.
    - Values in numerical columns (`land_area`, `built_up`, `price_psf` and `price`) has comma. Need to remove.
    - Some values in some columns (`floors` and `rooms`) does not make sense. Need further investigation.
    - Only landed properties are in the dataset. Need another round of scraping for high rise real estate.

## Preparation

Selection:
1. Drop `address` column

Cleaning:
1. Change fraction in `floors` column, i.e. 1/2 to .5
2. Remove comma in `price_psf`
3. Remove comma in `price`
4. Remove comma and ft2 in `land_area` and `built_up`
5. Investigate weird `floors` values (0 and 99) and remove/impute
6. Investigate weird `rooms` values (0 and 46) and remove/impute
7. Investigate missing values in `built_up` and remove/impute
8. Investigate missing values in `rooms` and remove/impute

There is a need for additional scraping of Brickz.my for high rise real estate.

In [30]:
df_transactions = df_transactions.drop(columns=['address'])
df_transactions

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,"2,196 ft²",,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2½,,"3,197 ft²",,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2½,,"4,801 ft²",,250,1200000
...,...,...,...,...,...,...,...,...,...,...
257297,HERITAGE STATION HOTEL,1990-11-13,FLAT,LEASEHOLD,1,2,493,493,71,35000
257298,IDAMAN PUTERI,2005-01-10,CONDOMINIUM,FREEHOLD,1,3,1454,1454,150,218025
257299,KELAB LE CHATEAU II,2008-02-25,CONDOMINIUM,FREEHOLD,1,3,593,593,194,115000
257300,MUTIARA SENTUL CONDOMINIUM,2009-08-10,APARTMENT,LEASEHOLD,1,2,1193,1193,197,235000


In [31]:
from helpers import convert_mixed_fraction_to_decimal

df_transactions['floors'] = df_transactions['floors'].apply(lambda x: convert_mixed_fraction_to_decimal(x))
df_transactions

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,"2,196 ft²",,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,"3,197 ft²",,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,"4,801 ft²",,250,1200000
...,...,...,...,...,...,...,...,...,...,...
257297,HERITAGE STATION HOTEL,1990-11-13,FLAT,LEASEHOLD,1,2,493,493,71,35000
257298,IDAMAN PUTERI,2005-01-10,CONDOMINIUM,FREEHOLD,1,3,1454,1454,150,218025
257299,KELAB LE CHATEAU II,2008-02-25,CONDOMINIUM,FREEHOLD,1,3,593,593,194,115000
257300,MUTIARA SENTUL CONDOMINIUM,2009-08-10,APARTMENT,LEASEHOLD,1,2,1193,1193,197,235000


In [32]:
columns_with_comma = ['price_psf', 'price', 'land_area', 'built_up']

for column in columns_with_comma:
    df_transactions[column] = df_transactions[column].apply(
        lambda x: x if pd.isna(x) else x.replace(',', '')
    )

df_transactions

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,2196 ft²,,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,3197 ft²,,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,4801 ft²,,250,1200000
...,...,...,...,...,...,...,...,...,...,...
257297,HERITAGE STATION HOTEL,1990-11-13,FLAT,LEASEHOLD,1,2,493,493,71,35000
257298,IDAMAN PUTERI,2005-01-10,CONDOMINIUM,FREEHOLD,1,3,1454,1454,150,218025
257299,KELAB LE CHATEAU II,2008-02-25,CONDOMINIUM,FREEHOLD,1,3,593,593,194,115000
257300,MUTIARA SENTUL CONDOMINIUM,2009-08-10,APARTMENT,LEASEHOLD,1,2,1193,1193,197,235000


In [33]:
columns_with_ft2 = ['land_area', 'built_up']

for column in columns_with_ft2:
    df_transactions[column] = df_transactions[column].apply(
        lambda x: x if pd.isna(x) else x.replace(' ft²', '')
    )

df_transactions

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,2196,,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,3197,,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,4801,,250,1200000
...,...,...,...,...,...,...,...,...,...,...
257297,HERITAGE STATION HOTEL,1990-11-13,FLAT,LEASEHOLD,1,2,493,493,71,35000
257298,IDAMAN PUTERI,2005-01-10,CONDOMINIUM,FREEHOLD,1,3,1454,1454,150,218025
257299,KELAB LE CHATEAU II,2008-02-25,CONDOMINIUM,FREEHOLD,1,3,593,593,194,115000
257300,MUTIARA SENTUL CONDOMINIUM,2009-08-10,APARTMENT,LEASEHOLD,1,2,1193,1193,197,235000


In [27]:
print(
    f"Number of rows with missing values: {df_transactions['built_up'].isna().sum()}",
    f"\nPercentage of missing values: {df_transactions['built_up'].isna().sum() / len(df_transactions) * 100}",
)

Number of rows with missing values: 5613 
Percentage of missing values: 7.3108783994998445


In [29]:
df_transactions[df_transactions['built_up'].isna()]

Unnamed: 0,project_name,spa_date,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,2196,,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2.5,,3197,,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,SEMI-D,LEASEHOLD,2.5,,4801,,250,1200000
...,...,...,...,...,...,...,...,...,...,...
76747,TAMAN NAM FONG,2023-03-06,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,1540,,325,500000
76748,TAMAN NAM FONG,2021-03-03,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,1032,,618,638000
76749,TAMAN NAM FONG,2020-12-02,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,1540,,409,630000
76761,LAMAN BAYU,2023-04-26,BUNGALOW,FREEHOLD,3,,4553,,703,3200000


In [30]:
print(
    f"Number of rows with missing values: {df_transactions['rooms'].isna().sum()}",
    f"\nPercentage of missing values: {df_transactions['rooms'].isna().sum() / len(df_transactions) * 100}",
)

Number of rows with missing values: 6491 
Percentage of missing values: 8.454464936959466


Find literature on what is the best method to impute missing values