In [18]:
import pandas as pd
from helpers import DATA_DIR, RAW_DATA_DIR

In [19]:
# Load the raw transactions workbook into a DataFrame and preview it.
transactions_path = RAW_DATA_DIR / 'transactions_KL.xlsx'
df_transactions = pd.read_excel(transactions_path)
df_transactions

Unnamed: 0,project_name,spa_date,address,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,09/06/2023,"✕✕✕, JALAN PIKRAMA",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,"2,196 ft²",,342,750000
1,BANDAR BARU SRI PETALING,01/06/2023,"✕✕. ✕✕, JALAN PERLAK 3",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,29/05/2023,"✕✕ ✕, JALAN 12/149L",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2½,,"3,197 ft²",,188,600000
3,BANDAR BARU SRI PETALING,25/05/2023,"✕✕. ✕✕✕, JALAN PASAI",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,22/05/2023,"✕✕, JALAN SRI PETALING 5",SEMI-D,LEASEHOLD,2½,,"4,801 ft²",,250,1200000
...,...,...,...,...,...,...,...,...,...,...,...
76771,LAMAN BAYU,30/10/2012,"✕✕, JALAN SERI BAYU LAMAN BAYU",BUNGALOW,FREEHOLD,3½,8.0,"4,673 ft²","4,297 ft²",698,3260000
76772,LAMAN BAYU,25/10/2012,"✕✕, 22/38A",BUNGALOW,FREEHOLD,3½,8.0,"4,692 ft²","4,297 ft²",682,3200000
76773,LAMAN BAYU,15/08/2012,"✕✕, LAMAN BAYU",BUNGALOW,FREEHOLD,3,5.0,"4,514 ft²","3,050 ft²",695,3138880
76774,LAMAN BAYU,18/04/2011,"✕, JALAN SERI BAYU",BUNGALOW,FREEHOLD,3,8.0,"4,514 ft²","3,050 ft²",640,2888880


In [20]:
# Summarise every column's dtype and non-null count to spot columns that need
# type conversion or missing-value handling.
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76776 entries, 0 to 76775
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   project_name   76776 non-null  object 
 1   spa_date       76776 non-null  object 
 2   address        76769 non-null  object 
 3   building_type  76776 non-null  object 
 4   tenure         76776 non-null  object 
 5   floors         76776 non-null  object 
 6   rooms          70285 non-null  float64
 7   land_area      76776 non-null  object 
 8   built_up       71163 non-null  object 
 9   price_psf      76776 non-null  object 
 10  price          76776 non-null  object 
dtypes: float64(1), object(10)
memory usage: 6.4+ MB


At a glance:
1. `project_name` column contains strings. No missing values.
2. `spa_date` is detected as strings instead of date, need to change to date type. No missing values. 
3. `address` column contains strings. There are missing values.
4. `building_type` column contains strings. No missing values.
5. `tenure` column contains strings. No missing values.
6. `floors` is detected as strings instead of float, need to change to float type. No missing values.
7. `rooms` is automatically detected as float. There are missing values.
8. `land_area` is detected as strings instead of float, need to change to float type. No missing values.
9. `built_up` is detected as strings instead of float, need to change to float type. There are missing values.
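10. `price_psf` is detected as strings instead of float, need to change to float type. No missing values.
11. `price` is detected as strings instead of float, need to change to float type. No missing values.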

In [21]:
# Parse the day/month/year strings in `spa_date` into pandas datetimes,
# then show the converted column to confirm the new dtype.
spa_date_parsed = pd.to_datetime(df_transactions['spa_date'], format='%d/%m/%Y')
df_transactions['spa_date'] = spa_date_parsed
df_transactions['spa_date']

0       2023-06-09
1       2023-06-01
2       2023-05-29
3       2023-05-25
4       2023-05-22
           ...    
76771   2012-10-30
76772   2012-10-25
76773   2012-08-15
76774   2011-04-18
76775   2011-03-08
Name: spa_date, Length: 76776, dtype: datetime64[ns]

The data type of the `spa_date` column has been changed to datetime type.

In [22]:
# Build the null mask once: it drives both the percentage and the row preview.
address_missing = df_transactions['address'].isnull()
print(f"% missing values: {address_missing.sum() / len(df_transactions) * 100}")
df_transactions[address_missing]

% missing values: 0.00911743253099927


Unnamed: 0,project_name,spa_date,address,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
18211,ALAM DAMAI,1999-02-15,,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,3.0,"1,647 ft²","1,087 ft²",107,175750
27824,TAMAN SRI HARTAMAS,1996-01-10,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,2,3.0,"1,650 ft²","1,671 ft²",262,432000
70435,KAMPUNG BATU MUDA,2000-06-06,,BUNGALOW,LEASEHOLD,1,3.0,"4,004 ft²",834 ft²,30,120000
71295,LUCKY GARDEN (JALAN PUCHONG),1997-04-21,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,1,3.0,"1,754 ft²",869 ft²,114,200000
71296,LUCKY GARDEN (JALAN PUCHONG),1997-04-21,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,1,3.0,"1,775 ft²",869 ft²,113,200000
74962,SRI BINTANG HEIGHTS,1998-04-29,,TERRACE HOUSE - INTERMEDIATE,FREEHOLD,2,3.0,"1,787 ft²","1,323 ft²",123,220000
75638,LAMAN RESIDENCE,2008-09-15,,SEMI-D,LEASEHOLD,2,4.0,"3,057 ft²","2,485 ft²",212,648000


We can drop these rows for the following reasons:
1. Transactions with a missing address account for only 0.009% of the entire dataset
2. The majority of the transactions with a missing address (five out of seven) are dated before the year 2000

In [23]:
# Keep only the transactions that have an address (original index labels are
# preserved). `.copy()` makes the result an independent frame, so assigning
# columns on it afterwards does not raise SettingWithCopyWarning the way
# assigning to a boolean-filtered slice does under classic pandas semantics.
df_transactions = df_transactions.dropna(subset=['address']).copy()

# Guard the invariant this cell is meant to establish.
assert df_transactions['address'].notna().all()

df_transactions

Unnamed: 0,project_name,spa_date,address,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,2023-06-09,"✕✕✕, JALAN PIKRAMA",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,"2,196 ft²",,342,750000
1,BANDAR BARU SRI PETALING,2023-06-01,"✕✕. ✕✕, JALAN PERLAK 3",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,2023-05-29,"✕✕ ✕, JALAN 12/149L",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2½,,"3,197 ft²",,188,600000
3,BANDAR BARU SRI PETALING,2023-05-25,"✕✕. ✕✕✕, JALAN PASAI",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,2023-05-22,"✕✕, JALAN SRI PETALING 5",SEMI-D,LEASEHOLD,2½,,"4,801 ft²",,250,1200000
...,...,...,...,...,...,...,...,...,...,...,...
76771,LAMAN BAYU,2012-10-30,"✕✕, JALAN SERI BAYU LAMAN BAYU",BUNGALOW,FREEHOLD,3½,8.0,"4,673 ft²","4,297 ft²",698,3260000
76772,LAMAN BAYU,2012-10-25,"✕✕, 22/38A",BUNGALOW,FREEHOLD,3½,8.0,"4,692 ft²","4,297 ft²",682,3200000
76773,LAMAN BAYU,2012-08-15,"✕✕, LAMAN BAYU",BUNGALOW,FREEHOLD,3,5.0,"4,514 ft²","3,050 ft²",695,3138880
76774,LAMAN BAYU,2011-04-18,"✕, JALAN SERI BAYU",BUNGALOW,FREEHOLD,3,8.0,"4,514 ft²","3,050 ft²",640,2888880


We dropped the rows with missing address. The dataset has 76769 rows left.

In [25]:
# Count transactions per building type; value_counts() lists the most
# frequent category first.
df_transactions['building_type'].value_counts()

building_type
TERRACE HOUSE - INTERMEDIATE    47860
BUNGALOW                         7870
SEMI-D                           5823
TOWN HOUSE                       4983
TERRACE HOUSE - CORNER LOT       3946
TERRACE HOUSE - END LOT          2966
CLUSTER HOUSE - INTERMEDIATE     2588
CLUSTER HOUSE                     506
CLUSTER HOUSE - CORNER LOT        128
CLUSTER HOUSE - END LOT            99
Name: count, dtype: int64

Findings:
1. Majority of the transactions are intermediate terrace houses
2. Surprisingly, there are no apartments or condominiums in this dataset, thus another round of scraping is inevitable

In [30]:
# Tally tenure categories once, then reuse the tally for both the freehold
# percentage and the displayed breakdown.
tenure_counts = df_transactions['tenure'].value_counts()
print(f"% Freehold: {tenure_counts['FREEHOLD'] / len(df_transactions) * 100}")
tenure_counts

% Freehold: 60.112805950318496


tenure
FREEHOLD     46148
LEASEHOLD    30621
Name: count, dtype: int64

Findings:
1. Majority of the transactions are freehold (60.11%)
2. Whether sampling is needed will be decided once the scraping of apartments and condominiums is completed

In [31]:
# Tally each distinct raw `floors` entry (the column is still object dtype and
# includes half-storey values such as '2½') before any numeric conversion.
df_transactions['floors'].value_counts()

floors
2     44128
1     14703
3     11788
2½     3059
0      1175
4       967
3½      264
1½      256
6       154
5       150
8        41
4½       18
9        17
20       15
10       11
7         7
11        4
13        2
15        2
99        2
24        2
46        1
12        1
35        1
21        1
Name: count, dtype: int64