# Transactions - Data Integration

In [1]:
import os
from typing import Union

import joblib
import numpy as np
import pandas as pd
import skops.io as sio
from tqdm import tqdm

import helpers
from helpers import (
    CHARTS_DIR, ENCODER_MODEL_DIR, RAW_DATA_DIR, TRANSFORMED_DATA_DIR
)

import plotly.express as px
import plotly.graph_objects as go

## Load imputed data and decode encoding

In [2]:
# Location of the saved one-hot encoder used to decode the imputed data below.
encoder_path = ENCODER_MODEL_DIR / 'one_hot_encoder_township_building_type_tenure.joblib'

# Fail fast with a specific error type (still an Exception subclass) when the
# encoder file is missing, rather than raising a bare Exception.
if not os.path.exists(encoder_path):
    raise FileNotFoundError(f'Encoder not found at {encoder_path}')

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load encoder files that were produced by this project.
ohe = joblib.load(encoder_path)

In [None]:
# Read the imputed checkpoint and pass it straight through the encoder's
# inverse transform so the one-hot columns are decoded again.
df_imputed = ohe.inverse_transform(
    pd.read_parquet(
        TRANSFORMED_DATA_DIR / 'transactions_KL_ckpt5_multi_imputed_bayesianridge.parquet'
    )
)

In [8]:
# Target dtype for each decoded column, keyed by column name.
type_conversion = {
    # categorical descriptors
    'township': 'category',
    'building_type': 'category',
    'tenure': 'category',
    # discrete counts
    'floors': 'int',
    'rooms': 'int',
    # continuous measurements and prices
    'land_area': 'float',
    'built_up': 'float',
    'price_psf': 'float',
    'price': 'float',
    # transaction date parts
    'year': 'int',
    'month': 'int',
    'day': 'int',
}

# Cast all listed columns in a single call; columns are matched by name.
# NOTE(review): casting float values to 'int' truncates toward zero rather than
# rounding - confirm the imputed values were already rounded upstream.
df_imputed = df_imputed.astype(type_conversion)

df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265270 entries, 0 to 265269
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   township       265270 non-null  category
 1   building_type  265270 non-null  category
 2   tenure         265270 non-null  category
 3   floors         265270 non-null  int32   
 4   rooms          265270 non-null  int32   
 5   land_area      265270 non-null  float64 
 6   built_up       265270 non-null  float64 
 7   price_psf      265270 non-null  float64 
 8   price          265270 non-null  float64 
 9   year           265270 non-null  int32   
 10  month          265270 non-null  int32   
 11  day            265270 non-null  int32   
dtypes: category(3), float64(4), int32(5)
memory usage: 14.2 MB


## Load economic indicators for integration

In [9]:
# BNM OPR history (raw Excel workbook); preview the first five rows.
df_opr = pd.read_excel(
    RAW_DATA_DIR / 'bnm-opr_historical-2004-2023.xlsx',
)
df_opr.head(n=5)

Unnamed: 0,date,opr_change_percentage,new_opr_percentage
0,06 Jul 2023,0.0,3.0
1,03 May 2023,0.25,3.0
2,09 Mar 2023,0.0,2.75
3,19 Jan 2023,0.0,2.75
4,03 Nov 2022,0.25,2.75


In [10]:
# DOSM CPI series (raw CSV); preview the first five rows.
df_cpi = pd.read_csv(
    RAW_DATA_DIR / 'dosm-cpi-2010-2023.csv',
)
df_cpi.head(n=5)

Unnamed: 0,date,overall,food_beverage,alcohol_tobacco,clothing_footwear,housing_utilities,furnishings,health,transport,communication,recreation_culture,education,hospitality,misc
0,2010-01-01,99.4,99.0,98.3,101.2,99.4,100.0,99.4,98.9,100.3,100.2,99.5,98.7,99.9
1,2010-02-01,99.4,99.0,98.3,100.1,99.6,99.9,99.4,99.0,100.3,100.2,99.6,98.9,99.6
2,2010-03-01,99.4,99.0,98.4,100.0,99.6,100.0,99.3,99.0,100.2,100.2,99.8,99.3,99.6
3,2010-04-01,99.4,99.1,98.4,100.5,99.6,100.0,99.7,99.0,99.7,100.1,99.9,99.5,99.5
4,2010-05-01,99.6,99.5,98.4,100.3,99.8,99.1,99.8,99.0,99.7,100.0,99.9,99.6,99.7


In [11]:
# DOSM households and income by state (raw CSV); preview the first five rows.
df_income = pd.read_csv(
    RAW_DATA_DIR / 'dosm-households_and_income_by_state-1984-2022.csv',
)
df_income.head(n=5)

Unnamed: 0,date,state,n_households,median,mean,gini
0,1970-01-01,Malaysia,,,264.0,0.5129
1,1970-01-01,Johor,,,237.0,
2,1970-01-01,Kedah,,,189.0,
3,1970-01-01,Kelantan,,,151.0,
4,1970-01-01,Melaka,,,265.0,


In [12]:
# DOSM Malaysian wellbeing index (raw CSV); preview the first five rows.
df_wellbeing = pd.read_csv(
    RAW_DATA_DIR / 'dosm-malaysian_wellbeing_index-2000-2021.csv',
)
df_wellbeing.head(n=5)

Unnamed: 0,Component,Sub Component,Year,Index
0,Economic Well-being,Total,2000,100.0
1,Economic Well-being,Total,2001,100.2
2,Economic Well-being,Total,2002,101.5
3,Economic Well-being,Total,2003,104.2
4,Economic Well-being,Total,2004,105.8


In [13]:
# DOSM total money supply (raw CSV); preview the first five rows.
df_money = pd.read_csv(
    RAW_DATA_DIR / 'dosm-total_money_supply-2013-2021.csv',
)
df_money.head(n=5)

Unnamed: 0,Year,Monetary Aggregat,Money Supply,Category,RM Million
0,2013,M1,Narrow money supply,Currency in circulation,62980.6
1,2014,M1,Narrow money supply,Currency in circulation,68170.0
2,2015,M1,Narrow money supply,Currency in circulation,76869.1
3,2016,M1,Narrow money supply,Currency in circulation,85682.4
4,2017,M1,Narrow money supply,Currency in circulation,92607.6


In [14]:
# DOSM unemployment rate file (raw CSV); preview the first five rows.
df_unemployment = pd.read_csv(
    RAW_DATA_DIR / 'dosm-unemployment_rate_KL-1982-2021.csv',
)
df_unemployment.head(n=5)

Unnamed: 0,date,lf_size,employed,unemployed,outside,u_rate,p_rate,ep_ratio
0,1982-01-01,5431.4,5249.0,182.4,2944.6,3.4,64.8,62.67
1,1983-01-01,5671.8,5457.0,214.9,2969.4,3.8,65.6,63.15
2,1984-01-01,5862.5,5566.7,295.8,3119.6,5.0,65.3,61.98
3,1985-01-01,5990.1,5653.4,336.8,3124.9,5.6,65.7,62.02
4,1986-01-01,6222.1,5760.1,461.9,3188.3,7.4,66.1,61.21


In [15]:
# DOSM population workbook: skip the first sheet row, then give the two
# unnamed leading columns proper names and tag the head-count columns with
# their unit ('000).
df_population = (
    pd.read_excel(RAW_DATA_DIR / 'dosm-population_KL-2016-2020.xlsx', skiprows=1)
    .rename(
        columns={
            'Unnamed: 0': 'Year',
            'Unnamed: 1': 'State',
            'Female': "Female ('000)",
            'Male': "Male ('000)",
            'Total': "Total ('000)",
        }
    )
)
df_population.head(n=5)

Unnamed: 0,Year,State,Administrative District,Female ('000),Male ('000),Total ('000)
0,2016,W.P. Kuala Lumpur,N.A,890.1,956.3,1846.4
1,2017,W.P. Kuala Lumpur,N.A,898.4,965.3,1863.7
2,2018,W.P. Kuala Lumpur,N.A,906.3,973.9,1880.2
3,2019,W.P. Kuala Lumpur,N.A,913.8,982.0,1895.8
4,2020,W.P. Kuala Lumpur,N.A,921.0,989.7,1910.7


In [16]:
# DOSM crime counts from the 'Transformed' sheet, aggregated to yearly totals:
# forward-fill the blank 'Year' cells and cast them to int, drop the
# 'Crime Type' label column, then sum all remaining columns per year.
df_crime = (
    pd.read_excel(
        RAW_DATA_DIR / 'dosm-number_of_crime_by_state-2015-2018.xlsx',
        sheet_name='Transformed',
    )
    .assign(Year=lambda frame: frame['Year'].ffill().astype(int))
    .drop(columns='Crime Type')
    .groupby('Year')
    .sum()
)
df_crime

Unnamed: 0_level_0,Brickfields,Cheras,Dang Wangi,Sentul,Wangsa Maju,Total
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015,597,564,1001,892,592,3646
2016,609,458,1061,228,845,3201
2017,579,472,1352,251,718,3372
2018,418,279,1354,208,578,2837


The data period of the economic indicators:
- BNM OPR: 2004-2023
- DOSM CPI: 2010-2023
- DOSM households and income: 1970-2022
- DOSM wellbeing index: 2000-2021
- DOSM total money supply: 2013-2021
- DOSM unemployment rate: 1982-2021
- DOSM population: 2016-2020
- DOSM crime rate: 2015-2018

The data periods of the economic indicators are not consistent. Although the transaction data starts from 1909 and a longer data period would be preferable, the economic indicators are not available for the whole period, so a start year and an end year must be selected. The number of transactions from 1957 to 2020 is 247,541, while the number of transactions from 2000 to 2020 is 212,548. The 2000-2020 data therefore makes up 86% of the 1957-2020 data, which is an acceptable share given that the economic indicators mostly cover the 2000s onward.

Combined with the significance of the millennium (year 2000), the data period of the economic indicators will be limited to 2000-2020. The prediction period will be 2021-2023.

In [17]:
# Keep only transactions whose year lies in the 2000-2020 study window
# (both bounds inclusive).
df_transactions = df_imputed.query(
    '2000 <= year <= 2020'
)
df_transactions

Unnamed: 0,township,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price,year,month,day
213,BANDAR BARU SRI PETALING,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,3,753.0,1299.778874,133.0,100000.0,2020,12,23
214,BANDAR BARU SRI PETALING,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,3,807.0,1279.650813,335.0,270000.0,2020,12,23
215,BANDAR BARU SRI PETALING,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,3,2196.0,1375.836875,322.0,708000.0,2020,12,21
216,BANDAR BARU SRI PETALING,TERRACE HOUSE - CORNER LOT,LEASEHOLD,1,3,3993.0,1334.898917,279.0,1115000.0,2020,12,21
217,BANDAR BARU SRI PETALING,TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,3,1765.0,1371.199828,595.0,1050000.0,2020,12,21
...,...,...,...,...,...,...,...,...,...,...,...,...
265263,THE VISTANA CONDO,CONDOMINIUM,FREEHOLD,1,2,1000.0,1000.000000,460.0,460000.0,2015,12,15
265264,DESA VIEW TOWER,APARTMENT,FREEHOLD,1,4,898.0,898.000000,98.0,88000.0,2001,8,8
265266,IDAMAN PUTERI,CONDOMINIUM,FREEHOLD,1,3,1454.0,1454.000000,150.0,218025.0,2005,1,10
265267,KELAB LE CHATEAU II,CONDOMINIUM,FREEHOLD,1,3,593.0,593.000000,194.0,115000.0,2008,2,25


## Extrapolating economic indicators

Because the economic indicators do not cover the full data period, they need to be extrapolated, as noted in the United Nations working paper titled "A weighted extrapolation method for measuring the SDGs progress" (Nia, 2017). Based on the literature found, there are two main methods:
1. Time-weighted extrapolation (Nia, 2017)
2. Spline extrapolation (International Monetary Fund, 2023)

In this study, we use the time-weighted extrapolation proposed by Nia (2017), even though Spline extrapolation was used in the Malaysian context by the IMF in a 2023 press release. The reasons are:
1. The press release did not specify the implementation details of the Spline extrapolation, which makes it difficult to replicate the results.
2. The press release states: "We applied Spline extrapolation to quarterly data to obtain monthly figures." However, deriving monthly figures (denser) from quarterly data (sparser) is interpolation, not extrapolation.
3. Nia (2017) included its implementation details in the working paper, which makes it easier to replicate the results.