# Data Extraction

## DOSM Vehicle Registration Data

https://data.gov.my/data-catalogue/registration_transactions_car

In [1]:
import pandas as pd
import os

# Get the current working directory
cwd = os.getcwd()

# Append /data to cwd and define the target directory
TARGET_DIR = os.path.join(cwd, 'data/vehicle_reg_dosm')
os.makedirs(TARGET_DIR, exist_ok=True)

# Define URLs and corresponding file names
datasets = {
    'cars_2018.parquet': 'https://storage.data.gov.my/transportation/cars_2018.parquet',
    'cars_2019.parquet': 'https://storage.data.gov.my/transportation/cars_2019.parquet',
    'cars_2020.parquet': 'https://storage.data.gov.my/transportation/cars_2020.parquet',
    'cars_2021.parquet': 'https://storage.data.gov.my/transportation/cars_2021.parquet',
    'cars_2022.parquet': 'https://storage.data.gov.my/transportation/cars_2022.parquet',
    'cars_2023.parquet': 'https://storage.data.gov.my/transportation/cars_2023.parquet',
    'cars_2024.parquet': 'https://storage.data.gov.my/transportation/cars_2024.parquet',
    'cars_2025.parquet': 'https://storage.data.gov.my/transportation/cars_2025.parquet'
}

# Loop through each dataset
for file_name, url in datasets.items():
    print(f"Processing {file_name}...")
    target_path = os.path.join(TARGET_DIR, file_name)

    # Read the parquet file from URL
    df = pd.read_parquet(url)

    # Convert 'date_reg' to datetime if it exists
    if 'date_reg' in df.columns:
        df['date_reg'] = pd.to_datetime(df['date_reg'], errors='coerce', format='%Y-%m-%d')

    # Save to local parquet file
    df.to_parquet(target_path, engine='fastparquet', index=True)
    print(f"Saved: {target_path}")

print("\nAll files processed and saved.")


Processing cars_2018.parquet...
Saved: c:\Users\gooyt\Desktop\automotive-bursa\data\cars_2018.parquet
Processing cars_2019.parquet...
Saved: c:\Users\gooyt\Desktop\automotive-bursa\data\cars_2019.parquet
Processing cars_2020.parquet...
Saved: c:\Users\gooyt\Desktop\automotive-bursa\data\cars_2020.parquet
Processing cars_2021.parquet...
Saved: c:\Users\gooyt\Desktop\automotive-bursa\data\cars_2021.parquet
Processing cars_2022.parquet...
Saved: c:\Users\gooyt\Desktop\automotive-bursa\data\cars_2022.parquet
Processing cars_2023.parquet...
Saved: c:\Users\gooyt\Desktop\automotive-bursa\data\cars_2023.parquet
Processing cars_2024.parquet...
Saved: c:\Users\gooyt\Desktop\automotive-bursa\data\cars_2024.parquet
Processing cars_2025.parquet...
Saved: c:\Users\gooyt\Desktop\automotive-bursa\data\cars_2025.parquet

All files processed and saved.


## Bursa Automotive Sector Companies (Quarterly Results)

1. DRB-HICOM Berhad (1619)
2. Sime UMW (4588)
3. Bermaz Auto Berhad (5248)
4. Sime Darby (4197)
5. Tan Chong Motor Holdings (4405)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import os

# Setup headless Selenium Chrome driver
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(), options=options)

# List of stock codes and names
stocks = {
    "1619": "DRB-HICOM Berhad",
    "4588": "Sime UMW",
    "5248": "Bermaz Auto Berhad",
    "4197": "Sime Darby",
    "4405": "Tan Chong Motor Holdings"
}

# Base URL
base_url = "https://klse.i3investor.com/web/stock/financial-quarter/"

# Output folder
output_folder = "data/quarterly_financials"
os.makedirs(output_folder, exist_ok=True)

# Iterate through each stock
for code, name in stocks.items():
    url = base_url + code
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    table = soup.find("table", {"id": "dttable-fin-quarter"})
    
    clean_name = name.replace(" ", "_").replace("&", "and")  # Safe for filenames
    
    if table:
        try:
            df = pd.read_html(str(table))[0]
            
            # Save as CSV
            csv_path = os.path.join(output_folder, f"{clean_name}.csv")
            df.to_csv(csv_path, index=False)
            
            # Save as Parquet
            parquet_path = os.path.join(output_folder, f"{clean_name}.parquet")
            df.to_parquet(parquet_path, index=False)
            
            print(f"Saved {name} to CSV and Parquet.")
        except ValueError:
            print(f"{name}: Table found but failed to parse.")
    else:
        print(f"{name}: No table found.")

driver.quit()


  df = pd.read_html(str(table))[0]


Saved DRB-HICOM Berhad to CSV and Parquet.


  df = pd.read_html(str(table))[0]


Saved Sime UMW to CSV and Parquet.


  df = pd.read_html(str(table))[0]


Saved Cycle & Carriage Bintang Berhad to CSV and Parquet.


  df = pd.read_html(str(table))[0]


Saved Bermaz Auto Berhad to CSV and Parquet.


  df = pd.read_html(str(table))[0]


Saved Sime Darby to CSV and Parquet.


  df = pd.read_html(str(table))[0]


Saved Tan Chong Motor Holdings to CSV and Parquet.


## Bursa Automotive Sector Companies (Stock Prices)

1. DRB-HICOM Berhad (1619)
2. Sime UMW (4588)
3. Bermaz Auto Berhad (5248)
4. Sime Darby (4197)
5. Tan Chong Motor Holdings (4405)

In [None]:
import yfinance as yf
import pandas as pd
import os

# Define output folder
output_folder = "data/stock_prices"
os.makedirs(output_folder, exist_ok=True)

# Ticker mapping
companies = {
    "DRB-HICOM Berhad": "1619.KL",
    "Sime UMW": "4588.KL",
    "Bermaz Auto Berhad": "5248.KL",
    "Sime Darby": "4197.KL",
    "Tan Chong Motor Holdings": "4405.KL"
}

# Date range
start_date = "2023-01-01"
end_date = "2025-12-31"

# Download and save
for name, ticker in companies.items():
    try:
        df = yf.download(ticker, start=start_date, end=end_date, progress=False)
        if not df.empty:
            clean_name = name.replace(" ", "_").replace("&", "and")

            # Save as CSV
            df.to_csv(os.path.join(output_folder, f"{clean_name}.csv"))

            # Save as Parquet
            df.to_parquet(os.path.join(output_folder, f"{clean_name}.parquet"))

            print(f"{name} data saved.")
        else:
            print(f"No data found for {name} ({ticker})")
    except Exception as e:
        print(f"Error for {name} ({ticker}): {e}")



1 Failed download:
['4588.KL']: YFTzMissingError('possibly delisted; no timezone found')


DRB-HICOM Berhad data saved.
No data found for Sime UMW (4588.KL)
Bermaz Auto Berhad data saved.
Sime Darby data saved.
Tan Chong Motor Holdings data saved.


### SIME UMW delisted since Feb 2025
- Downloaded Manually through investing.com (https://www.investing.com/equities/umw-holdings-bhd-historical-data)

## Monthly Principal Labour Force Statistics
Monthly principal labour force statistics, including unemployment and participation rates.

https://open.dosm.gov.my/data-catalogue/lfs_month?visual=table

In [47]:
import pandas as pd

URL_DATA = 'https://storage.dosm.gov.my/labour/lfs_month.parquet'

cpi_month = pd.read_parquet(URL_DATA)
if 'date' in cpi_month.columns: cpi_month['date'] = pd.to_datetime(cpi_month['date'])

cpi_month

Unnamed: 0,date,lf,lf_employed,lf_unemployed,lf_outside,p_rate,ep_ratio,u_rate
0,2010-01-01,12367.9,11931.2,436.7,6927.5,64.1,61.8,3.5
1,2010-02-01,12059.8,11632.3,427.4,7225.1,62.5,60.3,3.5
2,2010-03-01,12324.2,11895.9,428.3,6993.0,63.8,61.6,3.5
3,2010-04-01,12512.0,12133.5,378.5,6894.5,64.5,62.5,3.0
4,2010-05-01,12197.6,11798.9,398.7,7186.0,62.9,60.9,3.3
...,...,...,...,...,...,...,...,...
178,2024-11-01,17293.8,16747.0,546.7,7226.7,70.5,68.3,3.2
179,2024-12-01,17168.4,16629.9,538.5,7156.8,70.6,68.4,3.1
180,2025-01-01,17218.2,16684.4,533.8,7154.8,70.6,68.5,3.1
181,2025-02-01,17266.9,16734.1,532.8,7153.1,70.7,68.5,3.1


In [48]:
cpi_month[(cpi_month['date'] >= '2017-01-01') & (cpi_month['date'] <= '2018-01-01')]

Unnamed: 0,date,lf,lf_employed,lf_unemployed,lf_outside,p_rate,ep_ratio,u_rate
84,2017-01-01,14880.9,14366.8,514.1,7090.4,67.7,65.4,3.4
85,2017-02-01,14916.7,14401.8,514.8,7086.9,67.8,65.4,3.4
86,2017-03-01,14932.5,14421.7,510.8,7121.8,67.7,65.4,3.4
87,2017-04-01,14941.5,14429.6,511.9,7122.5,67.7,65.4,3.4
88,2017-05-01,14961.9,14454.4,507.5,7120.8,67.8,65.5,3.4
89,2017-06-01,15027.6,14519.9,507.7,7141.1,67.8,65.5,3.4
90,2017-07-01,15016.4,14497.4,519.0,7150.4,67.7,65.4,3.5
91,2017-08-01,15030.2,14513.4,516.9,7138.4,67.8,65.5,3.4
92,2017-09-01,15058.8,14544.3,514.5,7122.1,67.9,65.6,3.4
93,2017-10-01,15090.4,14581.7,508.8,7117.7,68.0,65.7,3.4


In [None]:
cpi_month.to_parquet('data/lfs_month.parquet', index=False)

## Prices: Consumer Prices (Cars Monthly CPI)

Monthly CPI by Subclass

https://open.dosm.gov.my/data-catalogue/cpi_5d

In [6]:
import pandas as pd

URL_DATA = 'https://storage.dosm.gov.my/cpi/cpi_5d.parquet'

df = pd.read_parquet(URL_DATA)
if 'date' in df.columns: df['date'] = pd.to_datetime(df['date'])

df

Unnamed: 0,date,subclass,index
0,2010-01-01,01111,99.9
1,2010-01-01,01112,100.2
2,2010-01-01,01113,99.5
3,2010-01-01,01114,100.2
4,2010-01-01,01115,99.9
...,...,...,...
33115,2025-04-01,13211,279.0
33116,2025-04-01,13220,129.9
33117,2025-04-01,13291,84.8
33118,2025-04-01,13301,134.1


In [9]:
# Filter rows where subclass is '07111'
cars_cpi = df[df['subclass'] == '07111'].copy()

# Add new column
cars_cpi['subclass name'] = 'Cars'
cars_cpi

Unnamed: 0,date,subclass,index,subclass name
112,2010-01-01,07111,100.0,Cars
292,2010-02-01,07111,100.1,Cars
472,2010-03-01,07111,100.1,Cars
652,2010-04-01,07111,100.1,Cars
832,2010-05-01,07111,100.1,Cars
...,...,...,...,...
32332,2024-12-01,07111,94.7,Cars
32512,2025-01-01,07111,95.9,Cars
32692,2025-02-01,07111,96.8,Cars
32872,2025-03-01,07111,96.8,Cars


In [46]:
cars_cpi[(cars_cpi['date'] >= '2017-01-01') & (cars_cpi['date'] <= '2018-01-01')]

Unnamed: 0,date,subclass,index,subclass name
15232,2017-01-01,7111,98.2,Cars
15412,2017-02-01,7111,98.2,Cars
15592,2017-03-01,7111,98.6,Cars
15772,2017-04-01,7111,98.6,Cars
15952,2017-05-01,7111,98.6,Cars
16132,2017-06-01,7111,98.6,Cars
16312,2017-07-01,7111,98.6,Cars
16492,2017-08-01,7111,98.6,Cars
16672,2017-09-01,7111,98.6,Cars
16852,2017-10-01,7111,98.6,Cars


In [10]:
cars_cpi.to_parquet('data/cars_cpi_month.parquet', index=False)

## BNM Interest Rate Manual Download

https://www.bnm.gov.my/monetary-stability/opr-decisions

In [36]:
import pandas as pd
from io import StringIO

# Raw input text
raw_data = """
Date	Change in OPR (%)	New OPR Level (%)	Monetary Policy Statement
08 May 2025	0	3.00	View Statement
06 Mar 2025	0	3.00	View Statement
22 Jan 2025	0	3.00	View Statement
06 Nov 2024	0	3.00	View Statement
05 Sep 2024	0	3.00	View Statement
11 Jul 2024	0	3.00	View Statement
09 May 2024	0	3.00	View Statement
07 Mar 2024	0	3.00	View Statement
24 Jan 2024	0	3.00	View Statement
02 Nov 2023	0	3.00	View Statement
07 Sep 2023	0	3.00	View Statement
06 Jul 2023	0	3.00	View Statement
03 May 2023	+0.25	3.00	View Statement
09 Mar 2023	0	2.75	View Statement
19 Jan 2023	0	2.75	View Statement
03 Nov 2022	+0.25	2.75	View Statement
08 Sep 2022	+0.25	2.50	View Statement
06 Jul 2022	+0.25	2.25	View Statement
11 May 2022	+0.25	2.00	View Statement
03 Mar 2022	0	1.75	View Statement
20 Jan 2022	0	1.75	View Statement
03 Nov 2021	0	1.75	View Statement
09 Sep 2021	0	1.75	View Statement
08 Jul 2021	0	1.75	View Statement
06 May 2021	0	1.75	View Statement
04 Mar 2021	0	1.75	View Statement
20 Jan 2021	0	1.75	View Statement
03 Nov 2020	0	1.75	View Statement
10 Sep 2020	0	1.75	View Statement
07 Jul 2020	-0.25	1.75	View Statement
05 May 2020	-0.5	2.00	View Statement
03 Mar 2020	-0.25	2.50	View Statement
22 Jan 2020	-0.25	2.75	View Statement
05 Nov 2019	0	3.00	View Statement
12 Sep 2019	0	3.00	View Statement
09 Jul 2019	0	3.00	View Statement
07 May 2019	-0.25	3.00	View Statement
05 Mar 2019	0	3.25	View Statement
24 Jan 2019	0	3.25	View Statement
08 Nov 2018	0	3.25	View Statement
05 Sep 2018	0	3.25	View Statement
11 Jul 2018	0	3.25	View Statement
10 May 2018	0	3.25	View Statement
07 Mar 2018	0	3.25	View Statement
25 Jan 2018	+0.25	3.25	View Statement
"""

# Read the data into a DataFrame
df = pd.read_csv(StringIO(raw_data), sep='\t')

# Drop unnecessary columns and convert date
df_clean = df[['Date', 'New OPR Level (%)']].copy()
df_clean.columns = ['date', 'opr']
df_clean.head()


Unnamed: 0,date,opr
0,08 May 2025,3.0
1,06 Mar 2025,3.0
2,22 Jan 2025,3.0
3,06 Nov 2024,3.0
4,05 Sep 2024,3.0


In [37]:
df_clean['date'] = pd.to_datetime(df_clean['date'], format='%d %b %Y', dayfirst=True)
df_clean.sort_values('date', inplace=True)

# Create monthly date range from earliest to latest date
monthly_index = pd.date_range(start=df_clean['date'].min(), end=df_clean['date'].max(), freq='MS')

# Set the announcement dates as index to prepare for reindexing
df_clean.set_index('date', inplace=True)

# Reindex to include all months and forward fill OPR values
monthly_opr = df_clean.reindex(monthly_index, method='ffill').reset_index()
monthly_opr.columns = ['date', 'opr']

monthly_opr

Unnamed: 0,date,opr
0,2018-02-01,3.25
1,2018-03-01,3.25
2,2018-04-01,3.25
3,2018-05-01,3.25
4,2018-06-01,3.25
...,...,...
83,2025-01-01,3.00
84,2025-02-01,3.00
85,2025-03-01,3.00
86,2025-04-01,3.00


In [38]:
# Manually fill 2018-01-01 OPR as 3.00 in monthly_opr

row = pd.DataFrame([{'date': pd.Timestamp('2018-01-01'), 'opr': 3.00}])
monthly_opr = pd.concat([row, monthly_opr], ignore_index=True).sort_values('date').reset_index(drop=True)
monthly_opr.head()

Unnamed: 0,date,opr
0,2018-01-01,3.0
1,2018-02-01,3.25
2,2018-03-01,3.25
3,2018-04-01,3.25
4,2018-05-01,3.25


In [39]:
monthly_opr.to_csv('data/bnm-interest-rates.csv', index=False)