# Data Extraction

## DOSM Vehicle Registration Data

https://data.gov.my/data-catalogue/registration_transactions_car

In [3]:
import pandas as pd
import os

# Get the current working directory
cwd = os.getcwd()

# Append /data to cwd and define the target directory
TARGET_DIR = os.path.join(cwd, 'data')
os.makedirs(TARGET_DIR, exist_ok=True)

# Define URLs and corresponding file names
datasets = {
    'cars_2022.parquet': 'https://storage.data.gov.my/transportation/cars_2022.parquet',
    'cars_2023.parquet': 'https://storage.data.gov.my/transportation/cars_2023.parquet',
    'cars_2024.parquet': 'https://storage.data.gov.my/transportation/cars_2024.parquet',
    'cars_2025.parquet': 'https://storage.data.gov.my/transportation/cars_2025.parquet'
}

# Loop through each dataset
for file_name, url in datasets.items():
    print(f"Processing {file_name}...")
    target_path = os.path.join(TARGET_DIR, file_name)

    # Read the parquet file from URL
    df = pd.read_parquet(url)

    # Convert 'date_reg' to datetime if it exists
    if 'date_reg' in df.columns:
        df['date_reg'] = pd.to_datetime(df['date_reg'], errors='coerce', format='%Y-%m-%d')

    # Save to local parquet file
    df.to_parquet(target_path, engine='fastparquet', index=True)
    print(f"Saved: {target_path}")

print("\nAll files processed and saved.")


Processing cars_2022.parquet...
Saved: /Users/gary/Documents/Playground/automotive-bursa/data/cars_2022.parquet
Processing cars_2023.parquet...
Saved: /Users/gary/Documents/Playground/automotive-bursa/data/cars_2023.parquet
Processing cars_2024.parquet...
Saved: /Users/gary/Documents/Playground/automotive-bursa/data/cars_2024.parquet
Processing cars_2025.parquet...
Saved: /Users/gary/Documents/Playground/automotive-bursa/data/cars_2025.parquet

All files processed and saved.


## Bursa Automotive Sector Companies (Quarterly Results)

1. DRB-HICOM Berhad (1619)
2. Sime UMW (4588)
3. Bermaz Auto Berhad (5248)
4. Sime Darby (4197)
5. Tan Chong Motor Holdings (4405)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import os

# Setup headless Selenium Chrome driver
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(), options=options)

# List of stock codes and names
stocks = {
    "1619": "DRB-HICOM Berhad",
    "4588": "Sime UMW",
    "5248": "Bermaz Auto Berhad",
    "4197": "Sime Darby",
    "4405": "Tan Chong Motor Holdings"
}

# Base URL
base_url = "https://klse.i3investor.com/web/stock/financial-quarter/"

# Output folder
output_folder = "data/quarterly_financials"
os.makedirs(output_folder, exist_ok=True)

# Iterate through each stock
for code, name in stocks.items():
    url = base_url + code
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    table = soup.find("table", {"id": "dttable-fin-quarter"})
    
    clean_name = name.replace(" ", "_").replace("&", "and")  # Safe for filenames
    
    if table:
        try:
            df = pd.read_html(str(table))[0]
            
            # Save as CSV
            csv_path = os.path.join(output_folder, f"{clean_name}.csv")
            df.to_csv(csv_path, index=False)
            
            # Save as Parquet
            parquet_path = os.path.join(output_folder, f"{clean_name}.parquet")
            df.to_parquet(parquet_path, index=False)
            
            print(f"Saved {name} to CSV and Parquet.")
        except ValueError:
            print(f"{name}: Table found but failed to parse.")
    else:
        print(f"{name}: No table found.")

driver.quit()


  df = pd.read_html(str(table))[0]


Saved DRB-HICOM Berhad to CSV and Parquet.


  df = pd.read_html(str(table))[0]


Saved Sime UMW to CSV and Parquet.


  df = pd.read_html(str(table))[0]


Saved Cycle & Carriage Bintang Berhad to CSV and Parquet.


  df = pd.read_html(str(table))[0]


Saved Bermaz Auto Berhad to CSV and Parquet.


  df = pd.read_html(str(table))[0]


Saved Sime Darby to CSV and Parquet.


  df = pd.read_html(str(table))[0]


Saved Tan Chong Motor Holdings to CSV and Parquet.


## Bursa Automotive Sector Companies (Stock Prices)

1. DRB-HICOM Berhad (1619)
2. Sime UMW (4588)
3. Bermaz Auto Berhad (5248)
4. Sime Darby (4197)
5. Tan Chong Motor Holdings (4405)

In [None]:
import yfinance as yf
import pandas as pd
import os

# Define output folder
output_folder = "data/stock_prices"
os.makedirs(output_folder, exist_ok=True)

# Ticker mapping
companies = {
    "DRB-HICOM Berhad": "1619.KL",
    "Sime UMW": "4588.KL",
    "Bermaz Auto Berhad": "5248.KL",
    "Sime Darby": "4197.KL",
    "Tan Chong Motor Holdings": "4405.KL"
}

# Date range
start_date = "2023-01-01"
end_date = "2025-12-31"

# Download and save
for name, ticker in companies.items():
    try:
        df = yf.download(ticker, start=start_date, end=end_date, progress=False)
        if not df.empty:
            clean_name = name.replace(" ", "_").replace("&", "and")

            # Save as CSV
            df.to_csv(os.path.join(output_folder, f"{clean_name}.csv"))

            # Save as Parquet
            df.to_parquet(os.path.join(output_folder, f"{clean_name}.parquet"))

            print(f"{name} data saved.")
        else:
            print(f"No data found for {name} ({ticker})")
    except Exception as e:
        print(f"Error for {name} ({ticker}): {e}")



1 Failed download:
['4588.KL']: YFTzMissingError('possibly delisted; no timezone found')


DRB-HICOM Berhad data saved.
No data found for Sime UMW (4588.KL)
Bermaz Auto Berhad data saved.
Sime Darby data saved.
Tan Chong Motor Holdings data saved.


### SIME UMW delisted since Feb 2025
- Downloaded Manually through investing.com (https://www.investing.com/equities/umw-holdings-bhd-historical-data)