Preprocess Bermax Auto Berhad

In [66]:
import pandas as pd
import yaml

bermaz_stock_code = 5248
# Load data
cars_2022 = pd.read_parquet("data/cars_2022.parquet")  # or .csv if converted
cars_2023 = pd.read_parquet("data/cars_2023.parquet")  # or .csv if converted
cars_2024 = pd.read_parquet("data/cars_2024.parquet")  # or .csv if converted
cars_2025 = pd.read_parquet("data/cars_2025.parquet")  # or .csv if converted

# Merge the three files into a single DataFrame
cars_df = pd.concat([cars_2022, cars_2023, cars_2024, cars_2025], ignore_index=True)

bermaz_df = pd.read_csv("data/quarterly_financials/Bermaz_Auto_Berhad.csv")
bermaz_df.columns = bermaz_df.iloc[0]
# Remove the first row since it is now the header
bermaz_df = bermaz_df.iloc[1:].reset_index(drop=True)

# # load yaml file data
with open("company_brand.yaml", "r") as file:
  yaml_data = yaml.safe_load(file)
bermaz_brands = yaml_data[bermaz_stock_code]

bermaz_df

Unnamed: 0,Ann. Date,Quarter,Revenue,PBT,NP,NP to SH,NP Margin,ROE,EPS,DPS,...,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
0,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,...,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025,Financial Year: 30-Apr-2025
1,12-Mar-2025,31-Jan-2025,602080,34832,26227,24144,4.36%,3.71%,2.07,1.75,...,,,,,,,,,,
2,12-Dec-2024,31-Oct-2024,646861,57050,42711,40348,6.60%,5.41%,3.45,10.00,...,,,,,,,,,,
3,11-Sep-2024,31-Jul-2024,846180,97769,75378,70217,8.91%,9.40%,6.01,3.50,...,,,,,,,,,,
4,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,...,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024,Financial Year: 30-Apr-2024
5,11-Jun-2024,30-Apr-2024,937525,130570,99583,90224,10.62%,10.99%,7.73,11.75,...,,,,,,,,,,
6,13-Mar-2024,31-Jan-2024,896505,95069,74525,70503,8.31%,9.00%,6.04,4.25,...,,,,,,,,,,
7,12-Dec-2023,31-Oct-2023,1006951,122834,95721,90096,9.51%,11.71%,7.73,5.00,...,,,,,,,,,,
8,12-Sep-2023,31-Jul-2023,1089278,140667,107901,100219,9.91%,13.61%,8.59,5.00,...,,,,,,,,,,
9,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,...,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023,Financial Year: 30-Apr-2023


In [67]:
import itertools
import re
from datetime import datetime

def create_fiscal_quarter_mapper(financial_data_path):
    """
    Creates a function that maps dates to fiscal quarters based on a company's financial data.
    
    Args:
        financial_data_path: Path to the financial data CSV file with fiscal year markers
    
    Returns:
        A function that takes a date and returns the fiscal year and quarter
    """
    # Read the financial data
    df = pd.read_csv(financial_data_path)
    
    # Extract fiscal year information
    fiscal_years = {}
    current_fiscal_year = None
    fiscal_year_pattern = r"Financial Year: (\d{2}-[A-Za-z]+-\d{4})"
    
    for index, row in df.iterrows():
        # Check any column that might contain the fiscal year marker
        match = re.search(fiscal_year_pattern, row["Date"])
        if match:
            current_fiscal_year = datetime.strptime(match.group(1), '%d-%b-%Y')
            fiscal_years[current_fiscal_year] = []
            continue
        
        if current_fiscal_year:
            current_quarter_date = datetime.strptime(row.get('Date.1'), '%d-%b-%Y')
            
            # Calculate difference in months
            month_diff = (current_fiscal_year.year - current_quarter_date.year) * 12 + (current_fiscal_year.month - current_quarter_date.month)
            
            # Determine quarter based on month difference
            if month_diff == 0:  # Same month as fiscal year end
                qtr = 4
            elif month_diff == 3:  # 3 months before fiscal year end
                qtr = 3
            elif month_diff == 6:  # 6 months before fiscal year end
                qtr = 2
            elif month_diff == 9:  # 9 months before fiscal year end
                qtr = 1
            else:
                qtr = 0  # Error or special case
                
            # Store the quarter date with its fiscal quarter info
            quarter_info = {
                'quarter date': current_quarter_date,
                'fiscal_year': current_fiscal_year,
                'quarter': qtr
            }
            fiscal_years[current_fiscal_year].append(quarter_info)
    return fiscal_years

# Usage example
bermaz_mapper = create_fiscal_quarter_mapper('data/quarterly_financials/Bermaz_Auto_Berhad.csv')
bermaz_mapper
# list(itertools.chain(*bermaz_mapper.values()))


{datetime.datetime(2025, 4, 30, 0, 0): [{'quarter date': datetime.datetime(2025, 1, 31, 0, 0),
   'fiscal_year': datetime.datetime(2025, 4, 30, 0, 0),
   'quarter': 3},
  {'quarter date': datetime.datetime(2024, 10, 31, 0, 0),
   'fiscal_year': datetime.datetime(2025, 4, 30, 0, 0),
   'quarter': 2},
  {'quarter date': datetime.datetime(2024, 7, 31, 0, 0),
   'fiscal_year': datetime.datetime(2025, 4, 30, 0, 0),
   'quarter': 1}],
 datetime.datetime(2024, 4, 30, 0, 0): [{'quarter date': datetime.datetime(2024, 4, 30, 0, 0),
   'fiscal_year': datetime.datetime(2024, 4, 30, 0, 0),
   'quarter': 4},
  {'quarter date': datetime.datetime(2024, 1, 31, 0, 0),
   'fiscal_year': datetime.datetime(2024, 4, 30, 0, 0),
   'quarter': 3},
  {'quarter date': datetime.datetime(2023, 10, 31, 0, 0),
   'fiscal_year': datetime.datetime(2024, 4, 30, 0, 0),
   'quarter': 2},
  {'quarter date': datetime.datetime(2023, 7, 31, 0, 0),
   'fiscal_year': datetime.datetime(2024, 4, 30, 0, 0),
   'quarter': 1}],
 da

In [68]:
all_quarters = sorted(itertools.chain(*bermaz_mapper.values()), key=lambda x: x['quarter date'])
def map_fiscal_quarter(date_reg):
    for quarter_info in all_quarters:
        if quarter_info['quarter date'] >= date_reg:
            return quarter_info['quarter date']
    return None

# Map fiscal quarters to the DataFrame
cars_df['quarter'] = cars_df['date_reg'].apply(map_fiscal_quarter)

# Map distributors based on car brand
cars_df['distributor'] = cars_df['maker'].apply(lambda brand: "Bermaz" if brand in bermaz_brands else "Other")

cars_df

Unnamed: 0,date_reg,type,maker,model,colour,fuel,state,quarter,distributor
0,2022-01-01,window_van,Hyundai,Grand Starex,white,diesel,Rakan Niaga,2022-01-31,Other
1,2022-01-01,motokar,Mercedes Benz,A-Class,silver,petrol,Rakan Niaga,2022-01-31,Other
2,2022-01-01,motokar,Mercedes Benz,A-Class,white,petrol,Rakan Niaga,2022-01-31,Other
3,2022-01-01,jip,Mercedes Benz,GLB,grey,petrol,Rakan Niaga,2022-01-31,Other
4,2022-01-01,jip,Perodua,Aruz,white,petrol,Rakan Niaga,2022-01-31,Other
...,...,...,...,...,...,...,...,...,...
2696974,2025-04-30,window_van,Zeekr,009,black,electric,W.P. Kuala Lumpur,NaT,Other
2696975,2025-04-30,window_van,Zeekr,009,black,electric,W.P. Kuala Lumpur,NaT,Other
2696976,2025-04-30,window_van,Zeekr,009,black,electric,W.P. Kuala Lumpur,NaT,Other
2696977,2025-04-30,window_van,Zeekr,009,black,electric,Rakan Niaga,NaT,Other


In [69]:
quarters_to_remove = ['FY2022-Q1', 'FY2022-Q2', 'FY2022-Q3', 'FY2022-Q4', None]
distributor_to_remove = 'Other'
cars_df = cars_df[
    (~cars_df['quarter'].isin(quarters_to_remove)) & 
    (cars_df['distributor'] != distributor_to_remove)
]

# Aggregate by quarter
quarterly_registrations = cars_df.groupby(['quarter', 'distributor']).size().reset_index(name='Number of Cars')
quarterly_registrations

Unnamed: 0,quarter,distributor,Number of Cars
0,2022-01-31,Bermaz,1015
1,2022-04-30,Bermaz,4849
2,2022-07-31,Bermaz,3367
3,2022-10-31,Bermaz,3701
4,2023-01-31,Bermaz,4887
5,2023-04-30,Bermaz,5652
6,2023-07-31,Bermaz,5665
7,2023-10-31,Bermaz,5540
8,2024-01-31,Bermaz,4815
9,2024-04-30,Bermaz,5081


Clean bermaz_df row that contains "Financial Year:"

In [70]:
# Remove rows where any column in bermaz_df contains the string "Financial Year:"
bermaz_df = bermaz_df[~bermaz_df.apply(lambda row: row.astype(str).str.contains("Financial Year:").any(), axis=1)]

# Display the updated DataFrame
bermaz_df

Unnamed: 0,Ann. Date,Quarter,Revenue,PBT,NP,NP to SH,NP Margin,ROE,EPS,DPS,...,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
1,12-Mar-2025,31-Jan-2025,602080,34832,26227,24144,4.36%,3.71%,2.07,1.75,...,,,,,,,,,,
2,12-Dec-2024,31-Oct-2024,646861,57050,42711,40348,6.60%,5.41%,3.45,10.0,...,,,,,,,,,,
3,11-Sep-2024,31-Jul-2024,846180,97769,75378,70217,8.91%,9.40%,6.01,3.5,...,,,,,,,,,,
5,11-Jun-2024,30-Apr-2024,937525,130570,99583,90224,10.62%,10.99%,7.73,11.75,...,,,,,,,,,,
6,13-Mar-2024,31-Jan-2024,896505,95069,74525,70503,8.31%,9.00%,6.04,4.25,...,,,,,,,,,,
7,12-Dec-2023,31-Oct-2023,1006951,122834,95721,90096,9.51%,11.71%,7.73,5.0,...,,,,,,,,,,
8,12-Sep-2023,31-Jul-2023,1089278,140667,107901,100219,9.91%,13.61%,8.59,5.0,...,,,,,,,,,,
10,12-Jun-2023,30-Apr-2023,1072399,139533,107614,100622,10.03%,13.21%,8.63,11.0,...,,,,,,,,,,
11,13-Mar-2023,31-Jan-2023,975967,115516,92184,87288,9.45%,12.34%,7.5,4.5,...,,,,,,,,,,
12,08-Dec-2022,31-Oct-2022,782969,91589,70808,65673,9.04%,9.94%,5.65,3.5,...,,,,,,,,,,


In [71]:
# Convert bermaz_df's 'Date_Quarter' to match the format of quarterly_registrations' 'quarter'
bermaz_df['quarter'] = pd.to_datetime(bermaz_df['Quarter'], format='%d-%b-%Y')

# Merge the two DataFrames on the 'quarter' column
bermaz_df = pd.merge(
    bermaz_df,
    quarterly_registrations[['quarter', 'Number of Cars']],
    on='quarter',
    how='left'
)
# check columns that have NaN values
bermaz_df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bermaz_df['quarter'] = pd.to_datetime(bermaz_df['Quarter'], format='%d-%b-%Y')


Ann. Date               0
Quarter                 0
Revenue                 0
PBT                     0
NP                      0
NP to SH                0
NP Margin               0
ROE                     0
EPS                     0
DPS                     0
NAPS                    0
QoQ                     0
YoY                     0
Unnamed: 13_level_1    40
Unnamed: 14_level_1    40
Unnamed: 15_level_1    40
Unnamed: 16_level_1    40
Unnamed: 17_level_1    40
Unnamed: 18_level_1    40
Unnamed: 19_level_1    40
Unnamed: 20_level_1    40
Unnamed: 21_level_1    40
Unnamed: 22_level_1    40
Unnamed: 23_level_1    40
Unnamed: 24_level_1    40
Unnamed: 25_level_1    40
Unnamed: 26_level_1    40
Unnamed: 27_level_1    40
Unnamed: 28_level_1    40
Unnamed: 29_level_1    40
Unnamed: 30_level_1    40
Unnamed: 31_level_1    40
Unnamed: 32_level_1    40
Unnamed: 33_level_1    40
Unnamed: 34_level_1    40
Unnamed: 35_level_1    40
Unnamed: 36_level_1    40
Unnamed: 37_level_1    40
quarter     

Remove columns with NaN Values

In [72]:
bermaz_df = bermaz_df.loc[:, ~bermaz_df.columns.str.startswith('Unnamed')]
bermaz_df

Unnamed: 0,Ann. Date,Quarter,Revenue,PBT,NP,NP to SH,NP Margin,ROE,EPS,DPS,NAPS,QoQ,YoY,quarter,Number of Cars
0,12-Mar-2025,31-Jan-2025,602080,34832,26227,24144,4.36%,3.71%,2.07,1.75,0.5584,-40.16%,-65.75%,2025-01-31,3151.0
1,12-Dec-2024,31-Oct-2024,646861,57050,42711,40348,6.60%,5.41%,3.45,10.0,0.6374,-42.54%,-55.22%,2024-10-31,3396.0
2,11-Sep-2024,31-Jul-2024,846180,97769,75378,70217,8.91%,9.40%,6.01,3.5,0.6398,-22.17%,-29.94%,2024-07-31,4225.0
3,11-Jun-2024,30-Apr-2024,937525,130570,99583,90224,10.62%,10.99%,7.73,11.75,0.7033,27.97%,-10.33%,2024-04-30,5081.0
4,13-Mar-2024,31-Jan-2024,896505,95069,74525,70503,8.31%,9.00%,6.04,4.25,0.6708,-21.75%,-19.23%,2024-01-31,4815.0
5,12-Dec-2023,31-Oct-2023,1006951,122834,95721,90096,9.51%,11.71%,7.73,5.0,0.6604,-10.10%,37.19%,2023-10-31,5540.0
6,12-Sep-2023,31-Jul-2023,1089278,140667,107901,100219,9.91%,13.61%,8.59,5.0,0.631,-0.40%,99.80%,2023-07-31,5665.0
7,12-Jun-2023,30-Apr-2023,1072399,139533,107614,100622,10.03%,13.21%,8.63,11.0,0.6536,15.28%,27.85%,2023-04-30,5652.0
8,13-Mar-2023,31-Jan-2023,975967,115516,92184,87288,9.45%,12.34%,7.5,4.5,0.6078,32.91%,114.41%,2023-01-31,4887.0
9,08-Dec-2022,31-Oct-2022,782969,91589,70808,65673,9.04%,9.94%,5.65,3.5,0.5688,30.93%,152.21%,2022-10-31,3701.0


In [73]:
# save the final DataFrame to a CSV file
bermaz_df.to_csv('data/quarterly_financials/Bermaz_Auto_Berhad_merge_no_of_car.csv', index=False)