In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw HESTA Balanced Growth holdings export into a DataFrame.
# The relative path uses forward slashes so it resolves on Windows, macOS and
# Linux alike; the earlier 'Data\\...' spelling only worked on Windows.
# NOTE(review): latin1 decoding is presumably needed because the export is not UTF-8 - confirm.
hesta = pd.read_csv('Data/HESTA - Balanced-Growth-super-assets.csv', encoding='latin1')

In [3]:
# Column renames that bring the raw HESTA headers in line with the
# all_cols_order template below. Each entry reads: original name -> new name.
col_map = {
    "Option": "Option Name",
    "Asset Class": "Asset Class Name",
    "Internal/External": "Int/Ext",
    "Name/kind of investment item": "Name/Kind of Investment Item",
    "Security Identifier": "Stock ID",
    "% Ownership / Property Held": "% Ownership",
    "Units": "Units Held",
    "Weighting": "Weighting (%)",
    # Entries for columns that are absent from this particular file are allowed,
    # so a single mapping dict can be shared across all options.
    "Address": "Address",
}

# Target column layout of the cleaned output, in the order the columns must appear.
all_cols_order = [
    'Effective Date',
    'Fund Name',
    'Option Name',
    'Asset Class Name',
    'Int/Ext',
    'Name/Kind of Investment Item',
    'Currency',
    'Stock ID',
    'Listed Country',
    'Units Held',
    '% Ownership',
    'Address',
    'Value (AUD)',
    'Weighting (%)',
]

In [4]:
# Rename the raw HESTA columns according to col_map so they carry the template
# names; the result is a new frame and `hesta` itself is left as loaded.
asset_table = hesta.rename(col_map, axis='columns')

In [5]:
# Encode Int/Ext as a numeric flag: 0 = internal, 1 = external.
# A row is internal when either its asset class name or its original Int/Ext
# value mentions "internal" (case-insensitive). Every other row - including
# rows whose Int/Ext cell is empty - is treated as external.
# The test is applied per column with Series.map rather than with a row-wise
# DataFrame.apply(axis=1), which has to materialise every row as a Series.
# NOTE: the cell overwrites its own input column, so run it once per load;
# a second run no longer sees the original Int/Ext text.
def _mentions_internal(value) -> bool:
    """Return True when the text form of `value` contains 'internal' (any case)."""
    # str() renders missing values as 'nan' / 'None', neither of which matches
    return 'internal' in str(value).lower()

internal_by_asset_class = asset_table['Asset Class Name'].map(_mentions_internal).astype(bool)
internal_by_flag = asset_table['Int/Ext'].map(_mentions_internal).astype(bool)

# Invert the combined "is internal" mask so that internal -> 0 and external -> 1
asset_table['Int/Ext'] = (~(internal_by_asset_class | internal_by_flag)).astype('int64')


In [6]:
# Sanity check: display the rows whose asset class mentions "fixed income" (any case)
is_fixed_income = asset_table['Asset Class Name'].str.contains('fixed income', case=False, na=False)
asset_table.loc[is_fixed_income]

Unnamed: 0,Effective Date,Option Name,Asset Class Name,Int/Ext,Name/Kind of Investment Item,Units Held,Value (AUD),Weighting (%),% Ownership,Currency,Stock ID
136,31/12/2024,Balanced Growth,Fixed Income,1,BlackRock Inc,,9.958635e+08,0.014370,,,
137,31/12/2024,Balanced Growth,Fixed Income,1,Challenger Limited,,1.136033e+08,0.001639,,,
138,31/12/2024,Balanced Growth,Fixed Income,1,GoldenTree AM,,1.757064e+08,0.002535,,,
139,31/12/2024,Balanced Growth,Fixed Income,1,IFM Investors Pty Ltd,,1.901253e+08,0.002743,,,
140,31/12/2024,Balanced Growth,Fixed Income,1,JPMorgan Chase & Co,,2.216765e+09,0.031986,,,
...,...,...,...,...,...,...,...,...,...,...,...
207,31/12/2024,Balanced Growth,Fixed Income,0,Western Australian Treasury Corp,,5.256532e+07,0.000758,,,
208,31/12/2024,Balanced Growth,Fixed Income,0,Westpac Banking Corp,,2.068320e+07,0.000298,,,
209,31/12/2024,Balanced Growth,Fixed Income,0,Woolworths Group Ltd,,2.865198e+07,0.000413,,,
210,31/12/2024,Balanced Growth,Fixed Income External Total,1,,,7.431149e+09,0.107226,,,


In [7]:
# Rows whose asset class contains "Total" are sub-total lines, not holdings.
# They are relabelled "Sub Total" in the item column and take over the asset
# class of the holdings listed before them.
# na=False keeps rows with a missing asset class out of the mask; without it the
# mask would contain NaN and could not be used for boolean indexing with .loc.
mask = asset_table['Asset Class Name'].str.contains("Total", na=False)
# Move "Total" to Name/Kind of Investment Item, and rename it to "Sub Total"
asset_table.loc[mask, 'Name/Kind of Investment Item'] = "Sub Total"
# Blank out the "... Total" labels and forward-fill, so each sub-total row
# receives the closest preceding non-total asset class. A fixed shift(2) would
# copy whatever sits two rows up, which is the wrong label when an asset class
# has a single holding or when several total rows follow one another.
asset_class_filled = asset_table['Asset Class Name'].mask(mask).ffill()
asset_table.loc[mask, 'Asset Class Name'] = asset_class_filled[mask]

In [8]:
# Sanity check after the sub-total step: display the "fixed income" rows again (any case)
is_fixed_income = asset_table['Asset Class Name'].str.contains('fixed income', case=False, na=False)
asset_table.loc[is_fixed_income]

Unnamed: 0,Effective Date,Option Name,Asset Class Name,Int/Ext,Name/Kind of Investment Item,Units Held,Value (AUD),Weighting (%),% Ownership,Currency,Stock ID
136,31/12/2024,Balanced Growth,Fixed Income,1,BlackRock Inc,,9.958635e+08,0.014370,,,
137,31/12/2024,Balanced Growth,Fixed Income,1,Challenger Limited,,1.136033e+08,0.001639,,,
138,31/12/2024,Balanced Growth,Fixed Income,1,GoldenTree AM,,1.757064e+08,0.002535,,,
139,31/12/2024,Balanced Growth,Fixed Income,1,IFM Investors Pty Ltd,,1.901253e+08,0.002743,,,
140,31/12/2024,Balanced Growth,Fixed Income,1,JPMorgan Chase & Co,,2.216765e+09,0.031986,,,
...,...,...,...,...,...,...,...,...,...,...,...
207,31/12/2024,Balanced Growth,Fixed Income,0,Western Australian Treasury Corp,,5.256532e+07,0.000758,,,
208,31/12/2024,Balanced Growth,Fixed Income,0,Westpac Banking Corp,,2.068320e+07,0.000298,,,
209,31/12/2024,Balanced Growth,Fixed Income,0,Woolworths Group Ltd,,2.865198e+07,0.000413,,,
210,31/12/2024,Balanced Growth,Fixed Income,1,Sub Total,,7.431149e+09,0.107226,,,


In [9]:
# Express the weighting as a percentage by scaling the column by 100.
# NOTE: the column is overwritten in place, so running this cell again
# scales the values a second time.
weighting_as_float = asset_table['Weighting (%)'].astype(float)
asset_table['Weighting (%)'] = weighting_as_float.mul(100)

In [10]:
# Add the Fund Name, Address and Listed Country columns required by the template
asset_table['Fund Name'] = 'HESTA'  # same value on every row
asset_table['Address'] = None  # placeholder until real addresses are available

# Listed Country is the upper-cased first two characters of Stock ID
stock_id_prefix = asset_table['Stock ID'].str.slice(stop=2)
asset_table['Listed Country'] = stock_id_prefix.str.upper()

In [11]:
# Keep only the template columns, arranged in the all_cols_order sequence
asset_table = asset_table.loc[:, all_cols_order]

In [12]:
# Export the cleaned table to CSV without the DataFrame index.
# The path uses a forward slash so the file lands inside the Data directory on
# every OS; with 'Data\\...' a non-Windows system treats the backslash as part
# of the file name instead of as a directory separator.
asset_table.to_csv('Data/hesta_cleaned.csv', index=False)