In [17]:
import pandas as pd
import numpy as np

In [18]:
# Load the raw Hostplus export. header=None keeps every row as data, because the
# later cells locate section headers and column-header rows by their text.
# The relative path uses a forward slash so it resolves on Windows, macOS and Linux
# (a backslash is only a path separator on Windows).
hostplus = pd.read_csv('Data/hostplus.csv', header=None, encoding='latin1')

In [19]:
# 0. Fund name and option name are read from the first two rows of column 0
fund_name, option_name = hostplus.iloc[0, 0], hostplus.iloc[1, 0]

# 1. Locate the header row of every asset class section
# "Cash" is found by its own label; every other section header is taken to be
# the row directly below a "Total" row.
cash_idx = hostplus.index[hostplus[0].eq("Cash")].tolist()[0]
total_idx = hostplus.index[hostplus[0].eq("Total")].tolist()
ac_idx = [cash_idx, *(row + 1 for row in total_idx)]

# Candidate headers at or beyond the "Total Investment Items" row are discarded
total_inv_idx = hostplus.index[hostplus[0].eq("Total Investment Items")][0]
ac_idx = [row for row in ac_idx if row < total_inv_idx]
ac_names = hostplus.iloc[ac_idx, 0].tolist()

In [20]:
# Sanity check: display the asset class names found at the rows in ac_idx
ac_names

['Cash',
 'Fixed Income',
 'Listed Equity',
 'Unlisted Equity',
 'Unlisted Property',
 'Unlisted Property',
 'Unlisted Infrastructure',
 'Unlisted Alternatives']

In [21]:
# 2. Internal / external label for each asset class section
# Rows in column 0 whose text contains "internal" (case-insensitive) are candidate label rows.
int_row = hostplus.index[hostplus[0].str.contains("internal", case=False, na=False)].tolist()

# A candidate only counts as a section label when it sits at most this many rows
# below a section header.
MAX_LABEL_OFFSET = 3

# Every section defaults to 1 (external); matched sections are switched to 0 (internal).
label_map = [1] * len(ac_idx)
header_rows = np.asarray(ac_idx)
# 2.1. Match each "internal" row to the section header it follows
for label_idx in int_row:
    offsets = label_idx - header_rows  # broadcasts against every header row
    # Only headers at or above the label row qualify, so a row near the end of one
    # section can never flag the *next* section as internal.
    matches = np.where((offsets >= 0) & (offsets <= MAX_LABEL_OFFSET))[0]
    if matches.size == 0:
        # Not a section label (e.g. a holding whose name contains "internal"): ignore it
        continue
    label_map[matches[0]] = 0   # 0 for internal

In [22]:
# 3. Slice the sheet into one sub-table per asset class
# Every section except the last runs from its own header row up to (not including)
# the next section's header row.
subtables = [
    hostplus.iloc[first_row:next_header].copy()
    for first_row, next_header in zip(ac_idx[:-1], ac_idx[1:])
]
# The last section has no following header, so it stops at total_inv_idx instead
subtables.append(hostplus.iloc[ac_idx[-1]:total_inv_idx].copy())



In [23]:
# check: inspect the slice stored at position 7 of subtables
subtables[7]

Unnamed: 0,0,1,2,3,4
2524,Unlisted Alternatives,,,,
2525,Investment in non?associated entities;,,,,
2526,Held directly or by associated entities or by ...,,,,
2527,Externally managed,,,,
2528,Name of Fund Manager,,,Value (AUD),Weighting (%)
2529,Apollo Global Management,,,149083208,0.18%
2530,Apostle Funds Management,,,30928531,0.04%
2531,Athora Holding Ltd,,,291245109,0.36%
2532,Blackstone and Co,,,982965499,1.20%
2533,Blue Ocean Equities,,,5624850,0.01%


In [24]:
def _tidy_section(section, int_ext_label):
    """Turn one raw section slice into a tidy table.

    The asset class name is taken from the slice's first cell. Everything above the
    first row whose column-0 text contains "name" (case-insensitive) is discarded,
    that row becomes the header, and two constant columns ('Asset Class Name' and
    'Int/Ext') are appended before all-NaN columns are dropped.
    """
    # 3.1. The section's first cell holds the asset class name
    asset_class = section.iloc[0, 0]
    # 3.2. Work with positions counted from 0
    section = section.reset_index(drop=True)

    # 3.3. The table proper starts at the first row mentioning "name"
    is_header = section[0].str.contains("name", case=False, na=False)
    header_pos = section.index[is_header][0]
    body = section.iloc[header_pos:]
    # 3.4. Promote that row to column labels, then drop it from the data
    body.columns = body.iloc[0]
    body = body[1:].reset_index(drop=True)

    # 3.5. Constant columns: asset class name and Int/Ext label
    body['Asset Class Name'] = asset_class
    body['Int/Ext'] = int_ext_label
    # 3.6. Remove columns that contain no data at all
    return body.dropna(axis=1, how='all')


# 3.7. Replace each raw slice in subtables with its cleaned version
for i_table, table in enumerate(subtables):
    subtables[i_table] = _tidy_section(table, label_map[i_table])


In [25]:
# check: inspect the cleaned table stored at position 2 of subtables
subtables[2]

1,Name/kind of investment item,Security Identifier,Units held,Value (AUD),Weighting (%),Asset Class Name,Int/Ext
0,3I GROUP PLC ORD,B1YW440,596231,42983246,0.05%,Listed Equity,1
1,3M CO COM,2595708,5430,1132142,0.00%,Listed Equity,1
2,3PEAK INC-A,CNE1000042T2 (ISIN),16607,337952,0.00%,Listed Equity,1
3,A P MOLLER - MAERSK A/S-CL A 'A'DKK1000,4253059,34,88815,0.00%,Listed Equity,1
4,A2 MILK CO LTD ATM,BWSRTS7,8555117,49448576,0.06%,Listed Equity,1
...,...,...,...,...,...,...,...
2302,ZOZO INC,B292RC1,1710,86082,0.00%,Listed Equity,1
2303,ZSCALER INC,BZ00V34,1051,306226,0.00%,Listed Equity,1
2304,ZTE CORP-A,BD5CPY0,532023,4728585,0.01%,Listed Equity,1
2305,ZURICH INSURANCE GROUP AG,5983816,1404,1348218,0.00%,Listed Equity,1


In [26]:
# Collect the distinct column labels used by any sub-table, in sorted order
all_cols = sorted({label for frame in subtables for label in frame.columns.tolist()})
all_cols

['% of property held',
 'Address',
 'Asset Class Name',
 'Currency',
 'Int/Ext',
 'Name of Fund Manager',
 'Name of Institution',
 'Name/kind of investment item',
 'Security Identifier',
 'Units held',
 'Value (AUD)',
 'Weighting (%)']

In [27]:
# Standardised label shared by every "name" style source column
_ITEM_NAME_COL = "Name/Kind of Investment Item"

# Source column label -> standardised column label
col_map = {
    **dict.fromkeys(
        ("Name of Fund Manager", "Name of Institution", "Name/kind of investment item"),
        _ITEM_NAME_COL,
    ),
    "Security Identifier": "Stock ID",
    "Units held": "Units Held",
    "% of property held": "% Ownership",
}

### TODO: check that each of these columns exists in the DataFrame and that every
### column is legitimate (a list of legitimate columns is to be provided)
# Final column order of the combined output
all_cols_order = (
    ["Effective Date", "Fund Name", "Option Name"]
    + ["Asset Class Name", "Int/Ext", _ITEM_NAME_COL]
    + ["Currency", "Stock ID", "Listed Country"]
    + ["Units Held", "% Ownership", "Address"]
    + ["Value (AUD)", "Weighting (%)"]
)

In [28]:
# Rename the columns of every sub-table according to col_map so that equivalent
# columns share one label
df_renamed = [section.rename(col_map, axis="columns") for section in subtables]

In [29]:
# check: inspect the renamed table stored at position 4 of df_renamed
df_renamed[4]

3,Name/Kind of Investment Item,Address,% Ownership,Value (AUD),Weighting (%),Asset Class Name,Int/Ext
0,Hostplus Commercial Trust,"Levels 1, 9 & 10, 270 Adelaide Street, Brisba...",100.00%,3695178.0,0.00%,Unlisted Property,0
1,Hostplus Residential PropertyTrust,"173 -175 Phillip Street, Sydney NSW",90.00%,,0.00%,Unlisted Property,0
2,Hostplus Residential PropertyTrust,"54 Bracks St, North Fremantle WA",60.00%,,0.00%,Unlisted Property,0
3,Hostplus Residential PropertyTrust,"64 Peel Street & 9 Cordella St, South Brisbane...",60.00%,,0.00%,Unlisted Property,0
4,Hostplus Residential PropertyTrust,"Helensvale, Helensvale QLD",50.00%,,0.00%,Unlisted Property,0
5,Hostplus Residential PropertyTrust,"Moonee Valley, Moonee Valley VIC",72.00%,,0.00%,Unlisted Property,0
6,Hostplus Residential PropertyTrust,,100.00%,71406391.0,0.09%,Unlisted Property,0
7,Total,,,75101569.0,0.09%,Unlisted Property,0


In [33]:
# Stack all renamed sub-tables into one DataFrame; join='outer' keeps the union of
# their columns, leaving NaN where a sub-table does not have a given column
combined_df = pd.concat(df_renamed, ignore_index=True, join='outer')
# Add the constant metadata columns (moved into their final position by the reorder below)
combined_df['Effective Date'] = '2024-12-31'  ### need to think of a way to make this dynamic without info in the original file
combined_df['Fund Name'] = fund_name
combined_df['Option Name'] = option_name  # Assuming a fixed option name for all entries
combined_df['Listed Country'] = None  ### Placeholder for Listed Country, to be extracted later
# Reorder columns to all_cols_order. reindex is used instead of combined_df[all_cols_order]
# so that a listed column which no sub-table supplied is created as an all-NaN column
# rather than raising KeyError; columns not listed in all_cols_order are dropped.
combined_df = combined_df.reindex(columns=all_cols_order)

In [34]:
# Relabel rows whose item name is exactly "Total" as "Sub Total"
item_col = 'Name/Kind of Investment Item'
combined_df[item_col] = combined_df[item_col].replace({'Total': 'Sub Total'})

In [35]:
# Remove % sign and convert to float for Weighting (%)
# Values are converted to text first because the .str accessor returns NaN for
# non-string elements and raises AttributeError on a float column. With the text
# conversion, numeric values survive and re-running this cell is harmless.
weighting_raw = combined_df['Weighting (%)']
weighting_text = weighting_raw.astype(str).str.replace('%', '', regex=False)
# Restore genuine missing values (astype(str) turned them into text) before the float cast
combined_df['Weighting (%)'] = weighting_text.where(weighting_raw.notna()).astype(float)

In [40]:
# Remove thousands separators and convert Value (AUD) to float
# Values are converted to text first because the .str accessor returns NaN for
# non-string elements and raises AttributeError on a float column. With the text
# conversion, numeric values survive and re-running this cell is harmless.
value_raw = combined_df['Value (AUD)']
value_text = value_raw.astype(str).str.replace(',', '', regex=False)  # Remove commas for conversion
# Restore genuine missing values (astype(str) turned them into text) before the float cast
combined_df['Value (AUD)'] = value_text.where(value_raw.notna()).astype(float)

In [42]:
# Save the combined table to CSV without the row index.
# The relative path uses a forward slash so it resolves on Windows, macOS and Linux
# (a backslash is only a path separator on Windows).
combined_df.to_csv('Data/hostplus_cleaned.csv', index=False)