In [20]:
import pandas as pd
import numpy as np

In [21]:
# Load the raw Vision Super holdings sheet.
# header=None keeps every sheet row as data, so columns are addressed by integer label.
# Forward slashes resolve on both Windows and POSIX; a backslash separator is Windows-only.
vision = pd.read_excel('Data/Balanced-growth_SuperandNCAPDec_24 - Vision.xlsx', header=None)

In [22]:
# Fund option name: first non-empty cell of the second sheet row
option_name = vision.iloc[1].dropna().values[0]

# 1. Locate the header row of every asset class section.
# Column 2 holds the section labels. "Cash" is looked up directly; every other
# asset class header is taken to sit two rows below a "Total" row.
label_col = vision[2]
cash_idx = vision.index[label_col == "Cash"].tolist()[0]
total_idx = vision.index[label_col == "Total"].tolist()

# Row of the "Total Investment Items" line: nothing at or below it is an asset class
total_inv_idx = vision.index[label_col == "Total Investment Items"][0]

header_candidates = [cash_idx] + [total_row + 2 for total_row in total_idx]
ac_idx = [row for row in header_candidates if row < total_inv_idx]

# Header cells may carry extra lines after the name; keep only the first line
ac_names = vision.iloc[ac_idx, 2].tolist()
ac_names_cleaned = [raw_name.split("\n", 1)[0] for raw_name in ac_names]

In [23]:
# Inspect the asset class names found above (first line of each header cell)
ac_names_cleaned

['Cash',
 'Fixed Income Internal',
 'Fixed Income External',
 'Listed Equity',
 'Unlisted Equity Internal',
 'Unlisted Equity External',
 'Listed Property',
 'Unlisted Property External',
 'Listed Infrastructure',
 'Unlisted Infrastructure External']

In [24]:
# 2. Int/Ext flag per asset class, in the same order as ac_names_cleaned:
# 0 when the name contains "internal" (case-insensitive), 1 for every other name
label_map = [int("internal" not in name.lower()) for name in ac_names_cleaned]
label_map

[1, 0, 1, 1, 0, 1, 1, 1, 1, 1]

In [25]:
# 3. Cut the sheet into one raw sub-table per asset class.
# Every section except the last runs from its header row up to, but not including,
# the row directly above the next section's header. The last section runs up to,
# but not including, the "Total Investment Items" row.
spans = [(ac_idx[pos], ac_idx[pos + 1] - 1) for pos in range(len(ac_idx) - 1)]
spans.append((ac_idx[-1], total_inv_idx))

subtables = [
    vision.iloc[first:stop]
    .copy()
    # drop the rows, then the columns, that are completely NaN
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
    for first, stop in spans
]

In [26]:
subtables[-2]  # Check the second-to-last raw subtable (index -2, not the last one)

Unnamed: 0,2,7,8,12,13,15,20
1065,Listed Infrastructure,,,,,,
1066,Name/kind of investment item,% Ownership,Security Identifier,Units held,Currency,Value (AUD),Weighting (%)
1067,APA GROUP STAPLED SECURITY AUD 0,,6247306,395991,,2760057.025,0.0004
1068,ATLAS ARTERIA LTD STAPLED SECURITY AUD,,BZ03TZ1,593114,,2817291.271,0.0004
1069,DALRYMPLE BAY INFRASTRUCTURE LTD STAPLED SECURITY,,BNM6Z56,598914,,2156089.206,0.0003
1070,TRANSURBAN GROUP STAPLED SECURITY AUD 0,,6200882,2124915,,28452608.44,0.0039
1071,Total,,,,,36186045.942,0.005


In [27]:
# Clean every raw sub-table in place: promote its header row, tag it with the
# asset class name and Int/Ext flag, and drop columns that hold no data.
for i_table, table in enumerate(subtables):
    # Make the cell safe to re-run: a table that already carries the
    # 'Asset Class Name' column has been cleaned and no longer has the raw layout
    # (raw tables only have integer column labels).
    if 'Asset Class Name' in table.columns:
        continue

    # 3.1. The asset class name is the first line of the sub-table's top-left cell
    asset_class_name = table.iloc[0, 0].split("\n", 1)[0]
    # 3.2. Reset the index to start from 0 so row labels and positions coincide
    table = table.reset_index(drop=True)

    # 3.3. The table proper starts at the first row whose column-2 cell contains "name"
    name_row = table[2].index[table[2].str.contains("name", case=False, na=False)][0]
    table = table.iloc[name_row:]
    # 3.4. Set the first row as the header and drop it from the data.
    # rename_axis clears the columns' name, which would otherwise be the header
    # row's index label and show up as a stray number when the frame is displayed.
    table.columns = table.iloc[0]
    table = table[1:].reset_index(drop=True).rename_axis(None, axis="columns")
    table['Asset Class Name'] = asset_class_name
    # label_map is positionally aligned with subtables (both derive from ac_idx)
    table['Int/Ext'] = label_map[i_table]
    # 3.5. Delete columns that contain no data at all
    table = table.dropna(axis=1, how='all')
    # 3.6. Update the subtables list with the cleaned table
    subtables[i_table] = table

In [28]:
# Re-inspect the second-to-last subtable, now that the cleaning loop has run
subtables[-2]

1,Name/kind of investment item,Security Identifier,Units held,Value (AUD),Weighting (%),Asset Class Name,Int/Ext
0,APA GROUP STAPLED SECURITY AUD 0,6247306,395991.0,2760057.025,0.0004,Listed Infrastructure,1
1,ATLAS ARTERIA LTD STAPLED SECURITY AUD,BZ03TZ1,593114.0,2817291.271,0.0004,Listed Infrastructure,1
2,DALRYMPLE BAY INFRASTRUCTURE LTD STAPLED SECURITY,BNM6Z56,598914.0,2156089.206,0.0003,Listed Infrastructure,1
3,TRANSURBAN GROUP STAPLED SECURITY AUD 0,6200882,2124915.0,28452608.44,0.0039,Listed Infrastructure,1
4,Total,,,36186045.942,0.005,Listed Infrastructure,1


In [29]:
# Distinct column names used by any subtable, sorted for easy reading
all_cols = sorted({col for cleaned in subtables for col in cleaned.columns.tolist()})
all_cols

['% Ownership',
 'Asset Class Name',
 'Currency',
 'Int/Ext',
 'Name of Fund Manager',
 'Name of Institution',
 'Name of Issuer/Counterparty',
 'Name/kind of investment item',
 'Security Identifier',
 'Units held',
 'Value (AUD)',
 'Weighting (%)']

In [30]:
# Source header -> standardised header ("Original Name": "New Name").
# Every name-style header seen across the subtables collapses into one column.
item_name_col = "Name/Kind of Investment Item"
col_map = dict.fromkeys(
    [
        "Name of Fund Manager",
        'Name of Institution',
        'Name of Issuer/Counterparty',
        'Name/kind of investment item',
    ],
    item_name_col,
)
col_map.update({
    'Security Identifier': 'Stock ID',
    'Units held': 'Units Held',
})

# TODO: check that each column exists in the DataFrame and that every column is
# legitimate (a reference list of legitimate columns is to be provided).
# Final column order of the combined output
all_cols_order = (
    ["Effective Date", "Fund Name", "Option Name"]
    + ["Asset Class Name", "Int/Ext"]
    + [item_name_col, "Currency", "Stock ID", "Listed Country"]
    + ["Units Held", "% Ownership", "Address"]
    + ["Value (AUD)", "Weighting (%)"]
)

In [31]:
# Rename the columns of each subtable according to col_map
# (rename returns new frames; the entries of subtables are left untouched)
df_renamed = []
for cleaned in subtables:
    df_renamed.append(cleaned.rename(columns=col_map))

In [32]:
# Second-to-last subtable after the col_map renaming
df_renamed[-2]

1,Name/Kind of Investment Item,Stock ID,Units Held,Value (AUD),Weighting (%),Asset Class Name,Int/Ext
0,APA GROUP STAPLED SECURITY AUD 0,6247306,395991.0,2760057.025,0.0004,Listed Infrastructure,1
1,ATLAS ARTERIA LTD STAPLED SECURITY AUD,BZ03TZ1,593114.0,2817291.271,0.0004,Listed Infrastructure,1
2,DALRYMPLE BAY INFRASTRUCTURE LTD STAPLED SECURITY,BNM6Z56,598914.0,2156089.206,0.0003,Listed Infrastructure,1
3,TRANSURBAN GROUP STAPLED SECURITY AUD 0,6200882,2124915.0,28452608.44,0.0039,Listed Infrastructure,1
4,Total,,,36186045.942,0.005,Listed Infrastructure,1


In [33]:
# Concat all subtables into one DataFrame; the outer join keeps every column
# that appears in at least one subtable and fills the gaps with NaN
combined_df = pd.concat(df_renamed, ignore_index=True, join='outer')
# Add the columns that are not present in the source tables
combined_df['Effective Date'] = '2024-12-31'  ### TODO: make this dynamic; the date is not available in the original file
combined_df['Fund Name'] = "Vision Super Fund"
combined_df['Option Name'] = option_name  # Assuming a fixed option name for all entries
combined_df['Listed Country'] = None  ### Placeholder for Listed Country, to be extracted later
combined_df["Address"] = None  # To align with the all_cols_order format
# Reorder columns. reindex (rather than combined_df[all_cols_order]) creates any
# expected column that no subtable supplied as an all-NaN column instead of
# raising KeyError; columns not listed in all_cols_order are dropped either way.
combined_df = combined_df.reindex(columns=all_cols_order)

In [34]:
# change "Total" to "Sub Total" (only cells that are exactly 'Total' are replaced)
combined_df['Name/Kind of Investment Item'] = combined_df['Name/Kind of Investment Item'].replace('Total', 'Sub Total')
# Weighting (%) need to * 100 to be in percentage.
# Parse to numeric first: the sheet is read with header=None, so this column can be
# object dtype, and multiplying a text cell by 100 would repeat the string 100 times
# instead of scaling it. Cells that cannot be parsed as numbers become NaN.
# NOTE: this scales combined_df in place, so run the cell once per freshly built combined_df.
combined_df['Weighting (%)'] = pd.to_numeric(combined_df['Weighting (%)'], errors='coerce') * 100

In [35]:
# Write the combined table without the row index.
# Forward slashes resolve on both Windows and POSIX; a backslash separator is Windows-only.
combined_df.to_csv('Data/vision_cleaned.csv', index=False)