In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw Equip workbook. header=None keeps every sheet row (title, table
# caption, table header) as data so the cells below can locate them by position.
# A forward-slash relative path is used because a backslash is a path separator
# only on Windows; elsewhere it would be read as part of the file name.
equip = pd.read_excel('Data/equip-balanced-growth-super.xlsx', header=None)

In [3]:
# The option name is the part of the sheet's first cell that follows the first
# "for " and precedes the next " (".
title_cell = equip.iloc[0, 0]
text_after_for = title_cell.split("for ", 1)[1]
option_name = text_after_for.split(" (", 1)[0]

In [4]:
# Find the grid position of the "Table 1" caption in the raw sheet.

def _mentions_table1(cells):
    """Return True when any cell in `cells`, rendered as text, contains "Table 1"."""
    return cells.astype(str).str.contains("Table 1").any()

# Row: index of the first row holding a matching cell
# (IndexError if no row matches).
rows_with_caption = equip.apply(_mentions_table1, axis=1)
table1_row = equip.index[rows_with_caption][0]

# Column: first column holding a matching cell, or None when none matches.
table1_col = next(
    (col for col in equip.columns if _mentions_table1(equip[col])),
    None,
)

(table1_row, table1_col)

(2, 0)

In [5]:
# The table's column-header row sits two rows below the "Table 1" caption;
# the data rows follow it.
table_start_row = table1_row + 2

# The table ends at the first blank cell of that header row. The row is read
# through table_start_row (not a fixed row number) so the boundary keeps
# following the caption if the sheet layout shifts.
header_cells = equip.iloc[table_start_row]
blank_header_cols = header_cells.index[header_cells.isna()]
# With no blank header cell, the table runs to the last column of the sheet.
table_end_col = blank_header_cols[0] if len(blank_header_cols) else equip.shape[1]

In [6]:
# Carve the table out of the sheet: from the header row downwards, and only
# the columns left of the table's right-hand boundary.
table_rows = slice(table_start_row, None)
table_cols = slice(None, table_end_col)
asset_table = equip.iloc[table_rows, table_cols]

In [8]:
# Promote the first row of the slice to column headers (whitespace-trimmed),
# then drop that row from the data.
asset_table.columns = asset_table.iloc[0].str.strip()
# .copy() detaches the result from the sliced frame, so the column assignments
# made in later cells write to an independent DataFrame instead of triggering
# pandas' SettingWithCopyWarning.
asset_table = asset_table[1:].copy()

In [9]:
# Prefix each asset class with its listing type when one is present, giving
# "<Listing type> <Asset class>"; a label that ends up empty becomes NaN.
listing_part = asset_table['Listing type'].fillna('')
class_part = asset_table['Asset class'].fillna('')
combined_label = (listing_part + ' ' + class_part).str.strip()  # trim stray spaces
asset_table['Asset class'] = combined_label.replace('', np.nan)

# The listing type is now part of 'Asset class', so its own column is dropped.
asset_table = asset_table.drop(columns=['Listing type'])

In [10]:
# Encode Type as a 0/1 integer flag: 0 where the value is 'Internal',
# 1 for anything else (missing values included).
asset_table['Type'] = asset_table['Type'].ne('Internal').astype('int64')

In [11]:
# Display the current header names; the rename mapping defined in the next cell
# is keyed on these exact strings.
asset_table.columns

Index(['Asset class', 'Type',
       'Name of institution / issuer / counterparty / manager / investment item',
       'Currency', 'Security identifier', '% ownership', 'Address',
       'Units held', 'Value (AUD)', 'Weighting (%)'],
      dtype='object', name=4)

In [12]:
# Target layout of the cleaned output: every template column, in export order.
all_cols_order = [
    "Effective Date", "Fund Name", "Option Name",
    "Asset Class Name", "Int/Ext", "Name/Kind of Investment Item",
    "Currency", "Stock ID", "Listed Country",
    "Units Held", "% Ownership", "Address",
    "Value (AUD)", "Weighting (%)",
]

# Source header -> template header, listed only for headers whose names differ.
# Headers absent from this mapping are left as they are by a rename.
_header_renames = [
    ('Asset class', 'Asset Class Name'),
    ('Type', 'Int/Ext'),
    ('Name of institution / issuer / counterparty / manager / investment item',
     'Name/Kind of Investment Item'),
    ('Security identifier', 'Stock ID'),
    ('% ownership', '% Ownership'),
    ('Units held', 'Units Held'),
]
col_map = dict(_header_renames)

In [13]:
# Rename the source headers to their template names according to col_map
asset_table = asset_table.rename(col_map, axis='columns')

In [14]:
# Rows labelled "Total" in Asset Class Name are relabelled as "Sub Total" items
# and inherit the asset class and Int/Ext value of the row directly above them.
mask = asset_table['Asset Class Name'].eq("Total")

# Values of the row above, captured before any cell is overwritten.
asset_class_above = asset_table['Asset Class Name'].shift(1)
int_ext_above = asset_table['Int/Ext'].shift(1)

asset_table.loc[mask, 'Name/Kind of Investment Item'] = "Sub Total"
asset_table.loc[mask, 'Asset Class Name'] = asset_class_above[mask]
asset_table.loc[mask, 'Int/Ext'] = int_ext_above[mask]

In [15]:
# Remove the row(s) labelled "Total Investment Items" under Asset Class Name.
# Missing labels are dropped before np.unique: the column can hold NaN next to
# strings, and np.unique sorts its input, which raises TypeError when a float
# NaN is compared with a str.
unique_ac = np.unique(asset_table['Asset Class Name'].dropna())
if "Total Investment Items" in unique_ac:
    # .copy() yields an independent frame, so the column assignments in later
    # cells do not trigger SettingWithCopyWarning on a filtered frame.
    asset_table = asset_table[asset_table['Asset Class Name'] != "Total Investment Items"].copy()

In [16]:
# Fill in the template columns that the source table does not provide.
# NOTE: re-running this cell multiplies 'Weighting (%)' by 100 again.
asset_table['Effective Date'] = '2024-12-31'  ### need to think of a way to make this dynamic without info in the original file
asset_table['Fund Name'] = "Equip Super Fund"
asset_table['Option Name'] = option_name  # the same option name on every row
# Listed country is taken as the first 2 characters of the Stock ID
asset_table['Listed Country'] = asset_table['Stock ID'].str[:2]
# Scale the weighting by 100 to express it as a percentage. The column is
# converted to numeric first: it comes from a header-less sheet read, so it can
# be object dtype, where multiplying a text cell by 100 would repeat the text
# instead of scaling it. Cells that cannot be parsed as numbers become NaN.
asset_table['Weighting (%)'] = pd.to_numeric(asset_table['Weighting (%)'], errors='coerce') * 100

# Reorder columns to the template layout
asset_table = asset_table[all_cols_order]

In [17]:
# Save the cleaned table to CSV, without the index column.
# A forward slash keeps the relative path valid on every OS; a backslash is a
# path separator only on Windows.
asset_table.to_csv('Data/equip_cleaned.csv', index=False)