#### Import libraries & define functions

In [10]:
from openpyxl import Workbook, load_workbook
import pandas as pd

# Financial-statement item names that get_rows_dict() searches for.
# Only the keys are consulted in this notebook; every key starts with its
# own (unshared) empty list.
dict_fs_items = {
    fs_item_name: []
    for fs_item_name in (
        'Revenue',
        'Operational expenditure',
        'Adjustments',
        'Change in reserve account',
        'Capital expenditure',
        'Funding',
        'Require balance and DSRA movement',
        'Shareholder loan',
        'Equity',
        'Cash balance',
        'Operating profit',
        'Depreciation & Amortization',
        'Interest (expense)/ income and decommissioning',
        'Tax expense',
        'Distributions',
        'Retained earning balance',
    )
}

# Source notebook: transform_data.ipynb

def get_cols_dict(
        df, 
        row_index = 3
        ) -> dict:
    """ 
    Returns dictionary of datetime columns and their column indexes.

    Scans the row of ``df`` at position ``row_index`` and groups the
    positional column indexes by the datetime value found in each cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan; the row is selected positionally with ``df.iloc``.
    row_index : int, default 3
        Position of the row that holds the date headers.

    Returns
    -------
    dict
        ``{datetime_value: [column_position, ...]}``. Keys appear in the
        order the dates are first met; each list is in ascending column
        order. Cells that are not datetimes (or are NaT) are ignored.
    """
    # Function-scope import so the function does not rely on a file-level
    # import that this notebook does not have.
    from datetime import datetime

    dict_column_indexes = {}

    for column_index, value in enumerate(df.iloc[row_index]):
        # isinstance() instead of comparing the type *name*: it also accepts
        # datetime subclasses such as pandas.Timestamp, which the name check
        # silently skipped. NaT is excluded explicitly because it is a
        # missing-value marker, not a usable date.
        if isinstance(value, datetime) and not pd.isna(value):
            dict_column_indexes.setdefault(value, []).append(column_index)

    return dict_column_indexes

def get_rows_dict(df, 
                  col_index = 1,
                  fs_items = None
                  ) -> dict:
    """ 
    Returns dictionary of FS items and their row indexes.

    Walks down one column of ``df`` and, for every cell whose value is a
    recognised FS item, records the index label of that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    col_index : default 1
        Label of the column holding the FS item names (with
        ``header=None`` the labels are the column positions).
    fs_items : container, optional
        Names to look for; anything supporting ``in`` works (dict, set,
        list, ...). When omitted, the module-level ``dict_fs_items`` is
        used, which is the original behaviour.

    Returns
    -------
    dict
        ``{fs_item: [row_index_label, ...]}`` in order of first appearance;
        each list keeps the row order of ``df``.
    """
    if fs_items is None:
        # Resolved at call time so later edits to the global are honoured.
        fs_items = dict_fs_items

    dict_rows = {}

    # Iterate only the column of interest; iterrows() would build a full
    # Series for every row just to read one cell from it.
    for row_index, fs_item in df[col_index].items():
        if fs_item in fs_items:
            dict_rows.setdefault(fs_item, []).append(row_index)

    return dict_rows

def create_output_df(df, dict_rows, dict_cols, file_name ='testing_model', sub_item_col_index = 2) -> pd.DataFrame:
    """
    Returns output dataframe containing FS data to be used in later analysis.

    Produces one output row per (FS item row, date column) cell of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; cells are read positionally with ``df.iloc``.
    dict_rows : dict
        ``{fs_item: [row_position, ...]}``, e.g. from ``get_rows_dict``.
    dict_cols : dict
        ``{date: [column_position, ...]}``, e.g. from ``get_cols_dict``.
    file_name : str, default 'testing_model'
        Written unchanged into the ``entity`` column of every row.
    sub_item_col_index : int, default 2
        Position of the column whose value is written to ``fs_sub_item``.

    Returns
    -------
    pandas.DataFrame
        Columns ``entity``, ``fs_item``, ``fs_sub_item``, ``date``,
        ``value``. Rows whose ``value`` is missing are dropped; the index is
        not reset afterwards, so it may contain gaps.
    """

    dict_output = {'entity':[],
                'fs_item':[],
                'fs_sub_item':[],
                'date':[],
                'value':[]}

    # Loop nesting fixes the output order: FS item, then date, then the
    # individual rows and columns registered for that item/date.
    for row_name, row_indexes in dict_rows.items():
        for col_name, col_indexes in dict_cols.items():
            for row_index in row_indexes:
                # The sub-item label depends only on the row; read it once.
                sub_item = df.iloc[row_index, sub_item_col_index]
                for col_index in col_indexes:
                    dict_output['entity'].append(file_name)
                    dict_output['fs_item'].append(row_name)
                    dict_output['fs_sub_item'].append(sub_item)
                    dict_output['date'].append(col_name)
                    dict_output['value'].append(df.iloc[row_index, col_index])

    # Non-inplace dropna: returns a new frame instead of mutating one that
    # was just built.
    return pd.DataFrame(dict_output).dropna(subset=['value'])

#### Processing of a single input Excel file

In [8]:
# Specify the path to your Excel file
excel_file = 'data/exported_gpc_model_1.xlsx'

# Load the first sheet into a Pandas dataframe. header=None keeps every
# sheet row as data, so rows and columns are addressed by position.
df = pd.read_excel(excel_file, sheet_name = 0, index_col= None, header = None)

# Forward-fill column B so every row carries the most recent value seen in
# that column. Series.ffill() replaces fillna(method='ffill'), whose
# `method` argument is deprecated in pandas 2.1+.
df[1] = df[1].ffill()

# Locate the FS item rows and the date columns, then flatten them into a
# long table and write it out.
dict_rows = get_rows_dict(df)
dict_cols = get_cols_dict(df)
df_output = create_output_df(df, dict_rows, dict_cols)
df_output.to_excel('output.xlsx', index=False)


#### Processing of all Excel files in the data folder

In [12]:
import os

folder_path = 'data'

# Column layout of the combined output; used as-is when no file matches.
dict_output = {'entity':[],
            'fs_item':[],
            'fs_sub_item':[],
            'date':[],
            'value':[]}

# One frame per processed workbook. They are concatenated once after the
# loop: concatenating inside the loop re-copies all earlier rows on every
# iteration, and starting from an empty frame triggers a pandas
# deprecation warning.
list_df_outputs = []

# Loop through files in the folder. os.listdir() returns names in arbitrary
# order, so sort them to make the row order of the output reproducible.
for file_name in sorted(os.listdir(folder_path)):
    if file_name.startswith('exported') and file_name.endswith('.xlsx'):

        print(f'Processing file {file_name}')

        # Construct the full file path
        file_path = os.path.join(folder_path, file_name)

        # Load the first sheet into a Pandas dataframe (no header row)
        df = pd.read_excel(file_path, sheet_name=0, index_col=None, header = None)

        # Forward-fill column B. Series.ffill() replaces
        # fillna(method='ffill'), deprecated in pandas 2.1+.
        df[1] = df[1].ffill()

        dict_rows = get_rows_dict(df)
        dict_cols = get_cols_dict(df)
        # The file name is recorded in the `entity` column of every row.
        list_df_outputs.append(create_output_df(df, dict_rows, dict_cols, file_name))

if list_df_outputs:
    df_output = pd.concat(list_df_outputs, ignore_index=True)
else:
    # pd.concat([]) raises; fall back to an empty frame with the same columns.
    df_output = pd.DataFrame(dict_output)

df_output.to_excel('output.xlsx', index=False)

Processing file exported_gpc_model_2.xlsx
Processing file exported_abrakadabra_model_2 copy.xlsx
Processing file exported_gpc_model_1.xlsx


# Notes

Example of formatting a date key as day.month.year

In [124]:
# Take the first date key collected by get_cols_dict() and show it twice:
# in its default representation and formatted as day.month.year.
keys = [*dict_cols]
first_date = keys[0]
print(first_date)
formatted_string = first_date.strftime("%d.%m.%Y")
print(formatted_string)

2022-12-31 00:00:00
31.12.2022
