In [1]:
# %%
# %%
# %%
"""
Main Entry Point
"""
from pathlib import Path

import pandas as pd # type: ignore

import cdutils.pkey_sqlite # type: ignore
import cdutils.filtering # type: ignore
import cdutils.input_cleansing # type: ignore
import cdutils.cmo_append # type: ignore
import src.add_fields
import src.core_transform
import src.output_to_excel
from src._version import __version__
import src.output_to_excel_multiple_sheets

# def main(production_flag: bool=False):
#     if production_flag:
#         BASE_PATH = Path(r'\\00-DA1\Home\Share\Line of Business_Shared Services')
#         assert "prod" in __version__, (f"Cannot run in production mode without 'prod' in the __version__")
#     else:
#         BASE_PATH = Path('.')



# %%
# Get staging data from the daily deposit update. View dev section of documentation for more detail
INPUT_PATH = Path(r"\\00-da1\Home\Share\Data & Analytics Initiatives\Project Management\Data_Analytics\Daily_Deposit_Update\Production\output\DailyDeposit_staging.xlsx")
data = pd.read_excel(INPUT_PATH)

# Add portfolio key
data = cdutils.pkey_sqlite.add_pkey(data)

# Add int rate
data = src.add_fields.add_noteintrate(data)


# Custom list of minors (Business Deposits)
minors = [
    'CK24', # 1st Business Checking
    'CK12', # Business Checking
    'CK25', # Simple Business Checking
    'CK30', # Business Elite Money Market
    'CK19', # Business Money Market
    'CK22', # Business Premium Plus MoneyMkt
    'CK23', # Premium Business Checking
    'CK40', # Community Assoc Reserve
    'CD67', # Commercial Negotiated Rate
    'CD01', # 1 Month Business CD
    'CD07', # 3 Month Business CD
    'CD17', # 6 Month Business CD
    'CD31', # 1 Year Business CD
    'CD35', # 1 Year Business CD
    'CD37', # 18 Month Business CD
    'CD38', # 2 Year Business CD
    'CD50', # 3 Year Business CD
    'CD53', # 4 Year Business CD
    'CD59', # 5 Year Business CD
    'CD76', # 9 Month Business CD
    'CD84', # 15 Month Business CD
    'CD95', # Business <12 Month Simple CD
    'CD96', # Business >12 Month Simple CD
    'CK28', # Investment Business Checking
    'CK33', # Specialty Business Checking
    'CK34', # ICS Shadow - Business - Demand
    'SV06' # Business Select High Yield
]

# Filter to only business deposit accounts
data = cdutils.filtering.filter_to_business_deposits(data, minors)


# Add CMO
data = cdutils.cmo_append.append_cmo(data)


data_schema = {
    'noteintrate': float
}

data = cdutils.input_cleansing.enforce_schema(data, data_schema).copy()




# %%
# Exclude BCSB internal accounts
data = data[~data['ownersortname'].str.contains('BRISTOL COUNTY SAVINGS', case=False, na=False)].copy()

# %%
data

# %%







# %%
# %%
ASSETS_PATH = Path('./assets')

files = [f for f in ASSETS_PATH.iterdir() if f.is_file()]

assert len(files) == 1, f"Expected exactly 1 file in {ASSETS_PATH}, found {len(files)}."

file = files[0]
assert file.suffix == '.csv', f"Expected an excel file"

xaa_data = pd.read_csv(file, header=3)

# %%
# xaa_data.info()

#

# %%




# # %%
# xaa_data['Analyzed Charges (Pre-ECR)'] = xaa_data['Analyzed Charges (Pre-ECR)'].str.replace('[\$,]','',regex=True)
# xaa_data['Combined Result for Settlement Period (Post-ECR + Fee-Based Total)'] = xaa_data['Combined Result for Settlement Period (Post-ECR + Fee-Based Total)'].str.replace('[\$,]','',regex=True)

# %%
xaa_schema = {
    'Analyzed Charges':'float',
    'Combined Result for Settlement Period':'float',
    'Earnings Credit Rate':'float',
    'Debit Account Number':'str'
}
xaa_data = cdutils.input_cleansing.enforce_schema(xaa_data, xaa_schema)



# %%

from datetime import datetime, timedelta

def create_account_summary_alternative(xaa_data, date_col='cycle_date'):
    # Ensure date column is datetime
    xaa_data = xaa_data.copy()
    xaa_data[date_col] = pd.to_datetime(xaa_data[date_col])
    # Calculate cutoff date for 12 months
    max_date = xaa_data[date_col].max()
    cutoff_date = max_date - timedelta(days=365)
    # Add ranking column to identify latest month per account
    xaa_data['date_rank'] = (xaa_data
                            .groupby('Debit Account Number')[date_col]
                            .rank(method='dense', ascending=False))
    # Create flags for latest month and trailing 12 months
    xaa_data['is_latest_month'] = xaa_data['date_rank'] == 1
    xaa_data['is_trailing_12m'] = xaa_data[date_col] >= cutoff_date
    # Aggregate using conditional sums
    summary = (xaa_data
            .groupby('Debit Account Number')
            .agg({
                # Latest month aggregations
                'Analyzed Charges': [
                    lambda x: x[xaa_data.loc[x.index, 'is_latest_month']].sum(),
                    lambda x: x[xaa_data.loc[x.index, 'is_trailing_12m']].sum(),
                ],
                'Combined Result for Settlement Period': [
                    lambda x: x[xaa_data.loc[x.index, 'is_latest_month']].sum(),
                    lambda x: x[xaa_data.loc[x.index, 'is_trailing_12m']].sum()
                ],
                'Earnings Credit Rate': [
                    lambda x: x[xaa_data.loc[x.index, 'is_latest_month']].mean(),
                    lambda x: x[xaa_data.loc[x.index, 'is_trailing_12m']].mean()
                ],
                'Primary Officer Name': 'first',
                'Secondary Officer Name': 'first',
                'Treasury Officer Name': 'first'
            })
            .reset_index())
    # Flatten column names
    summary.columns = [
        'Debit Account Number',
        'Latest_Month_Analyzed_Charges',
        'Trailing_12M_Analyzed_Charges',
        'Latest_Month_Combined_Result',
        'Trailing_12M_Combined_Result',
        'Latest_Month_ECR',
        'Trailing_12M_Avg_ECR',
        'Primary_Officer_Name_XAA',
        'Secondary_Officer_Name_XAA',
        'Treasury_Officer_Name_XAA'
    ]
    # Reorder columns
    column_order = [
        'Debit Account Number',
        'Latest_Month_Analyzed_Charges',
        'Latest_Month_Combined_Result',
        'Trailing_12M_Analyzed_Charges',
        'Trailing_12M_Combined_Result',
        'Latest_Month_ECR',
        'Trailing_12M_Avg_ECR',
        'Primary_Officer_Name_XAA',
        'Secondary_Officer_Name_XAA',        
        'Treasury_Officer_Name_XAA'
    ]
    return summary[column_order]


# %%
summarized_xaa = create_account_summary_alternative(xaa_data, date_col='Cycle End Date')

# %%
summarized_xaa_schema = {
    'Primary_Officer_Name_XAA':'str',
    'Secondary_Officer_Name_XAA':'str',        
    'Treasury_Officer_Name_XAA':'str'
}
summarized_xaa = cdutils.input_cleansing.enforce_schema(summarized_xaa, summarized_xaa_schema)

# %%

# %%
summarized_xaa = summarized_xaa.rename(columns={
    'Debit Account Number':'acctnbr',

}).copy()

assert summarized_xaa['acctnbr'].is_unique, "Duplicates"




# %%





In [2]:
# %%
merged_data = pd.merge(data, summarized_xaa, on='acctnbr', how='left')

# %%

fill_na_column_list = [
    'Latest_Month_Analyzed_Charges',
    'Latest_Month_Combined_Result',
    'Trailing_12M_Analyzed_Charges',
    'Trailing_12M_Combined_Result',
    'Latest_Month_ECR',
    'Trailing_12M_Avg_ECR',
]
for item in fill_na_column_list:
    merged_data[item] = merged_data[item].fillna(0)

# %%




In [3]:



# Sort descending order of notebal
merged_data = merged_data.sort_values(by='notebal', ascending=False)

In [4]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6506 entries, 1424 to 3603
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   acctnbr                        6506 non-null   object        
 1   effdate                        6506 non-null   datetime64[ns]
 2   mjaccttypcd                    6506 non-null   object        
 3   product                        6506 non-null   object        
 4   notebal                        6506 non-null   float64       
 5   notemtdavgbal                  6506 non-null   float64       
 6   currmiaccttypcd                6506 non-null   object        
 7   acctofficer                    6503 non-null   object        
 8   ownersortname                  6506 non-null   object        
 9   curracctstatcd                 6506 non-null   object        
 10  contractdate                   6505 non-null   datetime64[ns]
 11  ytdavgbal          

In [None]:
merged_data

In [27]:
"""
Core Transformations
"""
from typing import Dict
from pathlib import Path

import pandas as pd # type: ignore
import numpy as np # type: ignore

def group_and_summarize(df: pd.DataFrame, group_field: pd.DataFrame, sort_field: pd.DataFrame):
    """
    Groups the df by a field and sorts in descending order by the sort field. This is used for things like the Concentration of Credit/Deposit reports

    Args:
        df (pd.DataFrame)
        group_field (str): field in df that you want to group by
        sort_field (str): field in the df that you want to sort in descending order by (based on group total)

    Returns:
        df (pd.DataFrame)

    Tests/Assertions:
    - Postively assert that sort_field is numeric
    - Validate that the group_field has no null values
    - Assert df is not None
    """
    # Asserts
    assert df is not None, "df cannot be none"
    assert pd.api.types.is_numeric_dtype(df[sort_field]), f"Error: sort_field {sort_field} must be numeric"
    assert df[group_field].notnull().all(), f"Critical error: {group_field} contains null values and we are trying to group on it"

    pieces = []

    # Compute the total for each group based on sort_field and sort groups in descending order
    group_summaries = df.groupby(group_field)[sort_field].sum().sort_values(ascending=False)

    # Loop through each group
    for grp in group_summaries.index:
        grp_df = df[df[group_field] == grp]
        pieces.append(grp_df)

        # find the top ownername
        owner_total = grp_df.groupby("ownersortname")["notebal"].sum()
        top_owner = owner_total.idxmax()

        # Take the mode of the officer field
        officer_mode = grp_df["acctofficer"].mode()
        top_officer = officer_mode.iloc[0] if len(officer_mode) > 0 else ""

        # Take the mode of cash management officer
        cmo_mode = grp_df["Cash Management Officer"].mode()
        top_cmo = cmo_mode.iloc[0] if len(cmo_mode) > 0 else ""

        # Weighted Avg Rate
        if grp_df.empty or grp_df['notebal'].sum() == 0:
            weighted_avg = float('nan')
        else:
            weighted_avg = (grp_df['noteintrate'] * grp_df['notebal']).sum() / grp_df['notebal'].sum()

        # Build summary row
        summary = {}
        for col in df.columns:
            if col == group_field:
                summary[col] = grp
            elif col == "ownersortname":
                summary[col] = top_owner
            elif col == "acctofficer":
                summary[col] = top_officer
            elif col == "Cash Management Officer":
                summary[col] = top_cmo
            elif col == 'noteintrate':
                summary[col] = weighted_avg
            elif pd.api.types.is_numeric_dtype(df[col]):
                # Sum numeric columns
                summary[col] = grp_df[col].sum()
            else:
                summary[col] = ""
        pieces.append(pd.DataFrame([summary]))

        # Append a blank row
        blank_row = {col: "" for col in df.columns}
        pieces.append(pd.DataFrame([blank_row]))

    # Concatenate all pieces into one DataFrame
    result = pd.concat(pieces, ignore_index=True)
    return result

def main_pipeline(data: pd.DataFrame) -> pd.DataFrame:
    """
    Main data pipeline 
    """
    # # Set column types
    # numeric_cols = ['noteintrate','bookbalance','notebal']
    # for col in numeric_cols:
    #     df[col] = pd.to_numeric(df[col])

    cmo_col = "Cash Management Officer"

    if cmo_col not in data.columns:
        data[cmo_col] = ""
    else:
        data[cmo_col] = data[cmo_col].fillna("")


    data = data[[
        'portfolio_key',
        'acctnbr',
        'ownersortname',
        'product',
        'acctofficer',
        'Cash Management Officer',
        'notebal',
        'noteintrate',
        'contractdate',
        '3Mo_AvgBal',
        'TTM_AvgBal',
        'Year Ago Balance',
        'TTM_DAYS_OVERDRAWN',
        'TTM_NSF_COUNT',
        # 'YTD_DAYS_OVERDRAWN',
        # 'YTD_NSF_COUNT',
        'Latest_Month_Analyzed_Charges',
        'Latest_Month_Combined_Result',
        'Trailing_12M_Analyzed_Charges',
        'Trailing_12M_Combined_Result',
        'Latest_Month_ECR',
        # 'Trailing_12M_Avg_ECR',
        # 'Primary_Officer_Name_XAA',
        # 'Secondary_Officer_Name_XAA',
        # 'Treasury_Officer_Name_XAA'
    ]].copy()


    df = group_and_summarize(data, "portfolio_key", "notebal")

    df = df.rename(columns={
        'acctnbr':'Acct No.',
        'ownersortname':'Borrower Name',
        'notebal':'Current Balance',
        'noteintrate':'Interest Rate',
        'acctofficer':'Account Officer',
        'contractdate':'Acct Open Date',
        'Latest_Month_Analyzed_Charges':'Current Mo Analyzed Fees (Pre-ECR)',
        'Latest_Month_Combined_Result':'Current Mo Net Analyzed Fees (Post-ECR)',
        'Trailing_12M_Analyzed_Charges':'TTM Analyzed Fees (Pre-ECR)',
        'Trailing_12M_Combined_Result':'TTM Net Analyzed Fees (Post-ECR)',
        'Latest_Month_ECR':'Current ECR'
    }).copy()

    return df 


    






In [28]:
# This part doesn't work. Look at noteinrate, gets weird

In [29]:
formatted_data = main_pipeline(merged_data)

In [30]:
formatted_data

Unnamed: 0,portfolio_key,Acct No.,Borrower Name,product,Account Officer,Cash Management Officer,Current Balance,Interest Rate,Acct Open Date,3Mo_AvgBal,TTM_AvgBal,Year Ago Balance,TTM_DAYS_OVERDRAWN,TTM_NSF_COUNT,Current Mo Analyzed Fees (Pre-ECR),Current Mo Net Analyzed Fees (Post-ECR),TTM Analyzed Fees (Pre-ECR),TTM Net Analyzed Fees (Post-ECR),Current ECR
0,3843,151199025,CHAVES HOLDINGS INC,Business Select High Yield,RICHARD J. CLARK,,32166763.22,0.033,2025-07-02 00:00:00,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
1,3843,151146406,RFC MFC 115 KINGMAN STREET LLC,Simple Business Checking,JEFFREY M. VIALL,,1689056.03,0.0,2025-02-10 00:00:00,17614.116667,11335.556,0.0,0,0,0.0,0.0,50.53,-41.47,0.0
2,3843,27052040,BAY STATE SEWAGE DISPOSAL INC,Business Checking,JEFFREY M. VIALL,,420203.99,0.0,2002-05-24 00:00:00,492039.61,521206.18,548577.24,0,1,426.18,-110.18,851.57,-183.79,0.85
3,3843,150452846,BAY STATE SEWAGE DISPOSAL INC,Business Checking,RICHARD J. CLARK,,180025.93,0.0,2020-04-15 00:00:00,38153.003333,27694.143333,20537.46,0,0,0.0,0.0,0.0,0.0,0.0
4,3843,27019306,BAY STATE SEWAGE DISPOSAL INC,Business Checking,JEFFREY M. VIALL,,75731.89,0.0,2014-10-24 00:00:00,127544.223333,118494.645,112118.07,0,0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13127,,,,,,,,,,,,,,,,,,,
13128,39665,29233828,ANAWAN PHARMACY LLC,Simple Business Checking,FRANK P. WILHELM,,27.15,0.0,2017-11-28 00:00:00,163.396667,246.8625,356.86,4,1,0.0,-8.0,0.0,-48.0,0.0
13129,39665,27102483,ANAWAN PHARMACY LLC,Simple Business Checking,FRANK P. WILHELM,,-729.61,0.0,2016-04-05 00:00:00,-24.116667,590.5075,1131.78,44,56,0.0,-8.0,500.0,-516.0,0.0
13130,39665,,ANAWAN PHARMACY LLC,,FRANK P. WILHELM,,-702.46,-0.0,,139.28,837.37,1488.64,48,57,0.0,-16.0,500.0,-564.0,0.0


In [35]:
formatted_data = formatted_data.rename(columns={
    'portfolio_key':'Portfolio Key',
    'product':'Product',
    '3Mo_AvgBal':'3Mo Avg Bal',
    'TTM_AvgBal':'TTM Avg Bal',
    'TTM_DAYS_OVERDRAWN':'TTM Days Overdrawn',
    'TTM_NSF_COUNT':'TTM NSF Count'
}).copy()

In [37]:


# %%
# formatted_data = src.core_transform.main_pipeline(merged_data)


# %%
# Create summary sheet

summary_data = formatted_data[~(formatted_data['Portfolio Key'] == "") & (formatted_data['Acct No.'] == "")].copy()
summary_data = summary_data[[
    'Portfolio Key',
    'Borrower Name',
    'Account Officer',
    'Cash Management Officer',
    'Current Balance',
    'Interest Rate',
    '3Mo Avg Bal',
    'TTM Avg Bal',
    'Year Ago Balance',
    'TTM Days Overdrawn',
    'TTM NSF Count',
    'Current Mo Analyzed Fees (Pre-ECR)',
    'Current Mo Net Analyzed Fees (Post-ECR)',
    'TTM Analyzed Fees (Pre-ECR)',
    'TTM Net Analyzed Fees (Post-ECR)',
    'Current ECR'
]].copy()



In [38]:
BASE_PATH = Path('.')

In [39]:
# %%
# Output to excel (raw data)
# BASE_PATH = Path('.')
OUTPUT_PATH = BASE_PATH / Path('./output/business_deposits_concentration_with_xaa.xlsx')
with pd.ExcelWriter(OUTPUT_PATH, engine="openpyxl") as writer:
    merged_data.to_excel(writer, sheet_name='Unformatted', index=False)
    formatted_data.to_excel(writer, sheet_name='Relationship Detail', index=False)
    summary_data.to_excel(writer, sheet_name='Relationship Summary', index=False)

In [40]:
"""
Output to Excel
"""

import win32com.client as win32 # type: ignore


def format_excel_file(file_path):
    excel = None
    workbook = None
    # Formatting
    try:
        excel = win32.Dispatch("Excel.Application")
        excel.Visible = False
        workbook = excel.Workbooks.Open(str(file_path.absolute()))

        unformatted = ['Unformatted']
        detail = ['Relationship Detail']
        summary = ['Relationship Summary']


        for sheet in workbook.Sheets:

            sheet.Columns.AutoFit()

            # Bold top row
            top_row = sheet.Rows(1)
            top_row.Font.Bold = True

            # Add bottom border to header row
            bottom_border = top_row.Borders(9)
            bottom_border.LineStyle = 1
            bottom_border.Weight = 2

            # Freeze top row
            sheet.Application.ActiveWindow.SplitRow = 1
            sheet.Application.ActiveWindow.FreezePanes = True
            
            if sheet.Name in unformatted:
                def format_columns():
                    sheet.Columns("K:K").NumberFormat = "mm/dd/yyyy"

                    sheet.Columns("I:I").NumberFormat = "0.00%"
                    
                    sheet.Columns("E:E").NumberFormat = "$#,##0.00"
                    sheet.Columns("F:F").NumberFormat = "$#,##0.00"
                    sheet.Columns("L:L").NumberFormat = "$#,##0.00"
                    sheet.Columns("M:M").NumberFormat = "$#,##0.00"
                    sheet.Columns("N:N").NumberFormat = "$#,##0.00"
                    sheet.Columns("O:O").NumberFormat = "$#,##0.00"
                    sheet.Columns("X:X").NumberFormat = "$#,##0.00"
                    sheet.Columns("Y:Y").NumberFormat = "$#,##0.00"
                    sheet.Columns("Z:Z").NumberFormat = "$#,##0.00"
                    sheet.Columns("AA:AA").NumberFormat = "$#,##0.00"


                format_columns()
            
            if sheet.Name in detail:
                def format_columns():
                    sheet.Columns("I:I").NumberFormat = "mm/dd/yyyy"

                    sheet.Columns("H:H").NumberFormat = "0.00%"
                    
                    sheet.Columns("G:G").NumberFormat = "$#,##0.00"
                    sheet.Columns("J:J").NumberFormat = "$#,##0.00"
                    sheet.Columns("K:K").NumberFormat = "$#,##0.00"
                    sheet.Columns("L:L").NumberFormat = "$#,##0.00"
                    sheet.Columns("O:R").NumberFormat = "$#,##0.00"


                format_columns()

            if sheet.Name in summary:
                def format_columns():
                    # sheet.Columns("K:K").NumberFormat = "mm/dd/yyyy"

                    sheet.Columns("F:F").NumberFormat = "0.00%"
                    
                    sheet.Columns("E:E").NumberFormat = "$#,##0.00"
                    sheet.Columns("G:I").NumberFormat = "$#,##0.00"
                    sheet.Columns("L:O").NumberFormat = "$#,##0.00"

                format_columns()

        workbook.Save()

        # print(f"Excel file saved with autofit at {file_path}")
    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        if workbook is not None:
            workbook.Close(SaveChanges=False)
        if excel is not None:
            excel.Quit()

In [41]:


# Format excel
format_excel_file(OUTPUT_PATH)

# if __name__ == '__main__':
# print(f"Starting [{__version__}]")
# # main(production_flag=True)
# main()
# print("Complete!")

