In [None]:
import os
import sys
from pathlib import Path

# Navigate to project root (equivalent to cd ..), but only once.
# As a script, the root is two levels above this file. In a notebook there is
# no __file__, so the root is taken to be the parent of the working directory
# -- unless the working directory already contains "src" (this cell was
# already run, or the kernel was started at the root). Without that guard,
# every re-run of this cell would chdir one level further up.
# NOTE(review): assumes only the project root has a "src" directory, which is
# the same layout the sys.path setup below relies on -- confirm.
if '__file__' in globals():
    project_dir = Path(__file__).parent.parent
else:
    _cwd = Path.cwd()
    project_dir = _cwd if (_cwd / "src").is_dir() else _cwd.parent
os.chdir(project_dir)

# Add src directory to Python path for imports
src_dir = project_dir / "src"
if str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

# Set environment for dev testing
os.environ['REPORT_ENV'] = 'dev'

In [None]:
import src.config

In [None]:
import pandas as pd
from deltalake import DeltaTable
from pathlib import Path

In [None]:
# df = DeltaTable(src.config.SILVER / "account").to_pandas()

In [None]:
import cdutils.acct_file_creation.core
from datetime import datetime

# Point-in-time date handed to query_df_on_date
specified_date = datetime(year=2022, month=12, day=31)
df = cdutils.acct_file_creation.core.query_df_on_date(specified_date)

In [None]:
df

In [None]:
df['mjaccttypcd'].unique()

In [None]:
# Keep only rows whose major account type code is one of the seven mapped below
reported_major_types = ['CML','MLN','CNS','MTG','CK','SAV','TD']
is_reported_type = df['mjaccttypcd'].isin(reported_major_types)
df = df.loc[is_reported_type].copy()

In [None]:
# Friendly Account Type derived from the major account type code
def map_account_type(acct_code:str):
    """
    Translate an mjaccttypcd value into a friendly Account Type.

    The code is converted to a string and upper-cased before comparison,
    so matching is case-insensitive. Returns 'Loan' for CML/MLN/CNS/MTG,
    'Deposit' for CK/SAV/TD, and 'Other' for anything else (including
    None or values with surrounding whitespace).
    """
    code = str(acct_code).upper()
    if code in ('CML', 'MLN', 'CNS', 'MTG'):
        return 'Loan'
    if code in ('CK', 'SAV', 'TD'):
        return 'Deposit'
    return 'Other'

# Element-wise translation of each major type code into its friendly label
major_type_codes = df['mjaccttypcd']
df['Account Type'] = major_type_codes.map(map_account_type)

In [None]:
# Read the silver-layer "property" Delta table into a DataFrame
property_table = DeltaTable(src.config.SILVER / "property")
prop = property_table.to_pandas()

In [None]:
prop

In [None]:
# Narrow the property frame to the columns used by the joins below
property_columns = ['propnbr', 'aprsvalueamt', 'propcity', 'propstate', 'addrnbr']
prop = prop.loc[:, property_columns].copy()

In [None]:
import numpy as np
# Re-express addrnbr as integer-formatted text (e.g. 123.0 -> '123');
# values that cannot be parsed as numbers become NaN.
addr_numeric = pd.to_numeric(prop['addrnbr'], errors='coerce')
prop['addrnbr'] = addr_numeric
addr_as_text = prop['addrnbr'].astype('Int64').astype(str)
prop['addrnbr'] = addr_as_text.where(addr_numeric.notna(), np.nan)

In [None]:
# Discard properties that have no usable address number
has_addrnbr = prop['addrnbr'].notna()
prop = prop.loc[has_addrnbr].copy()

In [None]:
prop

In [None]:
# Read the silver-layer "address" Delta table into a DataFrame
address_table = DeltaTable(src.config.SILVER / "address")
address = address_table.to_pandas()


In [None]:
# Only the join key and the ZIP code are needed from the address table
address_columns = ['addrnbr', 'zipcd']
address = address.loc[:, address_columns].copy()

In [None]:
# Attach the address ZIP to each property; properties without a match keep NaN
merged_df = pd.merge(prop, address, how='left', on='addrnbr').copy()

In [None]:
merged_df

In [None]:
# Read the silver-layer "account_property_link" Delta table into a DataFrame
link_table = DeltaTable(src.config.SILVER / "account_property_link")
acct_prop_link = link_table.to_pandas()

In [None]:
acct_prop_link

In [None]:
# The load timestamp is not needed for the join
acct_prop_link = acct_prop_link.drop('load_timestamp_utc', axis=1).copy()

In [None]:
# Bring property/address details onto each account-property link row
merged_df = pd.merge(acct_prop_link, merged_df, how='left', on='propnbr')

In [None]:
# Keep only link rows whose property carries an address number
has_address = merged_df['addrnbr'].notna()
merged_df = merged_df.loc[has_address].copy()
# One row per acctnbr: order by appraisal value (highest first) and keep the first row
by_appraisal_desc = merged_df.sort_values(by='aprsvalueamt', ascending=False)
merged_df = by_appraisal_desc.drop_duplicates(subset=['acctnbr'], keep='first').copy()

In [None]:
# Guard the upcoming join: merged_df must hold at most one row per acctnbr,
# otherwise the left merge onto df would duplicate account rows.
assert merged_df['acctnbr'].is_unique, (
    f"merged_df has {int(merged_df['acctnbr'].duplicated().sum())} duplicated "
    "acctnbr rows; expected one row per account"
)
# Display last: only a cell's final expression is rendered.
merged_df

In [None]:
# Left join so accounts without a linked property are retained
df = pd.merge(df, merged_df, how='left', on='acctnbr').copy()

In [None]:
df

In [None]:
# Where no property ZIP was found, fall back to the primary owner's ZIP
property_zip_missing = df['zipcd'].isnull()
df['zipcd'] = df['zipcd'].mask(property_zip_missing, df['primaryownerzipcd'])

In [None]:

# Normalize ZIPs to 5 digits (handles ints/floats and ZIP+4).
# Step 1: take the first run of five digits, which covers '02801', '02801-1234'
#         and 5-digit numbers.
# Step 2: for values that are ONLY a 3-4 digit number (optionally with a
#         trailing '.0' from a float column), the leading zero(s) were lost
#         when the ZIP was stored numerically -- pad them back. Padding has to
#         happen on these short values; zero-filling after a 5-digit extract
#         can never change anything.
zip_text = df['zipcd'].astype(str).str.strip()
z = zip_text.str.extract(r'(\d{5})', expand=False)
short_zip = (
    zip_text
      .str.extract(r'^(\d{3,4})(?:\.0+)?$', expand=False)
      .str.zfill(5)
)
z = z.fillna(short_zip)

# --- Rhode Island: ALL ZIPs from unitedstateszipcodes.org/ri/ ---
ri_zips = {
    '02801','02802','02804','02806','02807','02808','02809','02812','02813','02814',
    '02815','02816','02817','02818','02822','02823','02824','02825','02826','02827',
    '02828','02829','02830','02831','02832','02833','02835','02836','02837','02838',
    '02839','02840','02841','02842','02852','02854','02857','02858','02859','02860',
    '02861','02862','02863','02864','02865','02871','02872','02873','02874','02875',
    '02876','02877','02878','02879','02880','02881','02882','02883','02885','02886',
    '02887','02888','02889','02891','02892','02893','02894','02895','02896','02898',
    '02901','02902','02903','02904','02905','02906','02907','02908','02909','02910',
    '02911','02912','02914','02915','02916','02917','02918','02919','02920','02921','02940',
}

# --- ALL Bristol County, MA ZIPs (from unitedstateszipcodes.org/ma/) ---
# Includes Standard, PO Box, and Unique.
# Every line ends with a comma: adjacent string literals without one are
# silently concatenated into a single bogus element (e.g. '0233402356').
bristol_all = {
    '02031','02048','02334',
    '02356','02357','02375',
    '02702','02703','02712','02714','02715','02717','02718','02719',
    '02720','02721','02722','02723','02724',
    '02725','02726',
    '02740','02741','02742','02743','02744','02745','02746','02747','02748',
    '02760','02761','02763','02764','02766','02767','02768','02769',
    '02771','02777','02779','02780','02783','02790','02791',
}

# --- South Coast subset of Bristol County ---
# Defined as East Freetown & Assonet and everything south of them: Fall River,
# New Bedford, Dartmouth, Fairhaven, Acushnet, Somerset, Swansea, Westport (+ PO Box/Unique).
bristol_south_coast = {
    # Freetown
    '02702','02717',
    # Fall River (incl. PO Box)
    '02720','02721','02722','02723','02724',
    # Somerset
    '02725','02726',
    # Swansea
    '02777',
    # New Bedford (incl. PO Boxes)
    '02740','02741','02742','02744','02745','02746',
    # Dartmouth (incl. North/South + PO Box)
    '02747','02748','02714',
    # Fairhaven
    '02719',
    # Acushnet (and overlap ZIP that also covers NB)
    '02743',
    # Westport (incl. Westport Point PO Box)
    '02790','02791',
}

# --- Attleboro/Taunton subset = remaining Bristol County ZIPs ---
bristol_attleboro_taunton = bristol_all - bristol_south_coast

# --- Build the ZIP -> region lookup ---
# On a duplicate key the later entry would win, but the three sets do not
# overlap (Attleboro/Taunton is built by subtracting South Coast), so the
# order has no effect here.
zip_region_map = {
    **{code: 'Rhode Island'      for code in ri_zips},
    **{code: 'South Coast'       for code in bristol_south_coast},
    **{code: 'Attleboro/Taunton' for code in bristol_attleboro_taunton},
}

# Map; anything not in RI or Bristol County buckets (including missing or
# unparseable ZIPs) → 'Other'
df['Region'] = z.map(zip_region_map).fillna('Other')

In [None]:
df

In [None]:
# Final report layout: keep these columns, in this order
report_columns = [
    'acctnbr',
    'ownersortname',
    'product',
    'Net Balance',
    'mjaccttypcd',
    'currmiaccttypcd',
    'loanofficer',
    'acctofficer',
    'curracctstatcd',  # at that point in time
    'branchname',
    'Account Type',
    'zipcd',
    'Region',
]
df = df.loc[:, report_columns].copy()

In [None]:
# Performed 1 aggregation grouped on columns: 'Region', 'Account Type'
grouped_df = df.groupby(['Region', 'Account Type']).agg(NetBalance_sum=('Net Balance', 'sum')).reset_index()

In [None]:
grouped_df