In [None]:
import os
import sys
from pathlib import Path

# Navigate to project root (equivalent to cd ..)
project_dir = Path(__file__).parent.parent if '__file__' in globals() else Path.cwd().parent
os.chdir(project_dir)

# Add src directory to Python path for imports
src_dir = project_dir / "src"
if str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

# Set environment for dev testing
os.environ['REPORT_ENV'] = 'dev'

In [None]:
import pandas as pd
import numpy as np

In [None]:
import cdutils.acct_file_creation.core
from datetime import datetime

In [None]:
df = cdutils.acct_file_creation.core.query_df_on_date()

In [None]:
df

In [None]:
# Fetch wh_org data
import src.smb_campaign.fetch_data
raw_data = src.smb_campaign.fetch_data.fetch_data()

In [None]:
import cdutils.deduplication

In [None]:
# Dedupe org table
if 'wh_org' in raw_data:
    dedupe_list = [{'df': raw_data['wh_org'], 'field': 'orgnbr'}]
    raw_data['wh_org'] = cdutils.deduplication.dedupe(dedupe_list)

In [None]:
wh_org = raw_data['wh_org'].copy()

In [None]:
assert wh_org['orgnbr'].is_unique, "Not unique"

In [None]:
wh_org

In [None]:
wh_org['orgtypcd'].unique()

In [None]:
filtered_org = wh_org[~(wh_org['orgtypcd'].isin(['BRCH','BANK']))].copy()

In [None]:
filtered_org.info()

In [None]:
filtered_org = filtered_org[[
    'orgnbr',
    'orgname',
    'orgtypcd',
    'orgtypcddesc'
]].copy()

In [None]:
# Aggregate stats (total loans/deposits) per orgnbr
## orgs only
acct_orgs = df[df['taxrptforpersnbr'].isna()].copy()

In [None]:
import numpy as np

In [None]:
## loans/deposit categorization
# Account type mappings
ACCOUNT_TYPE_MAPPING = {
    'CML': 'Commercial Loan',
    'MLN': 'Commercial Loan',
    'CNS': 'Consumer Loan',
    'MTG': 'Residential Loan',
    'CK': 'Checking',
    'SAV': 'Savings',
    'TD': 'CD'
}

acct_orgs['Account Type'] = acct_orgs['mjaccttypcd'].map(ACCOUNT_TYPE_MAPPING)

In [None]:
acct_orgs = acct_orgs[~(acct_orgs['Account Type'].isna())].copy()

In [None]:
acct_orgs

In [None]:
MACRO_TYPE_MAPPING = {
    'CML': 'Loan',
    'MLN': 'Loan',
    'CNS': 'Loan',
    'MTG': 'Loan',
    'CK': 'Deposit',
    'SAV': 'Deposit',
    'TD': 'Deposit'
}

acct_orgs['Macro Account Type'] = acct_orgs['mjaccttypcd'].map(MACRO_TYPE_MAPPING)

In [None]:
# Get other entity details
entity_details = acct_orgs.groupby('taxrptfororgnbr').agg(
    primaryownercity=('primaryownercity', 'first'),
    primaryownerstate=('primaryownerstate','first'),
    earliest_opendate=('contractdate','min')
).reset_index()

In [None]:
entity_details

In [None]:
entity_details['taxrptfororgnbr'] = entity_details['taxrptfororgnbr'].astype(int).astype(str)

In [None]:
filtered_org['orgnbr'] = filtered_org['orgnbr'].astype(str)

In [None]:
merged_df = pd.merge(filtered_org, entity_details, left_on='orgnbr', right_on='taxrptfororgnbr', how='inner')

In [None]:
merged_df = merged_df.drop(columns=['taxrptfororgnbr']).copy()

In [None]:
merged_df

In [None]:
# Need to get address information
# ORGADDRUSE
# WH_ADDR

In [None]:
wh_addr = raw_data['wh_addr'].copy()
orgaddruse = raw_data['orgaddruse'].copy()

In [None]:
wh_addr['addrlinetypdesc1'].unique()

In [55]:
def create_full_street_address(df):
    """
    Processes a list of raw address records and returns a cleaned list.
    """

    STREET_TYPES = {
        'street', 'apartment number','building number', 'suite number', 'room number'
    }
    POBOX_TYPE = 'post office box number'

    # Step A: Extract both street parts AND po box parts into temporary columns
    for i in [1, 2, 3]:
        text_col = f'text{i}'
        type_col = f'addrlinetypdesc{i}'
        
        # Condition for street parts
        is_street_part = df[type_col].str.lower().isin(STREET_TYPES).fillna(False)
        df[f'street_part{i}'] = df[text_col].where(is_street_part)
        
        # Condition for PO Box parts
        is_pobox_part = (df[type_col].str.lower() == POBOX_TYPE).fillna(False)
        df[f'pobox_part{i}'] = df[text_col].where(is_pobox_part)


    # Step B: Combine the parts into two separate, complete address strings
    street_parts = ['street_part1', 'street_part2', 'street_part3']
    pobox_parts = ['pobox_part1', 'pobox_part2', 'pobox_part3']

    df['combined_street'] = df[street_parts].apply(
        lambda row: ' '.join(row.dropna().astype(str)), axis=1
    )
    df['combined_pobox'] = df[pobox_parts].apply(
        lambda row: ' '.join(row.dropna().astype(str)), axis=1
    )

    # Step C: Apply the final rule: Use Street, but if it's empty, use PO Box.
    # First, replace empty strings '' in the street column with NaN so .fillna() works
    df['combined_street'] = df['combined_street'].replace('', np.nan)

    # Now, use .fillna() to populate empty street addresses with the po box value
    df['Full_Street_Address'] = df['combined_street'].fillna(df['combined_pobox'])


    # --- 3. Finalizing the Extract ---

    # Create the final, clean DataFrame with user-friendly column names
    df_clean = df[[
        'addrnbr',
        'Full_Street_Address',
        'cityname',
        'statecd',
        'zipcd'
    ]].copy()

    return df_clean 

In [56]:
cleaned_addr = create_full_street_address(wh_addr)

In [57]:
cleaned_addr

Unnamed: 0,addrnbr,Full_Street_Address,cityname,statecd,zipcd
0,1012690,178 INDIAN POND RD,KINGSTON,MA,02364
1,1012691,63 SCITUATE AVE,SCITUATE,MA,02066
2,1012692,287 RIVERSIDE ST,PORTSMOUTH,RI,02871
3,1012693,2122 ARROWGRASS DR # 101,WESLEY CHAPEL,FL,33544
4,1012694,13 SCHOOL HOUSE RD,PRESTON,CT,06365
...,...,...,...,...,...
378063,1428043,212 PINE GROVE ST,NEW BEDFORD,MA,02745
378064,1428187,1 LAKE ST,DRACUT,MA,01826
378065,1428188,,EMAIL,,
378066,1428189,1 LAKE ST,DRACUT,MA,01826


In [58]:
orgaddruse

Unnamed: 0,addrnbr,addrusecd,orgnbr
0,193,PRI,15
1,1,PRI,135
2,32,PRI,143
3,33,PRI,142
4,133,PRI,10
...,...,...,...
34215,1268422,PRI,1009042
34216,1268500,BUS,1001968
34217,1268572,BUS,1000119
34218,1268657,BUS,1001940


In [59]:
orgaddruse = orgaddruse[orgaddruse['addrusecd'].isin(['PRI'])].copy()

In [60]:
orgaddruse['orgnbr'] = orgaddruse['orgnbr'].astype(str)
orgaddruse['addrnbr'] = orgaddruse['addrnbr'].astype(str)

cleaned_addr['addrnbr'] = cleaned_addr['addrnbr'].astype(str)

In [61]:
merged_address = pd.merge(orgaddruse, cleaned_addr, on='addrnbr', how='inner')

In [62]:
merged_address

Unnamed: 0,addrnbr,addrusecd,orgnbr,Full_Street_Address,cityname,statecd,zipcd
0,193,PRI,15,409 3RD ST SW,WASHINGTON,DC,20416
1,1,PRI,135,135 DARLING DRIVE,AVON,CT,06001
2,32,PRI,143,111 MAIN STREET,BURLINGTON,VT,05401
3,33,PRI,142,999 WEST STREET,ROCKY HILL,CT,06067
4,133,PRI,10,100 EXECUTIVE BLVD,SOUTHINGTON,CT,06489
...,...,...,...,...,...,...,...
16273,1261204,PRI,1008781,190 ZACHARY RD,MANCHESTER,NH,03109
16274,1266205,PRI,1008897,91 GEORGE LEVEN DR.,NORTH ATTLEBORO,MA,02760
16275,1266256,PRI,1007459,650 DEXTER ST,CENTRAL FALLS,RI,02863
16276,1267040,PRI,1008950,207 SLOCUM RD,NORTH DARTMOUTH,MA,02747


In [63]:
wh_addr

Unnamed: 0,addrnbr,text1,addrlinetypcd1,addrlinetypdesc1,text2,addrlinetypcd2,addrlinetypdesc2,text3,addrlinetypcd3,addrlinetypdesc3,...,zipcd,street_part1,pobox_part1,street_part2,pobox_part2,street_part3,pobox_part3,combined_street,combined_pobox,Full_Street_Address
0,1012690,178 INDIAN POND RD,ST,Street,,,,,,,...,02364,178 INDIAN POND RD,,,,,,178 INDIAN POND RD,,178 INDIAN POND RD
1,1012691,63 SCITUATE AVE,ST,Street,,,,,,,...,02066,63 SCITUATE AVE,,,,,,63 SCITUATE AVE,,63 SCITUATE AVE
2,1012692,287 RIVERSIDE ST,ST,Street,,,,,,,...,02871,287 RIVERSIDE ST,,,,,,287 RIVERSIDE ST,,287 RIVERSIDE ST
3,1012693,2122 ARROWGRASS DR # 101,ST,Street,,,,,,,...,33544,2122 ARROWGRASS DR # 101,,,,,,2122 ARROWGRASS DR # 101,,2122 ARROWGRASS DR # 101
4,1012694,13 SCHOOL HOUSE RD,ST,Street,,,,,,,...,06365,13 SCHOOL HOUSE RD,,,,,,13 SCHOOL HOUSE RD,,13 SCHOOL HOUSE RD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378063,1428043,212 PINE GROVE ST,ST,Street,,,,,,,...,02745,212 PINE GROVE ST,,,,,,212 PINE GROVE ST,,212 PINE GROVE ST
378064,1428187,1 LAKE ST,ST,Street,,,,,,,...,01826,1 LAKE ST,,,,,,1 LAKE ST,,1 LAKE ST
378065,1428188,WAYNE@ATANDT.COM,ATTN,Attention,,,,,,,...,,,,,,,,,,
378066,1428189,1 LAKE ST,ST,Street,,,,,,,...,01826,1 LAKE ST,,,,,,1 LAKE ST,,1 LAKE ST


In [64]:
merged_final = pd.merge(merged_df, merged_address, on='orgnbr', how='left')

In [76]:
merged_final = merged_final[[
    'orgname',
    'orgtypcddesc',
    'earliest_opendate',
    'Full_Street_Address',
    'cityname',
    'statecd',
    'zipcd'
]].copy()

In [77]:
merged_final = merged_final.rename(columns={
    'orgname':'Organization Name',
    'orgtypcddesc':'Org Type',
    'earliest_opendate':'Earliest Open Date',
    'Full_Street_Address':'Full Street Address',
    'cityname':'City',
    'statecd':'State',
    'zipcd':'Zip'
}).copy()

In [78]:
OUTPUT_PATH = Path("./output/bkm_suppresion_list.parquet")
merged_final.to_parquet(OUTPUT_PATH, index=False)

In [79]:
merged_final

Unnamed: 0,Organization Name,Org Type,Earliest Open Date,Full Street Address,City,State,Zip
0,O. E. S. OF MASS HOPE CHAPTER #41,Association / Society,2008-08-08,32 WOODBINE ST,PAWTUCKET,RI,02860
1,THE MARY FERRARA IRREVOCABLE TRUST,Trust/Fiduciary,2008-11-13,72 VERNDALE AVE,ATTLEBORO,MA,02703
2,THREE T ENTERPRISES INC,Corporation/Business,2005-02-24,363 N MAIN ST,MANSFIELD,MA,02048
3,BRISTOL PLACE INC,Corporation/Business,2008-12-03,555 PLEASANT ST SUITE 201,ATTLEBORO,MA,02703
4,27-29 PEARL STREET CONDOMINIUM TRUST,Real Estate Investment Trust,2008-10-28,27 PEARL ST UNIT 2,ATTLEBORO,MA,02703
...,...,...,...,...,...,...,...
6217,"LOCKHART,D., LLC",Limited Liability Corporation,2023-09-25,424 THAMES STREET,NEWPORT,RI,02840
6218,"OREFICE, CALIRI & FERRI CPAS,LLC",Limited Liability Corporation,2024-03-06,5 LONGMEADOW DR,EAST GREENWICH,RI,02818
6219,PPE FRANCHISE LLC,Limited Liability Corporation,2024-03-07,422 WEST GROVE ST,MIDDLEBORO,MA,02346
6220,119 SCHOOL STREET REALTY NOMINEE TRUST,Trust/Fiduciary,2024-03-08,22 MARK DRIVE,LINCOLN,RI,02865


In [None]:
# check = merged_final.copy()

In [None]:
# import numpy as np

In [None]:
# check['test'] = np.where(check['primaryownercity'] != check['cityname'], 1, 0)

In [None]:
# Will turn into formal pipeline after getting feedback from business line
