In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Location of the parquet file that is read into `df` in the next cell.
# NOTE(review): absolute, user-specific Windows path — the notebook only runs on a
# machine with this exact directory layout; consider a configurable base directory.
INPUT_PATH = Path(r"C:\Users\w322800\Documents\gh\bcsb-prod\Reports\Indirect Lending\Adhoc_Pricing_Disparity_20250822\output\output.parquet")

In [None]:
# Read the parquet file at INPUT_PATH into a DataFrame.
df = pd.read_parquet(INPUT_PATH)

In [None]:
# Bare last expression: renders the loaded frame as this cell's output.
df

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Remove the two columns that are not carried forward from the input extract.
# A list (rather than a set) is the conventional list-like for `columns=`.
# NOTE(review): `df` is overwritten, so re-running this cell without reloading
# raises KeyError because the columns are already gone.
df = df.drop(columns=['Loan Paid or Open', 'Date Closed']).copy()

In [None]:
from deltalake import DeltaTable

In [None]:
# Directory handed to DeltaTable in the next cell to load the account data.
# NOTE(review): absolute, user-specific Windows path — consider making the
# lakehouse root configurable so the notebook can run on another machine.
ACCOUNT_DATA = Path(r"C:\Users\w322800\Documents\lakehouse\silver\account")

In [None]:
# Load the Delta table at ACCOUNT_DATA into a pandas DataFrame.
# NOTE(review): no status filter is applied here despite the name — confirm the
# table itself only contains active accounts.
active_accounts = DeltaTable(ACCOUNT_DATA).to_pandas()

In [None]:
# Bare last expression: renders the account frame as this cell's output.
active_accounts

In [None]:
# Keep rows whose Category is 'Indirect' OR whose currmiaccttypcd is CM15/CM16.
# `.copy()` detaches the filtered frame from the full account table.
active_accounts = active_accounts.loc[
    active_accounts['Category'].isin(['Indirect'])
    | active_accounts['currmiaccttypcd'].isin(['CM15', 'CM16'])
].copy()

In [None]:
# Print the column summary of `df` again (after the column drop earlier in the notebook).
df.info()

In [None]:
# Narrow the account frame to the fields used in the merge with `df` further down.
# Selecting a missing column raises KeyError, same as plain bracket selection.
active_accounts_slice = active_accounts.loc[
    :,
    [
        'acctnbr',
        'ownersortname',
        'contractdate',
        'curracctstatcd',
        'noteopenamt',
        'notebal',
        'noteintrate',
    ],
].copy()

In [None]:
# Rename the account-table fields to the column names `df` uses, so the two frames
# share the 'Account Number' key and their overlapping columns line up in the merge.
# 'curracctstatcd' is not renamed.
active_accounts_slice = active_accounts_slice.rename(
    columns=dict(
        acctnbr='Account Number',
        ownersortname='Applicant Last Name',  # still needs splitting
        contractdate='Loan Origination Date',
        noteopenamt='Amount Financed',
        notebal='Current Balance',
        noteintrate='Contract Rate',
    )
).copy()

In [None]:
# Print the column summary of the renamed account slice.
active_accounts_slice.info()

In [None]:
# Outer join on 'Account Number': keeps rows found in either frame.
# Columns present in both inputs are suffixed '_df' (left) / '_active' (right), and
# the '_merge' indicator column records left_only / right_only / both per row.
merged_df = df.merge(
    active_accounts_slice,
    how='outer',
    on='Account Number',
    suffixes=('_df', '_active'),
    indicator=True,
)

In [None]:
# Bare last expression: renders the merged frame as this cell's output.
merged_df

In [None]:
# Print the column summary of the merged frame (includes the suffixed columns and '_merge').
merged_df.info()

In [None]:
# Bare last expression: renders the merged frame again as this cell's output.
merged_df

In [None]:
import numpy as np

In [None]:
# Rows found only in `df` (no match in the account table) are labelled 'Closed';
# every other row ('both' or 'right_only') is labelled 'Active'.
# Charged-off loans end up under 'Closed' because they are no longer active accounts.
# NOTE(review): 'curracctstatcd' is carried through the merge but never consulted
# here — confirm that presence in the account table really implies an active loan.
is_active = merged_df['_merge'] != 'left_only'
merged_df['Status'] = np.where(is_active, 'Active', 'Closed')

# Contract Rate always comes from `df`; it is therefore missing for rows that exist
# only in the account table.
merged_df['Contract Rate'] = merged_df['Contract Rate_df']

# Current Rate and Current Balance come from the account table for active rows;
# closed rows get NaN and 0 respectively.
merged_df['Current Rate'] = np.where(is_active, merged_df['Contract Rate_active'], np.nan)
merged_df['Current Balance'] = np.where(is_active, merged_df['Current Balance_active'], 0)

# %%
# Columns duplicated by the merge that collapse into one value per row.
# 'Current Balance' and 'Contract Rate' are handled explicitly above, so they are
# deliberately not part of this list.
cols_to_consolidate = ['Loan Origination Date', 'Applicant Last Name', 'Amount Financed']

# Prefer the account-table ('_active') value and fall back to the '_df' value
# wherever the account-table value is missing.
for col in cols_to_consolidate:
    merged_df[col] = merged_df[f'{col}_active'].combine_first(merged_df[f'{col}_df'])

# %%
# Suffixed source columns of the consolidated fields, plus the merge indicator.
cols_to_drop = []
for col in cols_to_consolidate:
    cols_to_drop.extend([f'{col}_df', f'{col}_active'])
cols_to_drop.append('_merge')

# Build the final frame: drop the columns above together with the remaining
# suffixed balance/rate columns and the unused status code.
final_df = merged_df.drop(
    columns=cols_to_drop + [
        'Current Balance_df',
        'Current Balance_active',
        'curracctstatcd',
        'Contract Rate_df',
        'Contract Rate_active',
    ]
).copy()


# %%
# TODO: for account-table rows 'Applicant Last Name' comes from 'ownersortname' and
# may need splitting if the format is 'LAST, FIRST'. Candidate fix, not yet enabled:
# final_df['Applicant Last Name'] = final_df['Applicant Last Name'].str.split(',').str[0].str.strip()

# %%
# Column summary of the consolidated frame.
final_df.info()

# %%
# Row counts per Status value.
print("\nLoan Status Counts:")
print(final_df['Status'].value_counts())

# %%
# Bare last expression: renders the consolidated frame as this cell's output.
final_df


In [None]:
from datetime import datetime

In [None]:
# Keep loans originated from 2020-01-01 through the end of 2024.
# The upper bound is exclusive at 2025-01-01: comparing `<= datetime(2024, 12, 31)`
# cuts off at midnight at the start of that day and would drop any origination value
# on 2024-12-31 that carries a time-of-day. For date-only values the result is the same.
# Rows whose origination date is missing (NaT) fail both comparisons and are dropped.
origination_start = datetime(2020, 1, 1)
origination_end_exclusive = datetime(2025, 1, 1)
originated = final_df['Loan Origination Date']
final_df = final_df[
    (originated >= origination_start) & (originated < origination_end_exclusive)
].copy()

In [None]:
# Bare last expression: renders the date-filtered frame as this cell's output.
final_df