# Configure

In [22]:
import pandas as pd
import numpy as np


# --- Configuration ---
# Switches and variable lists read by the cells below.
do_winsorize = False    # Passed to winsorize_dataframe(); False leaves the data unclipped
lags = 1                # Number of lags generated per column in the lag cell
reduce_banks = True     # Not referenced in the cells shown in this notebook section
reduce_banks_to = 20    # Not referenced in the cells shown in this notebook section

# Raw input columns of interest (macro series first, then bank balance-sheet
# and income items, then interest rates).
raw_variables = [
    'gdp_qoq', 'cpi_qoq', 'sp500_qoq', 'corp_bond_spread',
    'cons_sentiment_qoq', 'unemployment', 'household_delinq',
    'vix_qoq', 'spread_10y_3m',
    'total_assets', 'total_deposits', 'total_loans_and_leases',
    'trading_assets', 'net_interest_income', 'interest_income',
    # A comma was missing after 'interest_expense'; Python then silently
    # concatenated it with 'tbill_3m' into a single bogus name.
    'non_interest_income', 'interest_expense', 'tbill_3m', 'tbill_10y',
]

# Model inputs.
# NOTE(review): 'deposits_to_assets' and 'loans_to_assets' are not created by
# calculate_financial_ratios (it produces 'deposit_ratio' and
# 'loan_to_asset_ratio'); confirm these columns exist in the loaded data.
feature_variables = ['gdp_qoq', 'cpi_qoq', 'cons_sentiment_qoq', 'unemployment',
                     'household_delinq', 'tbill_3m', 'tbill_10y', 'spread_10y_3m',
                     'deposits_to_assets', 'loans_to_assets']

# Model output; this column is created by calculate_financial_ratios.
target_variable = 'net_interest_income_to_assets'


# Load and select data

In [23]:

# --- Load Data ---
# Optional warning filters, kept disabled: fixing the underlying warning is preferable.
# warnings.filterwarnings('ignore', category=RuntimeWarning)
# warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# Read the three pre-processed Parquet inputs.
fred = pd.read_parquet('data/fred/macro_data_processed.parquet')
fdic = pd.read_parquet('data/fdic/fdic_data_processed.parquet')
yahoo = pd.read_parquet('data/yahoo/yahoo.parquet')

# Left-join the FRED and Yahoo tables onto the FDIC rows by 'date', so every
# FDIC row is kept, then index the result by (id, date).
data_selected = (
    fdic
    .merge(fred, on='date', how='left')
    .merge(yahoo, on='date', how='left')
    .set_index(['id', 'date'])
)


# Transform

In [24]:

def calculate_financial_ratios(df):
    """
    Derive balance-sheet and income ratios and append them as new columns.

    The input frame is left untouched; all work happens on a copy. Every
    derived column has +/-inf (produced by division by zero) replaced with
    NaN. NaNs are not filled.

    Args:
        df (pd.DataFrame): Must contain the columns total_assets,
            total_deposits, total_loans_and_leases, total_equity,
            trading_assets, interest_income, interest_expense, net_income,
            net_interest_income, non_interest_income, npl,
            total_charge_offs and total_rwa.

    Returns:
        pd.DataFrame: Copy of ``df`` with the ratio columns and
        ``log_total_assets`` appended. ``log_total_assets`` is NaN wherever
        total_assets is missing or not strictly positive.

    Raises:
        KeyError: If one of the required input columns is absent.
    """
    df_processed = df.copy()

    assets = df_processed['total_assets']
    deposits = df_processed['total_deposits']
    loans = df_processed['total_loans_and_leases']
    equity = df_processed['total_equity']

    # Ratios (assignment order determines the order of the new columns).
    df_processed['deposit_ratio'] = deposits / assets
    df_processed['loan_to_deposit_ratio'] = loans / deposits
    df_processed['loan_to_asset_ratio'] = loans / assets
    df_processed['equity_to_asset_ratio'] = equity / assets
    df_processed['trading_assets_ratio'] = df_processed['trading_assets'] / assets
    df_processed['net_interest_margin'] = (
        df_processed['interest_income'] - df_processed['interest_expense']
    ) / assets
    df_processed['roe'] = df_processed['net_income'] / equity
    df_processed['roa'] = df_processed['net_income'] / assets
    df_processed['net_interest_income_to_assets'] = df_processed['net_interest_income'] / assets
    df_processed['interest_income_to_assets'] = df_processed['interest_income'] / assets
    df_processed['non_interest_income_to_assets'] = df_processed['non_interest_income'] / assets
    df_processed['interest_expense_to_assets'] = df_processed['interest_expense'] / assets
    df_processed['npl_ratio'] = df_processed['npl'] / loans
    df_processed['charge_off_ratio'] = df_processed['total_charge_offs'] / loans
    df_processed['rwa_ratio'] = df_processed['total_rwa'] / assets

    # Log of total assets. Zero, negative and missing values become NaN
    # instead of being replaced by an arbitrary tiny number: a fabricated
    # log(1e-9) ~ -20.7 looks like real data and hides unusable rows from
    # later cleaning steps.
    df_processed['log_total_assets'] = np.log(assets.where(assets > 0))

    # Division by zero yields +/-inf; convert those to NaN for every derived
    # column, including the *_to_assets columns.
    derived_cols = [
        'deposit_ratio', 'loan_to_deposit_ratio', 'loan_to_asset_ratio',
        'equity_to_asset_ratio', 'trading_assets_ratio', 'net_interest_margin',
        'roe', 'roa',
        'net_interest_income_to_assets', 'interest_income_to_assets',
        'non_interest_income_to_assets', 'interest_expense_to_assets',
        'npl_ratio', 'charge_off_ratio', 'rwa_ratio', 'log_total_assets',
    ]
    df_processed[derived_cols] = df_processed[derived_cols].replace([np.inf, -np.inf], np.nan)

    return df_processed

# Add the ratio columns to the merged panel; the function works on a copy
# and returns it, so the name is rebound to the enriched frame.
data_selected = data_selected.pipe(calculate_financial_ratios)

# Clean the data

In [25]:
def _drop_rows(frame, keep_mask, tracker, tracker_key, description):
    """
    Keep the rows of ``frame`` where ``keep_mask`` is True and report the rest.

    Comparisons against NaN evaluate to False, so rows with a missing value
    in the tested column are removed along with the out-of-range ones.

    Args:
        frame (pd.DataFrame): Data to filter.
        keep_mask (pd.Series): Boolean mask aligned with ``frame``.
        tracker (dict): Receives ``tracker_key -> number of rows removed``.
        tracker_key (str): Key under which the removed-row count is stored.
        description (str): Tail of the printed "Deleted N rows ..." message.

    Returns:
        pd.DataFrame: The rows of ``frame`` selected by ``keep_mask``.
    """
    kept = frame[keep_mask]
    dropped = len(frame) - len(kept)
    tracker[tracker_key] = dropped
    print(f"Deleted {dropped} rows {description}")
    return kept


data_cleaned = data_selected.copy()
initial_rows = len(data_cleaned)
cleaning_tracker = {}
print(f"Initial number of rows: {initial_rows}")

# Delete rows with zero, negative or missing total assets. The test is made on
# total_assets itself: log_total_assets is never -inf, so comparing it against
# -inf cannot detect zero-asset rows.
data_cleaned = _drop_rows(
    data_cleaned,
    data_cleaned['total_assets'] > 0,
    cleaning_tracker, 'cleaning_ta_zero',
    "with zero or negative total assets.",
)

# Keep only rows with -1 < roa < 1.
data_cleaned = _drop_rows(
    data_cleaned,
    (data_cleaned['roa'] > -1) & (data_cleaned['roa'] < 1),
    cleaning_tracker, 'cleaning_roa_invalid',
    "with ROA < -1 or > 1.",
)

# Delete rows with loans to total assets ratio greater than or equal to 1.
data_cleaned = _drop_rows(
    data_cleaned,
    data_cleaned['loan_to_asset_ratio'] < 1,
    cleaning_tracker, 'cleaning_l2ta_ge_1',
    "with loans to total assets ratio >= 1.",
)

# The equity-to-assets filter (keep equity_to_asset_ratio <= 1, tracker key
# 'cleaning_eq2ta_ge_1') is currently disabled.

# Delete rows with trading assets to total assets ratio < 0 or >= 1.
data_cleaned = _drop_rows(
    data_cleaned,
    (data_cleaned['trading_assets_ratio'] < 1) & (data_cleaned['trading_assets_ratio'] >= 0),
    cleaning_tracker, 'cleaning_trada_invalid',
    "with trading assets to total assets ratio < 0 or >= 1.",
)

total_deleted = sum(cleaning_tracker.values())
print(f"\nTotal rows deleted: {total_deleted}")
print(f"Remaining rows: {len(data_cleaned)}")

# Optional: Display the tracker dictionary
# print(f"Cleaning tracker: {cleaning_tracker}")

# The cleaned panel stays in `data_cleaned`; the winsorization cell reads it
# from that name.
print(f"\nShape after cleaning: {data_cleaned.shape}")

Initial number of rows: 644542
Deleted 0 rows with zero or negative total assets.
Deleted 39 rows with ROA < -1 or > 1.
Deleted 137 rows with loans to total assets ratio >= 1.
Deleted 4 rows with trading assets to total assets ratio < 0 or >= 1.

Total rows deleted: 180
Remaining rows: 644362

Shape after cleaning: (644362, 68)


# Winsorize

In [26]:
def winsorize_dataframe(df, vars_to_winsorize, do_winsorize=False, lower_percentile=0.02, upper_percentile=0.98):
    """
    Clip selected columns of a DataFrame to their own lower/upper quantiles.

    The input is never modified: a deep copy is made first, and that copy is
    what gets returned, clipped or not.

    Args:
        df (pd.DataFrame): Source data.
        vars_to_winsorize (list): Names of the columns to clip. Names that are
            missing from ``df`` or refer to non-numeric columns are reported
            and skipped.
        do_winsorize (bool): When False, the copy is returned unchanged and
            nothing is printed.
        lower_percentile (float): Quantile (0-1) used as the lower clipping bound.
        upper_percentile (float): Quantile (0-1) used as the upper clipping bound.

    Returns:
        pd.DataFrame: Deep copy of ``df`` with the requested columns clipped.
    """
    result = df.copy(deep=True)
    if not do_winsorize:
        return result

    print(f"Winsorizing columns at [{lower_percentile}, {upper_percentile}] percentiles...")
    for name in vars_to_winsorize:
        if name not in result.columns:
            print(f" - Skipping column {name} (not found in DataFrame)")
            continue

        column = result[name]
        if not pd.api.types.is_numeric_dtype(column):
            print(f" - Skipping non-numeric column {name}")
            continue

        print(f" - Winsorizing {name}")
        # Bounds are taken over the whole column, then values outside are pulled in.
        floor = column.quantile(lower_percentile)
        ceiling = column.quantile(upper_percentile)
        result[name] = np.clip(column, floor, ceiling)

    print("Winsorization complete.")
    return result

# --- Apply Winsorization ---

# Candidate columns are chosen by a substring heuristic on the column name.
# NOTE(review): 'assets' also matches the level columns 'total_assets' and
# 'trading_assets', while 'roe' and 'net_interest_margin' match none of the
# tags and are therefore never clipped. Adjust the selection if that is not
# intended.
accounting_columns = [
    col for col in data_cleaned.columns
    if any(tag in col for tag in ('ratio', 'assets', 'roa'))
]

# Returns a copy of data_cleaned; the columns are clipped only when
# do_winsorize is True.
data_winsorized = winsorize_dataframe(
    data_cleaned,
    accounting_columns,
    do_winsorize=do_winsorize,
    lower_percentile=0.02,
    upper_percentile=0.98,
)

# Report what actually happened: with do_winsorize=False nothing is clipped,
# so claiming the columns were winsorized would be misleading.
if do_winsorize:
    print(f"Winsorized the following columns: {accounting_columns}")
else:
    print(f"Winsorization skipped (do_winsorize=False); candidate columns: {accounting_columns}")

Winsorized the following columns: ['total_assets', 'trading_assets', 'deposit_ratio', 'loan_to_deposit_ratio', 'loan_to_asset_ratio', 'equity_to_asset_ratio', 'trading_assets_ratio', 'roa', 'net_interest_income_to_assets', 'interest_income_to_assets', 'non_interest_income_to_assets', 'interest_expense_to_assets', 'npl_ratio', 'charge_off_ratio', 'rwa_ratio', 'log_total_assets']


# Lag the data

In [27]:
data_processed = data_winsorized.copy()

# Snapshot of the pre-lag columns, so only original variables are lagged
# (never the lag columns themselves).
original_cols = list(data_processed.columns)

print("Starting lag generation...")

# New column names in the order they are appended: every lag of the first
# variable, then every lag of the second, and so on.
lag_column_order = [
    f'{col}_lag{lag}'
    for col in original_cols
    for lag in range(1, lags + 1)
]

if lag_column_order:
    # NOTE(review): shift() moves values by row position within each 'id'
    # group, so the result is a true time lag only if the rows of each id
    # are already in chronological order without gaps - confirm upstream.
    grouped_by_id = data_processed.groupby(level='id')[original_cols]

    # Shift all columns at once per lag and attach everything with a single
    # concat. Inserting the columns one by one fragments the frame and
    # triggers pandas' PerformanceWarning on wide data.
    lag_frames = [
        grouped_by_id.shift(lag).add_suffix(f'_lag{lag}')
        for lag in range(1, lags + 1)
    ]
    lagged = pd.concat(lag_frames, axis=1)[lag_column_order]
    data_processed = pd.concat([data_processed, lagged], axis=1)

nr_generated_lags = len(lag_column_order)
print(f"Generated {nr_generated_lags} lagged features.")





Starting lag generation...
Generated 68 lagged features.


# Save the data

In [28]:

# Persist the final panel (original columns plus their lags) as a Parquet
# file; the path is relative to the current working directory.
data_processed.to_parquet(path="data.parquet")
