# In [21]:
import pandas as pd
import numpy as np

# In [22]:
# Central configuration shared by the cleaning / feature-engineering helpers.
CONFIG = {
    # Names of the identifier, crisis-indicator and GDP columns in the panel.
    "country_col": "country",
    "year_col": "year",
    "crisis_col": "crisisJST",
    "gdp_col": "rgdpmad",

    # Logical variable name -> raw column name used by add_engineered_features.
    "raw_cols": {
        "cpi": "cpi",
        "money": "money",
        "housing": "hpnom",
        "credit": "tloans",
        "gdp": "gdp",
        "debtgdp": "debtgdp",
        "ltrate": "ltrate",
        "stir": "stir",
    },

    # Recovery-time settings passed to compute_recovery_time by clean_data:
    # look-back window (years) for the pre-crisis GDP peak, the maximum
    # look-ahead horizon (years), and whether episodes that hit the horizon
    # cap are dropped.
    "pre_peak_window": 5,
    "max_horizon": 15,
    "drop_unrecovered": False,

    # When True, basic_panel_clean removes 1914-1918 and 1933-1945.
    "exclude_shock_years": True,

    # Engineered columns used as model features, in feature-matrix order.
    "feature_cols": [
        "cpi_rate",
        "money_rate",
        "housing_rate",
        "credit_rate",
        "slope",
        "global_credit_rate",
        "global_slope",
        "debtgdp_level",
        "debtgdp_change",
    ],
}

# In [23]:
def _check_columns(df, cols, context=''):
    """Raise ``ValueError`` when any name in *cols* is not a column of *df*.

    Args:
        df: DataFrame whose columns are inspected.
        cols: Iterable of required column names.
        context: Free-text label inserted into the error message so the
            caller can be identified.

    Raises:
        ValueError: Lists the missing names and up to the first 60 columns
            that are present (followed by " ..." when there are more).
    """
    absent = [name for name in cols if name not in df.columns]
    if not absent:
        return

    present = list(df.columns)
    ellipsis = " ..." if len(present) > 60 else ""
    raise ValueError(
        f"Missing columns {context}: {absent}\n"
        f"Available columns (first 60): {present[:60]}{ellipsis}"
    )


def basic_panel_clean(df, cfg=CONFIG):
    """Return a cleaned, sorted copy of the country-year panel.

    Steps, in order:
      1. coerce the year column to numeric and drop rows with a missing
         country or year, then cast year to ``int``;
      2. coerce the crisis column to numeric, treat unparseable/missing
         values as 0 and cast to ``int``;
      3. sort by country then year and reset the index;
      4. if ``cfg['exclude_shock_years']`` is truthy (default ``True``),
         remove the years 1914-1918 and 1933-1945.

    Args:
        df: Raw panel; not modified.
        cfg: Configuration dict providing ``country_col``, ``year_col`` and
            ``crisis_col``.

    Raises:
        ValueError: If any of the three required columns is missing.
    """
    country_key = cfg['country_col']
    year_key = cfg['year_col']
    crisis_key = cfg['crisis_col']
    _check_columns(df, [country_key, year_key, crisis_key], 'basic_panel_clean')

    panel = df.copy()
    panel[year_key] = pd.to_numeric(panel[year_key], errors='coerce')
    panel = panel.dropna(subset=[country_key, year_key]).copy()
    panel[year_key] = panel[year_key].astype(int)

    crisis_numeric = pd.to_numeric(panel[crisis_key], errors='coerce')
    panel[crisis_key] = crisis_numeric.fillna(0).astype(int)

    panel = panel.sort_values([country_key, year_key]).reset_index(drop=True)

    if not cfg.get('exclude_shock_years', True):
        return panel

    # Drop WWI (1914-1918) and the Great Depression + WWII span (1933-1945).
    years = panel[year_key]
    in_ww1 = (years >= 1914) & (years <= 1918)
    in_depression_ww2 = (years >= 1933) & (years <= 1945)
    return panel[~(in_ww1 | in_depression_ww2)].copy()


def add_engineered_features(df, cfg=CONFIG):
    """Add growth-rate, yield-curve and debt features to a country-year panel.

    Columns added (raw column names come from ``cfg['raw_cols']``):
      * ``cpi_rate``, ``money_rate``, ``housing_rate``: one-year relative
        change, ``(x_t - x_{t-1}) / x_{t-1}``;
      * ``credit_rate``: one-year change in the credit / gdp ratio;
      * ``slope``: ``ltrate - stir``;
      * ``debtgdp_level``: the debt-to-GDP column as is;
      * ``debtgdp_change``: one-year change in debt-to-GDP;
      * ``global_credit_rate``, ``global_slope``: mean of ``credit_rate`` /
        ``slope`` over all rows sharing the same year (NaNs skipped).

    Lags are taken within each country and only when the previous row is
    exactly the previous calendar year.  If years are missing from the panel
    (for example because shock years were removed beforehand), the first row
    after the gap gets NaN rather than a multi-year change.

    The computation uses ``groupby().shift()`` / ``transform()`` instead of
    ``groupby().apply()`` so that the grouping columns are always retained in
    the result.

    Args:
        df: Panel with the country, year and raw columns; not modified.  The
            year column must be numeric.
        cfg: Configuration dict providing ``country_col``, ``year_col`` and
            ``raw_cols``.

    Returns:
        A copy of *df* ordered by country then year (original index labels
        kept) with the feature columns appended.

    Raises:
        ValueError: If a required column is missing.
    """
    c = cfg['country_col']
    y = cfg['year_col']
    raw = cfg['raw_cols']
    _check_columns(df, [c, y] + list(raw.values()), 'add_engineered_features')

    # Stable sort so that lags follow calendar order inside each country.
    out = df.sort_values([c, y], kind='stable').copy()
    # Group by a plain array so derived Series can be grouped without relying
    # on index alignment.
    country_keys = out[c].to_numpy()

    # True where the preceding row of the same country is year - 1.
    prev_year = out[y].groupby(country_keys, sort=False).shift(1)
    is_consecutive = prev_year.eq(out[y] - 1)

    def _lag(series):
        """Previous-year value within each country; NaN across year gaps."""
        lagged = series.groupby(country_keys, sort=False).shift(1)
        return lagged.where(is_consecutive)

    def _growth(series):
        """One-year relative change (x_t - x_{t-1}) / x_{t-1}."""
        previous = _lag(series)
        return (series - previous) / previous

    # Inflation, money growth, house-price growth.
    out['cpi_rate'] = _growth(out[raw['cpi']])
    out['money_rate'] = _growth(out[raw['money']])
    out['housing_rate'] = _growth(out[raw['housing']])

    # Change in the credit-to-GDP ratio.
    credit_to_gdp = out[raw['credit']] / out[raw['gdp']]
    out['credit_rate'] = credit_to_gdp - _lag(credit_to_gdp)

    # Yield curve slope.
    out['slope'] = out[raw['ltrate']] - out[raw['stir']]

    # Debt-to-GDP level and one-year change.
    out['debtgdp_level'] = out[raw['debtgdp']]
    out['debtgdp_change'] = out[raw['debtgdp']] - _lag(out[raw['debtgdp']])

    # Global aggregates: cross-country mean per year, broadcast to every row.
    by_year = out.groupby(y)
    out['global_credit_rate'] = by_year['credit_rate'].transform('mean')
    out['global_slope'] = by_year['slope'].transform('mean')
    return out


def find_crisis_onsets(df, cfg=CONFIG):
    """List the (country, year) rows at which a crisis starts.

    A row is an onset when its crisis flag equals 1 and the preceding row of
    the same country (after sorting by year) has flag 0; the first row of a
    country is treated as having a preceding flag of 0.

    Args:
        df: Panel containing the country, year and crisis columns.
        cfg: Configuration dict providing ``country_col``, ``year_col`` and
            ``crisis_col``.

    Returns:
        DataFrame with the country column and a ``crisis_year`` column, one
        row per onset, with a fresh 0..n-1 index.
    """
    country_key = cfg['country_col']
    year_key = cfg['year_col']
    crisis_key = cfg['crisis_col']

    ordered = df.sort_values([country_key, year_key])

    previous_flag = (
        ordered.groupby(country_key)[crisis_key]
        .shift(1)
        .fillna(0)
        .astype(int)
    )
    starts_here = ordered[crisis_key].eq(1) & previous_flag.eq(0)

    onsets = ordered.loc[starts_here, [country_key, year_key]]
    onsets = onsets.rename(columns={year_key: 'crisis_year'})
    return onsets.reset_index(drop=True)


def compute_recovery_time(country_df, crisis_year, gdp_col, year_col, pre_peak_window, max_horizon):
    """Years until GDP first regains its pre-crisis peak.

    The pre-crisis peak is the maximum of *gdp_col* over the years
    ``crisis_year - pre_peak_window`` .. ``crisis_year - 1``.  Recovery is the
    first year in ``crisis_year + 1`` .. ``crisis_year + max_horizon`` whose
    GDP is at least that peak.

    Args:
        country_df: Rows for a single country.
        crisis_year: Year of the crisis onset.
        gdp_col: Name of the GDP column.
        year_col: Name of the year column.
        pre_peak_window: Number of years before the crisis to scan for the peak.
        max_horizon: Number of years after the crisis to scan for recovery.

    Returns:
        ``float``: recovery year minus *crisis_year*; ``float(max_horizon)``
        when no post-crisis observation reaches the peak; ``nan`` when there
        is no usable pre-crisis GDP or no post-crisis observation at all.
    """
    years = country_df[year_col]

    # Pre-crisis peak GDP.
    before_mask = years.between(crisis_year - pre_peak_window, crisis_year - 1)
    pre_gdp = country_df.loc[before_mask, gdp_col].dropna()
    if pre_gdp.empty:
        return np.nan
    peak = float(pre_gdp.max())

    # Post-crisis observations with both year and GDP present.
    after_mask = years.between(crisis_year + 1, crisis_year + max_horizon)
    after = country_df.loc[after_mask, [year_col, gdp_col]].dropna()
    if after.empty:
        return np.nan

    hit_years = after[year_col][after[gdp_col] >= peak]
    if hit_years.empty:
        # Never recovered inside the horizon: report the cap.
        return float(max_horizon)

    first_recovered = int(hit_years.min())
    return float(first_recovered - crisis_year)

# In [24]:
def clean_data(
    df,
    training_columns=None,
    feature_means=None,
    is_training=True,
    verbose=False,
    cfg=None
):
    """Build the model-ready feature matrix for training or prediction.

    Training mode (``is_training=True``):
        The raw panel is cleaned (``basic_panel_clean``), features are added
        (``add_engineered_features``), crisis onsets are located
        (``find_crisis_onsets``) and a recovery time is computed for each
        onset (``compute_recovery_time``).  Episodes without a recovery time
        are dropped, and episodes at the horizon cap are dropped as well when
        ``cfg['drop_unrecovered']`` is truthy.  Features are taken from the
        panel row of the crisis year; missing values are filled with the
        column mean across episodes.  A panel with no crisis onsets yields
        empty ``X`` / ``y`` rather than an error.

    Prediction mode (``is_training=False``):
        *df* must already contain ``cfg['feature_cols']``.  Missing values
        are filled from *feature_means* when given, otherwise from the column
        means of *df* itself; columns are then reindexed to
        *training_columns* (absent columns filled with 0) when given.

    Args:
        df: Raw panel (training) or feature table (prediction).
        training_columns: Column order captured at training time; used only
            in prediction mode.
        feature_means: Mapping of feature name to fill value captured at
            training time; used only in prediction mode.
        is_training: Selects the mode described above.
        verbose: Print episode counts and feature names in training mode.
        cfg: Configuration dict; defaults to the module-level ``CONFIG``.

    Returns:
        Training mode: ``(X, y, training_cols, continuous_cols,
        feature_means)``.  Prediction mode: ``X`` only.

    Raises:
        ValueError: If a required column is missing.
    """
    if cfg is None:
        cfg = CONFIG

    feature_cols = cfg['feature_cols']

    if is_training:

        panel = basic_panel_clean(df, cfg)
        panel = add_engineered_features(panel, cfg)

        onsets = find_crisis_onsets(panel, cfg)
        if verbose:
            print('Crisis found:', len(onsets))

        c = cfg['country_col']
        ycol = cfg['year_col']
        gdp_col = cfg['gdp_col']

        _check_columns(panel, [gdp_col], 'GDP column for recovery definition')

        # Loop-invariant settings for the recovery definition.
        pre_peak_window = int(cfg['pre_peak_window'])
        max_horizon = int(cfg['max_horizon'])

        episodes = []
        for _, row in onsets.iterrows():
            country = row[c]
            t0 = int(row['crisis_year'])
            cdf = panel[panel[c] == country]

            rt = compute_recovery_time(
                cdf,
                crisis_year=t0,
                gdp_col=gdp_col,
                year_col=ycol,
                pre_peak_window=pre_peak_window,
                max_horizon=max_horizon
            )
            episodes.append({c: country, 'crisis_year': t0, 'recovery_time': rt})

        # Explicit columns/dtypes keep the frame well-formed when there are
        # no onsets; a column-less frame would make dropna(subset=...) raise
        # KeyError and the merge keys would have mismatched dtypes.
        episodes = pd.DataFrame(episodes, columns=[c, 'crisis_year', 'recovery_time'])
        episodes = episodes.astype({'crisis_year': 'int64', 'recovery_time': 'float64'})
        episodes = episodes.dropna(subset=['recovery_time']).copy()
        if cfg.get('drop_unrecovered', False):
            # Episodes equal to the cap never recovered inside the horizon.
            episodes = episodes[
                episodes['recovery_time'] < float(max_horizon)
            ].copy()

        # Attach the features observed in the crisis year itself.
        panel_at_t0 = panel.rename(columns={ycol: 'crisis_year'})[
            [c, 'crisis_year'] + feature_cols
        ].copy()
        episodes = episodes.merge(panel_at_t0, on=[c, 'crisis_year'], how='left')

        X = episodes[feature_cols].copy()
        y = episodes['recovery_time'].copy()

        # Mean-impute; the means are returned so prediction can reuse them.
        continuous_cols = feature_cols[:]
        means = X[continuous_cols].mean(numeric_only=True)
        X[continuous_cols] = X[continuous_cols].fillna(means)

        training_cols = X.columns.tolist()
        feature_means = means.to_dict()

        if verbose:
            print('Episode dataset rows:', len(episodes))
            print('Features:', training_cols)

        return X, y, training_cols, continuous_cols, feature_means

    else:
        _check_columns(df, feature_cols, 'prediction Excel/template')
        X = df[feature_cols].copy()

        if feature_means is not None:
            X = X.fillna(feature_means)
        else:
            # No training statistics supplied: fall back to this table's means.
            X = X.fillna(X.mean(numeric_only=True))

        if training_columns is not None:
            X = X.reindex(columns=training_columns, fill_value=0)

        return X
