In [30]:
"""

SUPPLEMENTARY CODE FOR BOE SWP 848:
Credit Growth, the Yield Curve and Financial Crisis Prediction: Evidence from a Machine Learning Approach

"""

import numpy as np
import hashlib

class Config:
    """ Creates a config object that specifies how the data is processed and how the experiment is run.
        The default values assigned here can be altered by the user in the experiment files (see experiments folder)
     """


    def __init__(self):

         # Path of R is needed as the decision tree is trained in R.
        self.r_path = None # e.g. 'C:\\Program Files\\R\\R-3.5.1\\bin\\x64\\Rscript'

        #### The following parameters determine how the data is processed ####
        self.data_horizon = 2  # Horizon of percentage and ratio changes (in years)

        self.data_period = 'all'  # The time frame investigate. Either 'all' observations,
        # or 'pre-ww2' or 'post-ww2'.
        self.crisis_definition = "JST" # Selects the crisis definition JST, BR (Baron) , RR (Reinhart–Rogoff)

        self.data_exclude_extreme_period = True  # Whether to exclude WW1, WW2 and
        # the Great Depression
        self.data_include_crisis_year = False  # Whether to exclude the actual crisis
        # observation and only predict years a head of a crisis
        self.data_crisis_lag = 2  # number of years before a crisis for
        self.data_crisis_single_lag = False # If FALSE: when crisis is j, then all years 1,2,3,..,j is set to positive label). This is our standard approach.
        # which outcome is set positive
        self.data_post_crisis = 4  #  How many observations (in years) after the
        # crisis should be deleted to avoid post-crisis bias

        self.clustered_crises = "" #  If set to 'remove' clustered crises are removed from the sample, if set to 'only' clustered crises are the only crises predicted
        # clustered crises are: 1907,1908,1930, 1931, 2007, 2008


        #### The following parameters determine experimental details ####

        self.exp_n_kernels = 1  # The number of kernels of the CPU used in parallel
        self.exp_nfolds = 5  # Number of folds in the cross-validation experiment.

        self.exp_algos = ['extree', "log"]  # list of algorithms that are tested in the experiment
        self.n_trees = 1000 # relevant for the random forest and extreme trees
        self.exp_year_split = None  # If 'None' the cross-validation experiment is run.
        # If it is a year y all instances up to that year are used for training and
        # the following observations for testing the model. The latter option is used for forecasting

        self.exp_id = "crisis"
        # This variable specifies constraints for the cross-validation
        # 'no': no constraint used
        # 'crisis': the observation of a crisis (by default 1-2 years before crisis)
        #   are assigned to the same fold
        # 'year': all observations of a certain year are assigned to the same fold
        # 'year_and_crisis' combination of the two constraints above

        # Hyperparameter search
        self.exp_verbose = 0  # Determines how verbose the output of the hyperparameter search is.
        self.exp_hyper_folds = 5  # Number of folds in the cross-validation of the hyperparameters
        self.exp_rep_cv = 1  # How often the cross-validation of the hyperparameters is repeated.
        self.exp_search = "grid"  # Either we use full 'grid' search or 'random' search
        self.exp_n_iter_rsearch = 250  # How many hyperparamter combinations are tested in the random search
        self.exp_optimization_metric = 'roc_auc'  # Metric that is optimized in the hyperparameter search

        # Shapley
        self.exp_do_shapley = True  # Whether Shapley values are computed
        self.exp_shap_background = 50  # Number of background samples used by the Shapley Kernel explainer
        self.exp_shapley_interaction = False  # Whether interactions of Shapley values are computed



        self.exp_taylor_k = 3 # lever of taylor expansion for interaction
        self.exp_taylor_samples = 10000 # maximum number of coalitions sampled used when estimating Shapley values using the Taylor index
        self.exp_taylor_feature_select = None # selected feature for which we want to compute Shapley values.
        # If None, all features are selected, which is computationally demanding. If smaller number of features is selected,
        # The remaining features are joinly treated as "others".

        self.exp_error_costs = "0.5"  # cost associated with the false positive
        # and false negative error. If set to '0.5', both errors are treated as equally important
        #  If set to 'balanced' the error of the minority classes is upweighted
        # such that the product of the error-weight and the proportion of objects
        # in the class is equivalent for both classes.

        # The error costs can also be set to arbitrary values using a dictionary,
        # e.g. {0: 0.1, 1: 0.9}. This means that the error in the positive class
        # are 9 times more important than the error in the negative class.

        self.exp_do_upsample = False  # whether the minority class is upsampled
        # according to the error costs (see above). If False, the objects are weighted
        # according to the error costs. Note that the weighting of objects is not
        # supported by all algorithms.

        self.exp_bootstrap = "no" # bootstrapping the training set with the options
        # no (no bootstrappoing), up (upsampling), down (downsampling)
        self.exp_bootstrap_replace = "no" # # whether to resample the minority class by replacement as well


    def _make_name(self, name_appx=""):
        """Creates a descriptive name according to the configuration.
        This name is used when saving the files in the results folder.
        It is based on some of the experiments parameters but the user
        can also add a suffix to the name with the name_appx argument
        """
        if self.exp_year_split is None:
            expName = ""
        else:
            expName = "year" + str(int(self.exp_year_split))
        name = str(expName)
        if name_appx != "":
            name = name + "_" + str(name_appx)

        if name == "":
            return name
        else:
            return "_" + name

In [43]:
import pandas as pd
import numpy as np

# 1. JST 데이터 불러오기
from google.colab import files
uploaded = files.upload()

jst_df = pd.read_csv("JSTdatasetR3.CSV")

# 2. 변수 생성
jst_df.sort_values(by=['country', 'year'], inplace=True)
jst_df['yield_curve_slope'] = jst_df['ltrate'] - jst_df['stir']
jst_df['credit_gdp'] = jst_df['tloans'] / jst_df['gdp']
jst_df['credit_gdp_2y_diff'] = jst_df.groupby('country')['credit_gdp'].transform(lambda x: x.diff(2) * 100)
jst_df['cpi_2y_growth'] = jst_df.groupby('country')['cpi'].transform(lambda x: (x / x.shift(2) - 1) * 100)
jst_df['debt_service_ratio'] = jst_df['stir'] * jst_df['tloans'] / jst_df['gdp']
jst_df['debt_service_ratio'] = jst_df.groupby('country')['debt_service_ratio'].transform(lambda x: x.diff(2) * 100)
jst_df['consumption_growth'] = jst_df.groupby('country')['rconpc'].transform(lambda x: (x / x.shift(2) - 1) * 100)
jst_df['investment_2y_diff'] = jst_df.groupby('country')['iy'].transform(lambda x: x.diff(2) * 100)
jst_df['public_debt_2y_diff'] = jst_df.groupby('country')['debtgdp'].transform(lambda x: x.diff(2) * 100)
jst_df['broad_money'] = jst_df['money'] / jst_df['gdp']
jst_df['broad_money_2y_diff'] = jst_df.groupby('country')['broad_money'].transform(lambda x: x.diff(2) * 100)
jst_df['stock_price_2y_growth'] = jst_df.groupby('country')['stocks'].transform(lambda x: (x / x.shift(2) - 1) * 100)
jst_df['current_account'] = jst_df['ca'] / jst_df['gdp']
jst_df['current_account_2y_diff'] = jst_df.groupby('country')['current_account'].transform(lambda x: x.diff(2) * 100)

# 글로벌 변수 생성
jst_df['global_yield_curve_slope'] = jst_df.groupby('year')['yield_curve_slope'].transform(lambda x: (x.sum() - x) / (x.count() - 1))
jst_df['global_credit_gdp_2y_diff'] = jst_df.groupby('year')['credit_gdp_2y_diff'].transform(lambda x: (x.sum() - x) / (x.count() - 1))

# 3. configure.py로부터 설정 불러오기
config = Config()

# 4. 위기 정의와 타겟 생성
from google.colab import files
uploaded = files.upload()
crises_def = pd.read_csv("crises_definitions.csv")
crises_def = crises_def[['iso', 'year', 'crisis' + config.crisis_definition]]
crises_def = crises_def.rename(columns={'crisis' + config.crisis_definition: 'crisis'})
crises_def = crises_def[crises_def['crisis'] == 1]

label_rows = []
for _, row in crises_def.iterrows():
    for lag in range(1, config.data_crisis_lag + 1):
        label_rows.append({'iso': row['iso'], 'year': row['year'] - lag, 'crisis': 1})

label_df = pd.DataFrame(label_rows)

# 4-2. 국가별 post-crisis 라벨링
remove_rows = []
for _, row in crises_def.iterrows():
    for k in range(config.data_post_crisis + 1):
        remove_rows.append({'iso': row['iso'], 'year': row['year'] + k, 'remove': 1})

remove_df = pd.DataFrame(remove_rows)

# 병합 및 누락값 처리
label_df = label_df.merge(remove_df, on=['iso', 'year'], how='left')
label_df['crisis'] = label_df['crisis'].fillna(0).astype(int)
label_df['remove'] = label_df['remove'].fillna(0).astype(int)

# JST 데이터와 병합
jst_df = jst_df.merge(label_df, on=['iso', 'year'], how='left')
jst_df['crisis'] = jst_df['crisis'].fillna(0).astype(int)
jst_df['remove'] = jst_df['remove'].fillna(0).astype(int)

# 타겟 정의
jst_df['target'] = ((jst_df['crisis'] == 1) & (jst_df['remove'] == 0)).astype(int)

# 5. 최종 저장
final_columns = [
    'country', 'iso', 'year', 'target',
    'yield_curve_slope', 'credit_gdp_2y_diff', 'cpi_2y_growth',
    'debt_service_ratio', 'consumption_growth', 'investment_2y_diff',
    'public_debt_2y_diff', 'broad_money_2y_diff', 'stock_price_2y_growth',
    'current_account_2y_diff', 'global_yield_curve_slope', 'global_credit_gdp_2y_diff'
]

final_data = jst_df[final_columns]
final_data.to_csv("final_crisis_dataset_configured.csv", index=False)


Saving JSTdatasetR3.CSV to JSTdatasetR3 (12).CSV


Saving crises_definitions.csv to crises_definitions (9).csv


In [53]:
from google.colab import files
files.download("final_crisis_dataset_configured.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [54]:
# 추가: 결측치가 하나라도 있는 row 제거 후 저장
final_data_clean = final_data.dropna(subset=final_columns)
final_data_clean.to_csv("final_crisis_dataset_clean.csv", index=False)

In [55]:
from google.colab import files
files.download("final_crisis_dataset_clean.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [56]:
# 추가2: 1912-1918. 1931-1945 데이터 제외
# 제외할 연도 범위 설정
exclude_years = list(range(1912, 1919)) + list(range(1931, 1946))

# 제외 연도 제거
final_data_clean_period = final_data_clean[~final_data_clean['year'].isin(exclude_years)]

# 저장
final_data_clean_period.to_csv("final_crisis_dataset_clean_filtered.csv", index=False)

In [57]:
from google.colab import files
files.download("final_crisis_dataset_clean_filtered.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [45]:
print(crises_def.columns)

Index(['iso', 'year', 'crisis'], dtype='object')


In [44]:
jst_df[jst_df['target'] == 1]

Unnamed: 0,year,country,iso,ifs,pop,rgdpmad,rgdppc,rconpc,gdp,iy,...,broad_money,broad_money_2y_diff,stock_price_2y_growth,current_account,current_account_2y_diff,global_yield_curve_slope,global_credit_gdp_2y_diff,crisis,remove,target
21,1891,Australia,AUS,193,3196.0,4666.458073,19.912640,28.508848,540.300000,0.195449,...,0.441792,1.714531,-15.261324,-0.051325,2.562980,-0.194552,0.292970,1,0,1
22,1892,Australia,AUS,193,3274.0,3995.418448,16.979357,23.832012,495.700000,0.136706,...,0.485374,5.749030,-14.362519,-0.029669,4.847108,0.700349,1.427979,1,0,1
117,1987,Australia,AUS,193,16137.0,16293.301110,69.185337,64.846500,285733.000000,0.230164,...,0.518733,0.504608,31.388352,-0.036132,1.542622,0.860628,6.031609,1,0,1
118,1988,Australia,AUS,193,16400.0,16752.256100,70.833203,66.709500,324044.000000,0.239688,...,0.533894,2.790904,0.953053,-0.040681,1.199410,0.536268,6.716287,1,0,1
160,1883,Belgium,BEL,124,5740.0,3144.534146,11.612711,,5500.074884,,...,,,-16.806723,-0.037999,2.131757,-0.222006,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2410,1928,USA,USA,111,120971.0,6569.345446,17.733510,19.211100,98.305000,0.147927,...,0.557856,3.755121,71.608599,0.009053,0.432342,0.292192,2.798429,1,0,1
2464,1982,USA,USA,111,232187.8,18325.120260,59.700116,55.329400,3345.000000,0.198098,...,0.547516,0.940266,4.419476,-0.003470,-0.421301,0.962755,1.878052,1,0,1
2465,1983,USA,USA,111,234307.2,18920.156390,61.849832,57.983400,3638.100000,0.196033,...,0.564796,4.172140,32.794830,-0.012155,-1.365322,0.867681,0.632292,1,0,1
2487,2005,USA,USA,111,295583.0,30841.645500,100.000000,97.836100,13093.700000,0.227682,...,0.496754,-2.016198,16.789125,-0.056930,-1.163805,1.105175,6.338154,1,0,1


In [46]:
print(config.crisis_definition)
# 예: 출력이 'JST' 또는 'BR'이어야 함


JST


In [51]:
jst_df[(jst_df['crisis'] == 1) & (jst_df['remove'] == 0)]

Unnamed: 0,year,country,iso,ifs,pop,rgdpmad,rgdppc,rconpc,gdp,iy,...,broad_money,broad_money_2y_diff,stock_price_2y_growth,current_account,current_account_2y_diff,global_yield_curve_slope,global_credit_gdp_2y_diff,crisis,remove,target
21,1891,Australia,AUS,193,3196.0,4666.458073,19.912640,28.508848,540.300000,0.195449,...,0.441792,1.714531,-15.261324,-0.051325,2.562980,-0.194552,0.292970,1,0,1
22,1892,Australia,AUS,193,3274.0,3995.418448,16.979357,23.832012,495.700000,0.136706,...,0.485374,5.749030,-14.362519,-0.029669,4.847108,0.700349,1.427979,1,0,1
117,1987,Australia,AUS,193,16137.0,16293.301110,69.185337,64.846500,285733.000000,0.230164,...,0.518733,0.504608,31.388352,-0.036132,1.542622,0.860628,6.031609,1,0,1
118,1988,Australia,AUS,193,16400.0,16752.256100,70.833203,66.709500,324044.000000,0.239688,...,0.533894,2.790904,0.953053,-0.040681,1.199410,0.536268,6.716287,1,0,1
160,1883,Belgium,BEL,124,5740.0,3144.534146,11.612711,,5500.074884,,...,,,-16.806723,-0.037999,2.131757,-0.222006,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2410,1928,USA,USA,111,120971.0,6569.345446,17.733510,19.211100,98.305000,0.147927,...,0.557856,3.755121,71.608599,0.009053,0.432342,0.292192,2.798429,1,0,1
2464,1982,USA,USA,111,232187.8,18325.120260,59.700116,55.329400,3345.000000,0.198098,...,0.547516,0.940266,4.419476,-0.003470,-0.421301,0.962755,1.878052,1,0,1
2465,1983,USA,USA,111,234307.2,18920.156390,61.849832,57.983400,3638.100000,0.196033,...,0.564796,4.172140,32.794830,-0.012155,-1.365322,0.867681,0.632292,1,0,1
2487,2005,USA,USA,111,295583.0,30841.645500,100.000000,97.836100,13093.700000,0.227682,...,0.496754,-2.016198,16.789125,-0.056930,-1.163805,1.105175,6.338154,1,0,1


In [48]:
# 위기 정의별 분포 확인
crisis_col = "crisis" + config.crisis_definition
crises_raw = pd.read_csv("crises_definitions.csv")
print(crises_raw[crisis_col].value_counts(dropna=False))

# label_df에 위기 레이블 생성된 연도 확인
print(label_df[label_df['crisis'] == 1].sort_values('year').head())

# jst_df 병합 후 결과 확인
print(jst_df['target'].value_counts())


crisisJST
0    2409
1      90
Name: count, dtype: int64
   iso  year  crisis  remove
1  BEL  1868       1       0
3  CHE  1868       1       0
0  BEL  1869       1       0
2  CHE  1869       1       0
5  JPN  1869       1       0
target
0    2337
1     162
Name: count, dtype: int64


In [49]:
print(crises_def['year'].unique())

[1870 1871 1873 1877 1878 1882 1883 1885 1887 1889 1890 1891 1893 1899
 1900 1901 1907 1908 1910 1913 1920 1921 1922 1923 1924 1925 1927 1929
 1930 1931 1934 1935 1939 1974 1977 1984 1987 1988 1989 1990 1991 1997
 2007 2008]
