In [1]:
import numpy as np
import pandas as pd
import pickle

col_names = pd.read_csv('../data/dataset.csv', nrows=0).columns
dtype_map = {'class' : np.int16, 'bankruptcy_after_years' : np.int16}
dtype_map.update({col: np.float64 for col in col_names if col not in dtype_map})

df = pd.read_csv('../data/dataset.csv', dtype=dtype_map)
df = df.drop([df.columns[0], df.columns[1], df.columns[2], 'year'], axis=1)
df.drop_duplicates(keep=False, inplace=True)

df.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class,bankruptcy_after_years
0,0.20055,0.37951,0.39641,2.0472,32.351,0.38825,0.24976,1.3305,1.1389,0.50494,...,0.39718,0.87804,0.001924,8.416,5.1372,82.658,4.4158,7.4277,0,0
1,0.20912,0.49988,0.47225,1.9447,14.786,0.0,0.25834,0.99601,1.6996,0.49788,...,0.42002,0.853,0.0,4.1486,3.2732,107.35,3.4,60.987,0,0
2,0.24866,0.69592,0.26713,1.5548,-1.1523,0.0,0.30906,0.43695,1.309,0.30408,...,0.81774,0.76599,0.69484,4.9909,3.951,134.27,2.7185,5.2078,0,0
3,0.081483,0.30734,0.45879,2.4928,51.952,0.14988,0.092704,1.8661,1.0571,0.57353,...,0.14207,0.94598,0.0,4.5746,3.6147,86.435,4.2228,5.5497,0,0
4,0.18732,0.61323,0.2296,1.4063,-7.3128,0.18732,0.18732,0.6307,1.1559,0.38677,...,0.48431,0.86515,0.12444,6.3985,4.3158,127.21,2.8692,7.898,0,0


Impute missing values before selecting top-10 features based on mutual information gain with nonparametrical model (kNN, k=3). Mutual information gain criteria is suitable for feature selection from the data having both linear and non-linear dependencies.

In [2]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer
import gc
import numpy as np

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_data = imp.fit_transform(df)
imp_data_df = pd.DataFrame(imp_data, index=df.index, columns=df.columns)

X = imp_data_df[imp_data_df.columns.difference(['bankruptcy_after_years', 'class'])]
Y = imp_data_df['bankruptcy_after_years']

del imp_data, imp_data_df
gc.collect()

0

In [3]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif

def select_k_best(score):
    selector = SelectKBest(score, k=10)
    X_new = selector.fit_transform(X, Y)
    names = X.columns.values[selector.get_support()]
    scores = selector.scores_[selector.get_support()]
    names_scores = list(zip(names, scores))
    df_reduced = pd.DataFrame(data = names_scores, columns=['feature_names', 'mutual_info'])

    df_reduced = df_reduced.sort_values(['mutual_info', 'feature_names'], ascending = [False, True])
    print(df_reduced)
    return df_reduced.feature_names

In [4]:
frs_mi = select_k_best(mutual_info_classif)

  feature_names  mutual_info
6        Attr27     0.028791
9        Attr42     0.015943
4        Attr24     0.015930
5        Attr26     0.015656
3        Attr22     0.015163
8        Attr41     0.015017
0        Attr13     0.014908
1        Attr15     0.014380
2        Attr16     0.014263
7        Attr35     0.013563


As it is seen from the results, the most important feature in classifying whether the company becomes bankrupted after 0..5 years is the year attribute.
Features ranking:
1. X27 profit on operating activities / financial expenses
2. X42 profit on operating activities / sales
3. X24 gross profit (in 3 years) / total assets
4. X26 (net profit + depreciation) / total liabilities
5. X22 profit on operating activities / total assets
6. X41 total liabilities / ((profit on operating activities + depreciation) * (12/365))
7. X13 (gross profit + depreciation) / sales
8. X15 (total liabilities * 365) / (gross profit + depreciation)
9. X16 (gross profit + depreciation) / total liabilities
10. X35 profit on sales / total assets

In [5]:
frs_anova = select_k_best(f_classif)

  feature_names  mutual_info
3        Attr29    32.409653
0         Attr1    28.285127
4         Attr3    14.394464
5        Attr39    14.271093
7        Attr51    14.130653
1         Attr2    13.383507
9         Attr6    12.332796
8        Attr57     9.848139
6        Attr48     8.010317
2        Attr25     6.893714


In [6]:
import pickle

with open('../models/features.pkl', 'rb') as f:
    kendall_frs = pickle.load(f)
    print(kendall_frs)

['Attr61', 'Attr53', 'Attr8', 'Attr47', 'Attr17', 'Attr19', 'Attr56', 'Attr44', 'Attr33', 'Attr23', 'Attr31', 'Attr32', 'Attr2', 'Attr43', 'Attr6', 'Attr20', 'Attr60', 'Attr38', 'Attr25', 'Attr10', 'Attr29', 'Attr45', 'Attr52', 'Attr21']


XGBoost model and k-fold cross-validation, binary classification:

In [7]:
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
    
    
def save_model(model, frs, label, out_dir):
    model.fit(frs, label)
    pickle.dump(model, open(out_dir,'wb'))

def cross_validate(out_dir, frs, label):  
    model = pickle.load(open(out_dir,'rb'))   
    kfold = KFold(n_splits=5)
    results = cross_val_score(model, frs, label, cv=kfold)
    print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))    

Fit and save models:
1. XGBoost classifier with all features (original data with missing values);
2. Gaussian Naive Bayes (imputed missing values);
3. Random forest classifier (imputed dataset reduced to the features subset based on Kendalls' tau in range [-0.8, 0.8] selected from original data having missing values);
4. Random forest classifier (imputed dataset reduced to top-10 features subset based on mutual information gain, kNN, k=3);
5. Random forest classifier (imputed dataset reduced to top-10 features subset based on ANOVA)

In [8]:
all_frs = df[df.columns.difference(['bankruptcy_after_years', 'class'])] # with NaNs
label = df['bankruptcy_after_years']

models = [(xgboost.XGBClassifier(), all_frs, '../models/xgboost.pkl'),
          (GaussianNB(), X, '../models/gaussiannb.pkl'), #imputed NaNs
          (RandomForestClassifier(), X[kendall_frs], '../models/rand_forest_kendall.pkl'), #imputed NaNs
          (RandomForestClassifier(), X[frs_mi], '../models/rand_forest_mi.pkl'), #imputed NaNs
          (RandomForestClassifier(), X[frs_anova], '../models/rand_forest_anova.pkl') #imputed NaNs
         ]
for model, frs, out_dir in models:
    save_model(model, frs, label, out_dir)

In [9]:
for model, frs, out_dir in models:    
    cross_validate(out_dir, frs, label)

Accuracy: 95.10% (3.51%)
Accuracy: 1.73% (0.21%)
Accuracy: 94.56% (3.21%)
Accuracy: 94.60% (3.26%)
Accuracy: 94.60% (3.22%)


- As it is seen from the cross-validation results (5-fold), Gaussian Naive Bayes classifier needs tuning due to extremely low performance.
- XGBoost classifier outperforms on non-reduced data - it means that data reduction and/or imputation strategy should be tuned.
