## Week 6. Case Study - Python

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier,AdaBoostClassifier
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, auc 

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from PrepareData import DataPreparation
from BuildModel import ModelResults

In [5]:
data = pd.read_csv('data.csv')
df = data.copy()
df.head(3)

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474


### Clean column names

In [6]:
cleaner = DataPreparation(df)

In [7]:
df = cleaner.clean_column_names()
df.columns

Index(['bankrupt_', '_roa_c__before_interest_and_depreciation_before_interest',
       '_roa_a__before_interest_and___after_tax',
       '_roa_b__before_interest_and_depreciation_after_tax',
       '_operating_gross_margin', '_realized_sales_gross_margin',
       '_operating_profit_rate', '_pre_tax_net_interest_rate',
       '_after_tax_net_interest_rate',
       '_non_industry_income_and_expenditure_revenue',
       '_continuous_interest_rate__after_tax_', '_operating_expense_rate',
       '_research_and_development_expense_rate', '_cash_flow_rate',
       '_interest_bearing_debt_interest_rate', '_tax_rate__a_',
       '_net_value_per_share__b_', '_net_value_per_share__a_',
       '_net_value_per_share__c_', '_persistent_eps_in_the_last_four_seasons',
       '_cash_flow_per_share', '_revenue_per_share__yuan___',
       '_operating_profit_per_share__yuan___',
       '_per_share_net_profit_before_tax__yuan___',
       '_realized_sales_gross_profit_growth_rate',
       '_operating_profit

In [10]:
df = df.rename(columns={'bankrupt_': 'bankrupt?'})
df.columns

Index(['bankrupt?', '_roa_c__before_interest_and_depreciation_before_interest',
       '_roa_a__before_interest_and___after_tax',
       '_roa_b__before_interest_and_depreciation_after_tax',
       '_operating_gross_margin', '_realized_sales_gross_margin',
       '_operating_profit_rate', '_pre_tax_net_interest_rate',
       '_after_tax_net_interest_rate',
       '_non_industry_income_and_expenditure_revenue',
       '_continuous_interest_rate__after_tax_', '_operating_expense_rate',
       '_research_and_development_expense_rate', '_cash_flow_rate',
       '_interest_bearing_debt_interest_rate', '_tax_rate__a_',
       '_net_value_per_share__b_', '_net_value_per_share__a_',
       '_net_value_per_share__c_', '_persistent_eps_in_the_last_four_seasons',
       '_cash_flow_per_share', '_revenue_per_share__yuan___',
       '_operating_profit_per_share__yuan___',
       '_per_share_net_profit_before_tax__yuan___',
       '_realized_sales_gross_profit_growth_rate',
       '_operating_profit

### Data Types

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   bankrupt?                                                 6819 non-null   int64  
 1   _roa_c__before_interest_and_depreciation_before_interest  6819 non-null   float64
 2   _roa_a__before_interest_and___after_tax                   6819 non-null   float64
 3   _roa_b__before_interest_and_depreciation_after_tax        6819 non-null   float64
 4   _operating_gross_margin                                   6819 non-null   float64
 5   _realized_sales_gross_margin                              6819 non-null   float64
 6   _operating_profit_rate                                    6819 non-null   float64
 7   _pre_tax_net_interest_rate                                6819 non-null   float64
 8   _after_tax_net_int

### Missing values

In [12]:
missing_values = cleaner.visualize_missing(visualize=False, fig_height = 20, fig_width = 15)
missing_values

Unnamed: 0,num_of_missing,%_of_total
bankrupt_,0.0,0.0
_roa_c__before_interest_and_depreciation_before_interest,0.0,0.0
_roa_a__before_interest_and___after_tax,0.0,0.0
_roa_b__before_interest_and_depreciation_after_tax,0.0,0.0
_operating_gross_margin,0.0,0.0
...,...,...
_liability_to_equity,0.0,0.0
_degree_of_financial_leverage__dfl_,0.0,0.0
_interest_coverage_ratio__interest_expense_to_ebit_,0.0,0.0
_net_income_flag,0.0,0.0


In [13]:
missing_values[missing_values['num_of_missing']>0].shape[0]

0

No missing values.

### Duplicated values

In [14]:
cleaner.check_duplicates()

Numer of duplicated values:  0


Unnamed: 0,bankrupt_,_roa_c__before_interest_and_depreciation_before_interest,_roa_a__before_interest_and___after_tax,_roa_b__before_interest_and_depreciation_after_tax,_operating_gross_margin,_realized_sales_gross_margin,_operating_profit_rate,_pre_tax_net_interest_rate,_after_tax_net_interest_rate,_non_industry_income_and_expenditure_revenue,...,_net_income_to_total_assets,_total_assets_to_gnp_price,_no_credit_interval,_gross_profit_to_sales,_net_income_to_stockholder_s_equity,_liability_to_equity,_degree_of_financial_leverage__dfl_,_interest_coverage_ratio__interest_expense_to_ebit_,_net_income_flag,_equity_to_liability


No duplicated values

### Correlations

In [15]:
corr_table = df.corr()

In [16]:
corr_table = corr_table[['bankrupt?']].sort_values(by='bankrupt?', ascending=False)
corr_table

Unnamed: 0,bankrupt?
bankrupt?,1.000000
_debt_ratio__,0.250161
_current_liability_to_assets,0.194494
_borrowing_dependency,0.176543
_current_liability_to_current_assets,0.171306
...,...
_roa_c__before_interest_and_depreciation_before_interest,-0.260807
_roa_b__before_interest_and_depreciation_after_tax,-0.273051
_roa_a__before_interest_and___after_tax,-0.282941
_net_income_to_total_assets,-0.315457


### Bagging Models

In [17]:
X = df.drop('bankrupt?',axis=1)
y = df['bankrupt?']

In [18]:
modeller = ModelResults()

#### Random Forest

In [19]:
res, res_for_plot = modeller.ClassificationModels(X,y,"RandomForest",
        RandomForestClassifier(n_estimators=150, random_state=111))

In [20]:
results_df = modeller.MetricsDf()
results_df

Unnamed: 0,RandomForest
accuracy,0.973607
precision,0.6
recall,0.206897
roc_auc,0.914981
gini,0.829963


#### BaggingSVM

In [48]:
res, res_for_plot = modeller.ClassificationModels(X,y,"BaggingSVM(rbf)",
        BaggingClassifier(estimator=SVC(kernel='rbf', probability=True)))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [49]:
results_df = modeller.MetricsDf()
results_df

Unnamed: 0,RandomForest,BaggingSVM(rbf),ExtraTreeBagging,Adaboost,XGBoost,CatBoost,LightGBM
accuracy,0.973607,0.971652,0.973118,0.967742,0.971163,0.974585,0.974585
precision,0.6,0.0,0.6,0.416667,0.487179,0.607143,0.6
recall,0.206897,0.0,0.155172,0.344828,0.327586,0.293103,0.310345
roc_auc,0.914981,0.57201,0.929382,0.929534,0.943055,0.940453,0.948579
gini,0.829963,0.144019,0.858765,0.859068,0.88611,0.880906,0.897159


#### ExtraTreeClassifier

In [23]:
res, res_for_plot = modeller.ClassificationModels(X,y,"ExtraTreeBagging",
        ExtraTreesClassifier(n_estimators=100,random_state=111))

In [24]:
results_df = modeller.MetricsDf()
results_df

Unnamed: 0,RandomForest,BaggingSVM(rbf),ExtraTreeBagging
accuracy,0.973607,0.971652,0.973118
precision,0.6,0.0,0.6
recall,0.206897,0.0,0.155172
roc_auc,0.914981,0.578679,0.929382
gini,0.829963,0.157358,0.858765


#### BaggingLogistic

### Boosting Algorithms

#### Adaboost

In [29]:
res, res_for_plot = modeller.ClassificationModels(X,y,"Adaboost",
        AdaBoostClassifier(n_estimators=100,random_state=111))

In [30]:
results_df = modeller.MetricsDf()
results_df

Unnamed: 0,RandomForest,BaggingSVM(rbf),ExtraTreeBagging,Adaboost
accuracy,0.973607,0.971652,0.973118,0.967742
precision,0.6,0.0,0.6,0.416667
recall,0.206897,0.0,0.155172,0.344828
roc_auc,0.914981,0.578679,0.929382,0.929534
gini,0.829963,0.157358,0.858765,0.859068


#### XGBoost

In [31]:
res, res_for_plot = modeller.ClassificationModels(X,y,"XGBoost",
        XGBClassifier(n_estimators=100,random_state=111))

In [32]:
results_df = modeller.MetricsDf()
results_df

Unnamed: 0,RandomForest,BaggingSVM(rbf),ExtraTreeBagging,Adaboost,XGBoost
accuracy,0.973607,0.971652,0.973118,0.967742,0.971163
precision,0.6,0.0,0.6,0.416667,0.487179
recall,0.206897,0.0,0.155172,0.344828,0.327586
roc_auc,0.914981,0.578679,0.929382,0.929534,0.943055
gini,0.829963,0.157358,0.858765,0.859068,0.88611


#### Catboost

In [37]:
res, res_for_plot = modeller.ClassificationModels(X,y,"CatBoost",
        CatBoostClassifier(n_estimators=150,random_state=111))

Learning rate set to 0.114367
0:	learn: 0.4883129	total: 39.6ms	remaining: 5.91s
1:	learn: 0.3527190	total: 57.8ms	remaining: 4.28s
2:	learn: 0.2710804	total: 71.2ms	remaining: 3.49s
3:	learn: 0.2126300	total: 85.2ms	remaining: 3.11s
4:	learn: 0.1708074	total: 98.7ms	remaining: 2.86s
5:	learn: 0.1460868	total: 111ms	remaining: 2.67s
6:	learn: 0.1288593	total: 124ms	remaining: 2.54s
7:	learn: 0.1160329	total: 138ms	remaining: 2.46s
8:	learn: 0.1076554	total: 151ms	remaining: 2.37s
9:	learn: 0.1014754	total: 165ms	remaining: 2.31s
10:	learn: 0.0961702	total: 179ms	remaining: 2.26s
11:	learn: 0.0905498	total: 193ms	remaining: 2.21s
12:	learn: 0.0873348	total: 209ms	remaining: 2.2s
13:	learn: 0.0827135	total: 223ms	remaining: 2.16s
14:	learn: 0.0792408	total: 237ms	remaining: 2.13s
15:	learn: 0.0767163	total: 250ms	remaining: 2.1s
16:	learn: 0.0742782	total: 264ms	remaining: 2.06s
17:	learn: 0.0715311	total: 277ms	remaining: 2.03s
18:	learn: 0.0701097	total: 292ms	remaining: 2.01s
19:	lear

In [38]:
results_df = modeller.MetricsDf()
results_df

Unnamed: 0,RandomForest,BaggingSVM(rbf),ExtraTreeBagging,Adaboost,XGBoost,CatBoost
accuracy,0.973607,0.971652,0.973118,0.967742,0.971163,0.974585
precision,0.6,0.0,0.6,0.416667,0.487179,0.607143
recall,0.206897,0.0,0.155172,0.344828,0.327586,0.293103
roc_auc,0.914981,0.578679,0.929382,0.929534,0.943055,0.940453
gini,0.829963,0.157358,0.858765,0.859068,0.88611,0.880906


#### LightGBM

In [39]:
res, res_for_plot = modeller.ClassificationModels(X,y,"LightGBM",
        LGBMClassifier(n_estimators=150,random_state=111))

[LightGBM] [Info] Number of positive: 162, number of negative: 4611
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003906 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23714
[LightGBM] [Info] Number of data points in the train set: 4773, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.033941 -> initscore=-3.348604
[LightGBM] [Info] Start training from score -3.348604


In [40]:
results_df = modeller.MetricsDf()
results_df

Unnamed: 0,RandomForest,BaggingSVM(rbf),ExtraTreeBagging,Adaboost,XGBoost,CatBoost,LightGBM
accuracy,0.973607,0.971652,0.973118,0.967742,0.971163,0.974585,0.974585
precision,0.6,0.0,0.6,0.416667,0.487179,0.607143,0.6
recall,0.206897,0.0,0.155172,0.344828,0.327586,0.293103,0.310345
roc_auc,0.914981,0.578679,0.929382,0.929534,0.943055,0.940453,0.948579
gini,0.829963,0.157358,0.858765,0.859068,0.88611,0.880906,0.897159


### Summary

Based on the results,  we can tell that Adaboost is the best for our case in terms of Recall.