In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, roc_curve
import pickle

In [48]:
# --- import

tw = pd.read_csv('data/tw_bankruptcy.csv')
us = pd.read_csv('data/combined.csv')

In [4]:
# --- EDA tw data

compare = tw.groupby(['Bankrupt?']).mean().T
compare['difference'] = compare.iloc[:,1] - compare.iloc[:,0]
compare['%'] = round(compare.iloc[:,2]/compare.iloc[:,1],2)
compare.sort_values(by='%', ascending=False)

In [35]:
us.head()

Unnamed: 0.1,Unnamed: 0,date,symbol,reportedCurrency,fillingDate,acceptedDate,cashAndCashEquivalents,shortTermInvestments,cashAndShortTermInvestments,netReceivables,...,Net profit before tax/Paid-in capital,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,Cash/Current Liability,Current Liability to Assets,Total income/Total expense,Total expense/Assets,Equity to Long-term Liability,Current Liability to Current Assets
0,0,2021-03-31,CMCSA,USD,2021-04-29,2021-04-29 15:22:23,14950000000.0,0.0,14950000000.0,10986000000.0,...,60.527273,0.105698,0.053678,0.485216,0.485216,0.110628,1.139429,0.085727,0.61263,5.217746
1,1,2020-12-31,CMCSA,USD,2021-02-04,2021-02-03 18:08:14,11740000000.0,0.0,11740000000.0,11466000000.0,...,62.592593,0.097642,0.042867,0.407696,0.407696,0.105145,1.138935,0.088831,0.598285,5.734079
2,2,2020-09-30,CMCSA,USD,2020-10-29,2020-10-29 15:03:25,13707000000.0,0.0,13707000000.0,10310000000.0,...,37.388889,0.101766,0.050967,0.464471,0.464471,0.109731,1.085867,0.087428,0.576664,5.54854
3,3,2020-06-30,CMCSA,USD,2020-07-30 00:00:00,2020-07-30 18:31:48,13935000000.0,0.0,13935000000.0,10227000000.0,...,55.333333,0.103336,0.052392,0.489893,0.489893,0.106945,1.14416,0.077927,0.555126,5.557286
4,4,2020-03-31,CMCSA,USD,2020-04-30 00:00:00,2020-04-30 15:24:15,8516000000.0,0.0,8516000000.0,10800000000.0,...,39.759259,0.091776,0.032452,0.308551,0.308551,0.105175,1.087769,0.093217,0.544489,6.312822


In [4]:
tw = pd.read_csv('data/tw_clean.csv')
tw.head()

Unnamed: 0,Bankrupt?,Return on Assets,Net Value Growth Rate,Current Ratio,Quick Ratio,Total debt/Total net worth,Operating profit/Paid-in capital,Net profit before tax/Paid-in capital,Total Asset Turnover,Accounts Receivable Turnover,Quick Assets/Total Assets,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,Cash/Current Liability,Current Liability to Assets,Total income/Total expense,Total expense/Assets,Equity to Long-term Liability,Current Liability to Current Assets
0,1,0.424389,0.000327,0.002259,0.001208,0.021266,0.095885,0.137757,0.086957,0.001814,0.166673,0.190643,0.004094,0.001997,0.000147336,0.147308,0.002022,0.064856,0.126549,0.11825
1,1,0.538214,0.000443,0.006016,0.004039,0.012502,0.093743,0.168962,0.064468,0.001286,0.127236,0.182419,0.014948,0.004136,0.00138391,0.056963,0.002226,0.025516,0.120916,0.047775
2,1,0.499019,0.000396,0.011543,0.005348,0.021248,0.092318,0.148036,0.014993,0.001495,0.340201,0.602806,0.000991,0.006302,5340000000.0,0.098162,0.00206,0.021387,0.117922,0.025346
3,1,0.451265,0.000382,0.004194,0.002896,0.009572,0.077727,0.147561,0.089955,0.001966,0.161575,0.225815,0.018851,0.002961,0.001010646,0.098715,0.001831,0.024161,0.12076,0.06725
4,1,0.538432,0.000439,0.006022,0.003727,0.00515,0.096927,0.167461,0.175412,0.001449,0.26033,0.35838,0.014161,0.004275,0.000680464,0.110195,0.002224,0.026385,0.110933,0.047725


In [62]:
# --- Checking zero columns to avoid division to infinity
us[['totalStockholdersEquity','totalAssets','totalCurrentLiabilities','totalCurrentAssets']][us[['totalStockholdersEquity','totalAssets','totalCurrentLiabilities','totalCurrentAssets']]==0].count()

totalStockholdersEquity     1787
totalAssets                  479
totalCurrentLiabilities    19760
totalCurrentAssets         22628
dtype: int64

In [64]:
us_nozero = us.drop(us[us[['totalStockholdersEquity','totalAssets','totalCurrentLiabilities','totalCurrentAssets']]==0])
len(us_nozero)

In [49]:
# --- Calculation for selected criteria
us['Operating profit/Paid-in capital']=us['operatingIncome']/us['totalStockholdersEquity']
us['Net profit before tax/Paid-in capital']=us['netIncome']/us['totalStockholdersEquity']
us['Current Assets/Total Assets']=us['totalCurrentAssets']/us['totalAssets']
us['Cash/Total Assets']=us['cashAndCashEquivalents']/us['totalAssets']
us['Quick Assets/Current Liability']=us['cashAndShortTermInvestments']/us['totalCurrentLiabilities']
us['Cash/Current Liability']=us['cashAndCashEquivalents']/us['totalCurrentLiabilities']
us['Current Liability to Assets']=us['totalCurrentLiabilities']/us['totalAssets']
us['Total income/Total expense']=us['revenue']/(us['revenue']-us['netIncome'])
us['Total expense/Assets']=(us['revenue']-us['netIncome'])/us['totalAssets']
us['Current Liability to Current Assets']=us['totalNonCurrentLiabilities']/us['totalCurrentAssets']


In [50]:
# --- Selecting useful columns for prediction
rename = {'returnOnAssets': 'Return on Assets',
'bookValueperShareGrowth': 'Net Value Growth Rate',
'currentRatio': 'Current Ratio',
'quickRatio': 'Quick Ratio',
'debtEquityRatio': 'Total debt/Total net worth'}
us.rename(columns=rename, inplace=True)


In [51]:
# --- Selected specific columns for data purpose
temp = ['symbol','year','period','quarter']+['Operating profit/Paid-in capital','Net profit before tax/Paid-in capital','Current Assets/Total Assets','Cash/Total Assets','Quick Assets/Current Liability','Cash/Current Liability','Current Liability to Assets','Total income/Total expense','Total expense/Assets', 'Current Liability to Current Assets']+list(rename.values())
us_tw = us[temp]
us_tw.head()

Unnamed: 0,symbol,year,period,quarter,Operating profit/Paid-in capital,Net profit before tax/Paid-in capital,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,Cash/Current Liability,Current Liability to Assets,Total income/Total expense,Total expense/Assets,Current Liability to Current Assets,Return on Assets,Net Value Growth Rate,Current Ratio,Quick Ratio,Total debt/Total net worth
0,CMCSA,2021,1,81,0.054432,0.03596,0.105698,0.053678,0.485216,0.485216,0.110628,1.139429,0.085727,5.217746,0.011953,0.00685,0.955438,0.841777,1.992017
1,CMCSA,2020,4,80,0.042709,0.036844,0.097642,0.042867,0.407696,0.407696,0.105145,1.138935,0.088831,5.734079,0.012342,0.046748,0.928636,0.805876,1.985339
2,CMCSA,2020,3,79,0.046545,0.023056,0.101766,0.050967,0.464471,0.464471,0.109731,1.085867,0.087428,5.54854,0.007507,0.039556,0.927417,0.813832,2.071108
3,CMCSA,2020,2,78,0.054805,0.03524,0.103336,0.052392,0.489893,0.489893,0.106945,1.14416,0.077927,5.557286,0.011234,0.026479,0.966251,0.849429,2.136866
4,CMCSA,2020,1,77,0.058611,0.025935,0.091776,0.032452,0.308551,0.308551,0.105175,1.087769,0.093217,6.312822,0.008182,-0.011939,0.872609,0.699855,2.169987


In [56]:
us_tw[us_tw == 0].count()

symbol                                       0
year                                         0
period                                    9240
quarter                                   1833
Operating profit/Paid-in capital         40388
Net profit before tax/Paid-in capital     7593
Current Assets/Total Assets              22155
Cash/Total Assets                         7272
Quick Assets/Current Liability           18996
Cash/Current Liability                    3858
Current Liability to Assets              19476
Total income/Total expense               11633
Total expense/Assets                      1189
Current Liability to Current Assets      23056
Return on Assets                          1636
Net Value Growth Rate                    59351
Current Ratio                             9927
Quick Ratio                                739
Total debt/Total net worth                 348
dtype: int64

In [52]:
us_tw[us_tw == float('inf')].count()

symbol                                       0
year                                         0
period                                       0
quarter                                      0
Operating profit/Paid-in capital          1047
Net profit before tax/Paid-in capital     1312
Current Assets/Total Assets                 12
Cash/Total Assets                           30
Quick Assets/Current Liability            6674
Cash/Current Liability                   15875
Current Liability to Assets                202
Total income/Total expense                  29
Total expense/Assets                       464
Current Liability to Current Assets       9242
Return on Assets                             0
Net Value Growth Rate                        0
Current Ratio                                0
Quick Ratio                                  0
Total debt/Total net worth                   0
dtype: int64

In [44]:
us[us == 0].count().sort_values(ascending=False).head(20)

acquisitionsNet                       289624
taxPayables                           284889
deferredRevenueNonCurrent             263724
otherLiabilities                      256146
othertotalStockholdersEquity          256039
sellingAndMarketingExpenses           243189
otherAssets                           237643
totalInvestments                      227982
taxAssets                             222646
researchAndDevelopmentExpenses        215660
otherExpenses                         213559
rdexpenseGrowth                       213511
deferredRevenue                       210732
longTermInvestments                   205310
shortTermInvestments                  202904
tenYDividendperShareGrowthPerShare    202486
commonStockIssued                     199129
commonStockRepurchased                197816
salesMaturitiesOfInvestments          194322
effectOfForexChangesOnCash            190120
dtype: int64

In [20]:
us_tw.to_csv('data/us_bankruptcy.csv')

In [22]:
us_tw=pd.read_csv('data/us_bankruptcy.csv')
us_tw.head()

Unnamed: 0.1,Unnamed: 0,symbol,year,period,quarter,Operating profit/Paid-in capital,Net profit before tax/Paid-in capital,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,...,Current Liability to Assets,Total income/Total expense,Total expense/Assets,Equity to Long-term Liability,Current Liability to Current Assets,Return on Assets,Net Value Growth Rate,Current Ratio,Quick Ratio,Total debt/Total net worth
0,0,CMCSA,2021,1,81,91.618182,60.527273,0.105698,0.053678,0.485216,...,0.110628,1.139429,0.085727,0.61263,5.217746,0.011953,0.00685,0.955438,0.841777,1.992017
1,1,CMCSA,2020,4,80,72.555556,62.592593,0.097642,0.042867,0.407696,...,0.105145,1.138935,0.088831,0.598285,5.734079,0.012342,0.046748,0.928636,0.805876,1.985339
2,2,CMCSA,2020,3,79,75.481481,37.388889,0.101766,0.050967,0.464471,...,0.109731,1.085867,0.087428,0.576664,5.54854,0.007507,0.039556,0.927417,0.813832,2.071108
3,3,CMCSA,2020,2,78,86.055556,55.333333,0.103336,0.052392,0.489893,...,0.106945,1.14416,0.077927,0.555126,5.557286,0.011234,0.026479,0.966251,0.849429,2.136866
4,4,CMCSA,2020,1,77,89.851852,39.759259,0.091776,0.032452,0.308551,...,0.105175,1.087769,0.093217,0.544489,6.312822,0.008182,-0.011939,0.872609,0.699855,2.169987


In [23]:
us_tw.describe()

Unnamed: 0.1,Unnamed: 0,year,period,quarter,Operating profit/Paid-in capital,Net profit before tax/Paid-in capital,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,Cash/Current Liability,Current Liability to Assets,Total income/Total expense,Total expense/Assets,Equity to Long-term Liability,Current Liability to Current Assets,Return on Assets,Net Value Growth Rate,Current Ratio,Quick Ratio,Total debt/Total net worth
count,296739.0,296739.0,296739.0,296739.0,288803.0,295103.0,296212.0,296230.0,283654.0,292856.0,296402.0,295577.0,296677.0,296214.0,283353.0,296530.0,296739.0,278667.0,278667.0,295255.0
mean,148369.0,2009.264987,2.383138,35.443086,,,inf,inf,,,inf,,,,inf,-5.976336,461.6421,-77667.8,894826200000.0,3.083922
std,85661.315102,7.986324,1.174034,31.938396,,,,,,,,,,,,4962.347,123535.4,10616310.0,472369100000000.0,259.060176
min,0.0,1985.0,0.0,-62.0,-inf,-inf,-55540790.0,-4.793911,-inf,-inf,-17122.83,-inf,-inf,-inf,-194250.0,-2527997.0,-5133639.0,-1602356000.0,-440665.5,-29872.125
25%,74184.5,2004.0,1.0,13.0,0.0,-0.02027054,0.139011,0.01943571,0.1030127,0.09075713,0.09266648,0.9692266,0.01638497,0.8699782,0.08774006,-0.003072268,-0.01200951,1.047667,0.5813949,0.424528
50%,148369.0,2011.0,2.0,41.0,1.58,3.044,0.4104223,0.05966389,0.4210006,0.3724186,0.1858829,1.049878,0.1174971,2.478448,0.5209639,0.005720496,0.0,1.789673,1.14803,1.106248
75%,222553.5,2016.0,3.0,62.0,288.7907,289.9239,0.6760313,0.1629151,1.491505,1.255088,0.3279496,1.152022,0.6876658,14.85632,1.745627,0.01843745,0.03417328,3.214441,2.291083,2.613913
max,296738.0,2021.0,4.0,82.0,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,866153.8,54063760.0,3547710.0,2.493585e+17,87701.5


In [10]:
tw_clean=pd.read_csv('data/tw_clean.csv')
tw_clean.head()

Unnamed: 0,Bankrupt?,Return on Assets,Net Value Growth Rate,Current Ratio,Quick Ratio,Total debt/Total net worth,Operating profit/Paid-in capital,Net profit before tax/Paid-in capital,Total Asset Turnover,Accounts Receivable Turnover,Quick Assets/Total Assets,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,Cash/Current Liability,Current Liability to Assets,Total income/Total expense,Total expense/Assets,Equity to Long-term Liability,Current Liability to Current Assets
0,1,0.424389,0.000327,0.002259,0.001208,0.021266,0.095885,0.137757,0.086957,0.001814,0.166673,0.190643,0.004094,0.001997,0.000147336,0.147308,0.002022,0.064856,0.126549,0.11825
1,1,0.538214,0.000443,0.006016,0.004039,0.012502,0.093743,0.168962,0.064468,0.001286,0.127236,0.182419,0.014948,0.004136,0.00138391,0.056963,0.002226,0.025516,0.120916,0.047775
2,1,0.499019,0.000396,0.011543,0.005348,0.021248,0.092318,0.148036,0.014993,0.001495,0.340201,0.602806,0.000991,0.006302,5340000000.0,0.098162,0.00206,0.021387,0.117922,0.025346
3,1,0.451265,0.000382,0.004194,0.002896,0.009572,0.077727,0.147561,0.089955,0.001966,0.161575,0.225815,0.018851,0.002961,0.001010646,0.098715,0.001831,0.024161,0.12076,0.06725
4,1,0.538432,0.000439,0.006022,0.003727,0.00515,0.096927,0.167461,0.175412,0.001449,0.26033,0.35838,0.014161,0.004275,0.000680464,0.110195,0.002224,0.026385,0.110933,0.047725


In [11]:
# --- Split dataset and oversample
y = tw_clean.iloc[:,0]
X = tw_clean.iloc[:,1:]

oversample = SMOTE()
X,y = oversample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [15]:
# --- Scaling data

scaler = StandardScaler()

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
us_clean_scale = scaler.transform(us_clean.iloc[:,5:])
us_clean_scale = us_clean.iloc[:,0:5] + us_clean_scale
us_clean_scale = pd.to_csv('data/us_clean_scale.csv')

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: Input contains infinity or a value too large for dtype('float64').

In [None]:
# --- Scaling data for prediction purpose



In [None]:
# --- 1. Logistic regression

lgr = LogisticRegression(max_iter=10000000)
criteria = {'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'C': [0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(lgr, param_grid=criteria, cv=20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(clf.best_params_)

# probs = clf.predict_proba(X_test)
# probs = probs[:, 1]
# auc = roc_auc_score(y_test, probs)
# print('Area Under Curve - Test Set: %.2f%%' % (auc*100))
# fpr, tpr, thresholds = roc_curve(y_test, probs)
# plt.plot([0, 1], [0, 1], linestyle='--')
# plt.plot(fpr, tpr, marker='.')
# plt.show()

In [12]:
# --- 2. Decision tree

tree = DecisionTreeClassifier()
criteria = {'criterion':['gini', 'entropy'], 'max_depth':[5,10,15,20,25,30]}
clf = GridSearchCV(tree, param_grid=criteria, cv=20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(clf.best_params_)


# probs = clf.predict_proba(X_test)
# probs = probs[:, 1]
# auc = roc_auc_score(y_test, probs)
# print('Area Under Curve - Test Set: %.2f%%' % (auc*100))
# fpr, tpr, thresholds = roc_curve(y_test, probs)
# plt.plot([0, 1], [0, 1], linestyle='--')
# plt.plot(fpr, tpr, marker='.')
# plt.show()

              precision    recall  f1-score   support

           0       0.96      0.92      0.94      1343
           1       0.93      0.96      0.94      1297

    accuracy                           0.94      2640
   macro avg       0.94      0.94      0.94      2640
weighted avg       0.94      0.94      0.94      2640

{'criterion': 'entropy', 'max_depth': 15}


In [14]:
filename = 'tree.pkl'
pickle.dump(clf, open(filename, 'wb'))

In [12]:
# --- 3. Random forest

forest = RandomForestClassifier()
criteria = {'bootstrap': [True, False], 'max_features': ['auto', 'sqrt'], 'n_estimators':[50,100,150,200], 'max_depth': [20, 40, 60, 80, 100]}
# 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10]
clf = GridSearchCV(forest, param_grid=criteria, cv=20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(clf.best_params_)

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      1343
           1       0.95      0.99      0.97      1297

    accuracy                           0.97      2640
   macro avg       0.97      0.97      0.97      2640
weighted avg       0.97      0.97      0.97      2640

{'bootstrap': False, 'max_depth': 40, 'max_features': 'auto', 'n_estimators': 50}


In [13]:
# --- 4. Adaboost

ada = AdaBoostClassifier()
criteria = {'n_estimators':[50,100,150,200,250,300], 'learning_rate':[0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(ada, param_grid=criteria, cv=20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(clf.best_params_)

              precision    recall  f1-score   support

           0       0.94      0.91      0.93      1343
           1       0.91      0.94      0.93      1297

    accuracy                           0.93      2640
   macro avg       0.93      0.93      0.93      2640
weighted avg       0.93      0.93      0.93      2640

{'learning_rate': 1.0, 'n_estimators': 250}


In [14]:
# --- 5. XGBoost

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
criteria = {'booster':['gbtree','gblinear','dart'], 'learning_rate':[0.001, 0.01, 0.1, 0.2, 0.3]}
clf = GridSearchCV(xgb, param_grid=criteria, cv=20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(clf.best_params_)

              precision    recall  f1-score   support

           0       0.99      0.94      0.97      1343
           1       0.94      0.99      0.97      1297

    accuracy                           0.97      2640
   macro avg       0.97      0.97      0.97      2640
weighted avg       0.97      0.97      0.97      2640

{'booster': 'gbtree'}


In [6]:
# --- 6. SVM

svm = SVC()
criteria = {'kernel':['linear','poly','rbf','sigmoid'], 'C':[1000,100,10,0.1]}
clf = GridSearchCV(svm, param_grid=criteria, cv=20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(clf.best_params_)

# --- export model
pickle_out = open("classifier.pkl", mode = "wb") 
pickle.dump(clf, pickle_out) 
pickle_out.close()

KeyboardInterrupt: 