# Bankruptcy classifier

We will train a random forest classifier on `data/companies.csv`
and save the trained model to disk.

In [1]:
import pandas

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

Get the data ...

In [2]:
df = pandas.read_csv('../data/companies.csv')

In [16]:
df.is_bankrupt.value_counts()

False    143
True     107
Name: is_bankrupt, dtype: int64

Prepare for training ...

In [3]:
target_column = 'is_bankrupt'
id_column = 'company_id'

feature_columns = list(df.columns)
feature_columns.remove(target_column)
feature_columns.remove(id_column)

y = df[target_column]
X = df[feature_columns]

In [6]:
X.head()

Unnamed: 0,competitiveness,credibility,financial_flexibility,industrial_risk,management_risk,operating_risk
0,A,A,A,P,P,P
1,A,A,A,N,N,N
2,A,A,A,A,A,A
3,P,P,P,P,P,P
4,P,P,P,N,N,N


In [7]:
# encode categoricals
X = pandas.get_dummies(X)
X.head()

Unnamed: 0,competitiveness_A,competitiveness_N,competitiveness_P,credibility_A,credibility_N,credibility_P,financial_flexibility_A,financial_flexibility_N,financial_flexibility_P,industrial_risk_A,industrial_risk_N,industrial_risk_P,management_risk_A,management_risk_N,management_risk_P,operating_risk_A,operating_risk_N,operating_risk_P
0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1
1,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0
2,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
3,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1
4,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0


Split into training and test examples ...

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Train ...

In [18]:
clf = RandomForestClassifier(max_depth=2, random_state=420)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=420,
            verbose=0, warm_start=False)

Predict test examples ...

In [19]:
y_predicted = clf.predict(X_test)

In [20]:
clf.score(X_test,y_test)

0.95999999999999996

What's the accuracy?

In [21]:
accuracy = (y_predicted == y_test).sum()/len(y_test)
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.96


What are the feature importances?

In [22]:
for i in clf.feature_importances_:
    print(i)

0.0680129906163
0.198740695395
0.0195225627591
0.0
0.107120299781
0.00424633003343
0.0660270193178
0.465365664843
0.0377981918033
0.0
0.029867966444
0.0
0.0
0.00329827900595
0.0
0.0
0.0
0.0


Save trained model ...

In [23]:
joblib.dump(clf, '../data/model.pkl')

['../data/model.pkl']

In [None]:
# This is how you load it
# clf = joblib.load('data/bankruptcy.pkl') 

First save this notebook then publish it as a report so you and your friends can keep track of how well your model is doing. Click "Publish Report" above (top right of this page). You can then view it by going to the "Reports" tab on the right.