In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [11]:
data = pd.read_csv('datasets/american_bankruptcy.csv')
data.head()

Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [12]:
# Data preprocessing
X = data.drop(['status_label','company_name', "year"], axis=1)  # Features
y = data['status_label']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train_scaled.shape)

(62945, 18)


In [13]:
model = LGBMClassifier()
model.fit(X_train_scaled, y_train)
 
pred = model.predict(X_test_scaled)
print(pred)

[LightGBM] [Info] Number of positive: 4152, number of negative: 58793
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 62945, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065962 -> initscore=-2.650433
[LightGBM] [Info] Start training from score -2.650433
['alive' 'alive' 'alive' ... 'alive' 'alive' 'alive']


In [14]:
print('Training accuracy {:.4f}'.format(model.score(X_train_scaled,y_train)))
print('Testing accuracy {:.4f}'.format(model.score(X_test_scaled,y_test)))

Training accuracy 0.9421
Testing accuracy 0.9348


In [15]:
print(metrics.classification_report(y_test,model.predict(X_test_scaled)))

              precision    recall  f1-score   support

       alive       0.93      1.00      0.97     14669
      failed       0.89      0.04      0.09      1068

    accuracy                           0.93     15737
   macro avg       0.91      0.52      0.53     15737
weighted avg       0.93      0.93      0.91     15737

