In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [11]:
data = pd.read_csv('datasets/american_bankruptcy.csv')
data.head()

Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [12]:
# Data preprocessing
X = data.drop(['status_label','company_name', "year"], axis=1)  # Features
y = data['status_label']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train_scaled.shape)

(62945, 18)


In [13]:
model = LGBMClassifier()
model.fit(X_train_scaled, y_train)
 
pred = model.predict(X_test_scaled)
print(pred)

[LightGBM] [Info] Number of positive: 4152, number of negative: 58793
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 62945, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065962 -> initscore=-2.650433
[LightGBM] [Info] Start training from score -2.650433
['alive' 'alive' 'alive' ... 'alive' 'alive' 'alive']


In [14]:
print('Training accuracy {:.4f}'.format(model.score(X_train_scaled,y_train)))
print('Testing accuracy {:.4f}'.format(model.score(X_test_scaled,y_test)))

Training accuracy 0.9421
Testing accuracy 0.9348


In [15]:
print(metrics.classification_report(y_test,model.predict(X_test_scaled)))

              precision    recall  f1-score   support

       alive       0.93      1.00      0.97     14669
      failed       0.89      0.04      0.09      1068

    accuracy                           0.93     15737
   macro avg       0.91      0.52      0.53     15737
weighted avg       0.93      0.93      0.91     15737



<font size="6">Testing LightGBM model with filled_table</font>

In [39]:
from google.oauth2 import service_account
import pandas as pd
import pandas_gbq as gbq
credentials_path = 'data_cleaning/token.json'

# Authenticate with your credentials
credentials = service_account.Credentials.from_service_account_file(
    credentials_path, scopes=['https://www.googleapis.com/auth/bigquery'])

# Set the credentials for pandas_gbq
gbq.context.credentials = credentials

In [61]:
project_id = 'capstone-398012'
dataset_id = 'capstone'
table_id = "filled_table"

In [62]:
from pandas_gbq import read_gbq

query = f"""
SELECT *
FROM `{project_id}.{dataset_id}.{table_id}`

"""

# Authenticate and read data from BigQuery into a DataFrame
df = read_gbq(query, project_id=project_id, dialect='standard')
df

  record_batch = self.to_arrow(


Unnamed: 0,CompNo,yyyy,mm,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,...,Sector_Number_notNA,DTDmedianFin_notNA,sigma_notNA,m2b_notNA,ni2talevel_notNA,ni2tatrend_notNA,lqfintrend_notNA,liqfinlevel_notNA,sizelevel_notNA,sizetrend_notNA
0,26990,2023,7,0.110445,0.009928,4.874043,0.537756,0.000000,0.000000,0.003009,...,1,1,1,1,1,1,1,1,1,1
1,27030,2023,7,0.110445,0.009928,3.875769,0.519613,0.000000,0.000000,0.002722,...,1,1,1,1,1,1,1,1,1,1
2,27035,2023,7,0.110445,0.009928,2.130511,0.361442,0.000000,0.000000,0.000949,...,1,1,1,1,1,1,1,1,1,1
3,27037,2023,7,0.110445,0.009928,5.436649,-3.216218,0.000000,0.000000,-0.002387,...,1,1,1,1,1,1,1,1,1,1
4,27101,2023,7,0.110445,0.009928,2.708044,-1.008044,0.000000,0.000000,0.000757,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6666,53612,2021,3,0.564696,-0.010395,5.277041,0.981603,0.628831,-0.006415,0.002709,...,1,1,0,0,0,0,1,1,1,1
6667,32269,2001,7,-0.153477,0.002465,9.610933,0.943273,1.524022,-0.069022,0.015911,...,1,1,0,0,0,0,1,1,1,1
6668,27782,2015,1,0.111917,-0.011552,7.909717,0.790179,0.613869,0.259415,0.003405,...,1,1,0,0,0,0,1,1,1,1
6669,35143,2015,1,0.111917,-0.011552,3.983460,-0.330140,0.614278,0.027182,0.020096,...,1,1,0,0,0,0,1,1,1,1


In [67]:
df['Event_type'].value_counts()

Event_type
0    6596
1      75
Name: count, dtype: Int64

In [68]:
X = df.drop(['Event_type', 'yyyy', 'mm', 'EventDate'], axis=1)  # Features
y = df['Event_type']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
class_weights = {0:1, 1:10}
model = LGBMClassifier( class_weight = class_weights)
model.fit(X_train, y_train)
pred = model.predict(X_test)

[LightGBM] [Info] Number of positive: 63, number of negative: 5273
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3686
[LightGBM] [Info] Number of data points in the train set: 5336, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.106725 -> initscore=-2.124635
[LightGBM] [Info] Start training from score -2.124635


In [70]:
print('Training accuracy {:.4f}'.format(model.score(X_train,y_train)))
print('Testing accuracy {:.4f}'.format(model.score(X_test,y_test)))

Training accuracy 1.0000
Testing accuracy 0.9940


In [71]:
print(metrics.classification_report(y_test,model.predict(X_test)))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1323
         1.0       0.64      0.75      0.69        12

    accuracy                           0.99      1335
   macro avg       0.82      0.87      0.84      1335
weighted avg       0.99      0.99      0.99      1335

