In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [3]:
data = pd.read_csv('datasets/american_bankruptcy.csv')
data.head()

Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [4]:
# Data preprocessing
X = data.drop(['status_label','company_name', "year"], axis=1)  # Features
y = data['status_label']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train_scaled.shape)

(62945, 18)


In [5]:
model = LGBMClassifier()
model.fit(X_train_scaled, y_train)
 
pred = model.predict(X_test_scaled)
print(pred)

[LightGBM] [Info] Number of positive: 4152, number of negative: 58793
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 62945, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065962 -> initscore=-2.650433
[LightGBM] [Info] Start training from score -2.650433
['alive' 'alive' 'alive' ... 'alive' 'alive' 'alive']


In [6]:
print('Training accuracy {:.4f}'.format(model.score(X_train_scaled,y_train)))
print('Testing accuracy {:.4f}'.format(model.score(X_test_scaled,y_test)))

Training accuracy 0.9421
Testing accuracy 0.9348


In [7]:
print(metrics.classification_report(y_test,model.predict(X_test_scaled)))

              precision    recall  f1-score   support

       alive       0.93      1.00      0.97     14669
      failed       0.89      0.04      0.09      1068

    accuracy                           0.93     15737
   macro avg       0.91      0.52      0.53     15737
weighted avg       0.93      0.93      0.91     15737



<font size="6">Testing LightGBM model with filled_table</font>

In [8]:
from google.oauth2 import service_account
import pandas as pd
import pandas_gbq as gbq
credentials_path = 'data_cleaning/token.json'

# Authenticate with your credentials
credentials = service_account.Credentials.from_service_account_file(
    credentials_path, scopes=['https://www.googleapis.com/auth/bigquery'])

# Set the credentials for pandas_gbq

In [9]:
project_id = 'capstone-398012'
dataset_id = 'capstone_final'
table_id = "CRI_Compustat_Merged_785k"

In [13]:
from pandas_gbq import read_gbq

query = f"""
SELECT *
FROM `{project_id}.{dataset_id}.{table_id}`

"""

# Authenticate and read data from BigQuery into a DataFrame
df = read_gbq(query, project_id=project_id, dialect='standard')
df

  record_batch = self.to_arrow(


Unnamed: 0,CompanyNumber,yyyy,mm,DTDmedianFin,DTDmedianNonFin,dummy297fin,EventDate,EventDate_string,Duration,StartDate,...,equity_ratio,financial_leverage_ratio,cashflow_to_debt_ratio,net_profit_margin,asset_turnover,receivables_turnover,day_sales_outstanding,working_capital_turnover,price_to_earnings,retention_ratio
0,26981,2000.0,4,0.0,2.318686,0.0,2000-04-01 00:00:00+00:00,2000 04,4471 days 00:00:00,1988-01-04 00:00:00+00:00,...,1.036319,0.964954,1.428152,0.439745,0.341703,1.211964,0.825107,0.631578,20.567474,0.045750
1,26991,2000.0,4,0.0,2.318686,0.0,2000-04-01 00:00:00+00:00,2000 04,1989 days 00:00:00,1994-10-21 00:00:00+00:00,...,0.694259,1.440385,0.022451,0.887233,1.000000,0.016838,14.188158,0.577452,8.870968,30.749364
2,27098,2000.0,7,0.0,2.190358,0.0,2000-07-01 00:00:00+00:00,2000 07,4562 days 00:00:00,1988-01-04 00:00:00+00:00,...,2.130575,0.469357,0.149526,0.178122,0.506989,1.865118,0.536159,1.353551,8.990610,29.404471
3,27135,2000.0,7,0.0,2.190358,0.0,2000-07-01 00:00:00+00:00,2000 07,4562 days 00:00:00,1988-01-04 00:00:00+00:00,...,4.708123,0.212399,0.076983,0.583111,0.504901,2.207284,0.453045,-2.608228,22.918033,31.628731
4,27161,2000.0,8,0.0,2.235237,0.0,2000-08-01 00:00:00+00:00,2000 08,545 days 00:00:00,1999-02-03 00:00:00+00:00,...,0.631819,1.582732,-0.164773,0.317312,0.280725,0.956828,1.045120,0.819753,33.444444,-0.478495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579220,27265,2017.0,10,0.0,4.771634,0.0,2017-10-01 00:00:00+00:00,2017 10,10863 days 00:00:00,1988-01-04 00:00:00+00:00,...,1.491574,0.670433,0.104983,0.488716,0.539965,1.101520,0.975082,1.903578,22.500000,8.906725
579221,47281,2018.0,7,0.0,4.549778,0.0,2018-07-01 00:00:00+00:00,2018 07,4989 days 00:00:00,2004-11-02 00:00:00+00:00,...,1.491574,0.670433,0.104983,0.324535,0.472193,0.885331,1.129522,1.529973,22.500000,1.810938
579222,27405,2020.0,8,0.0,2.695814,0.0,2020-08-01 00:00:00+00:00,2020 08,11898 days 00:00:00,1988-01-04 00:00:00+00:00,...,1.491574,0.670433,0.104983,0.488716,0.539965,1.101520,0.975082,1.903578,10.663305,23.307474
579223,27469,2022.0,10,0.0,2.820494,0.0,2022-10-01 00:00:00+00:00,2022 10,12689 days 00:00:00,1988-01-04 00:00:00+00:00,...,1.491574,0.670433,0.104983,0.488716,0.539965,1.101520,0.975082,1.903578,22.500000,8.906725


In [29]:
test_df = df

In [16]:
test_df.isna().sum().sort_values(ascending=False)

CompanyNumber      0
dltry              0
dpq                0
saleq              0
saley              0
                  ..
revtq              0
cogsq              0
oancfy             0
ivncfy             0
retention_ratio    0
Length: 65, dtype: int64

In [28]:
test_df['EventType'].value_counts()

EventType
0.0    574761
1.0      4464
Name: count, dtype: int64

In [18]:
test_df['EventType'] = test_df['EventType'].map(lambda x : 0 if x ==2  else x)

In [30]:
test_df.dtypes

CompanyNumber                 Int64
yyyy                        float64
mm                           object
DTDmedianFin                float64
DTDmedianNonFin             float64
                             ...   
receivables_turnover        float64
day_sales_outstanding       float64
working_capital_turnover    float64
price_to_earnings           float64
retention_ratio             float64
Length: 65, dtype: object

In [31]:
test_df.select_dtypes(include=['object'])

Unnamed: 0,mm,EventDate_string,Duration,gvkey,sic,sector_industry
0,4,2000 04,4471 days 00:00:00,12570,3845.0,38
1,4,2000 04,1989 days 00:00:00,24965,2870.0,28
2,7,2000 07,4562 days 00:00:00,1995,4931.0,49
3,7,2000 07,4562 days 00:00:00,13431,1311.0,13
4,8,2000 08,545 days 00:00:00,25767,5331.0,53
...,...,...,...,...,...,...
579220,10,2017 10,10863 days 00:00:00,3439,4931.0,49
579221,7,2018 07,4989 days 00:00:00,8001,4931.0,49
579222,8,2020 08,11898 days 00:00:00,7366,4911.0,49
579223,10,2022 10,12689 days 00:00:00,4517,4911.0,49


In [34]:
test_df[['gvkey', 'sic', 'sector_industry']] = test_df[['gvkey', 'sic', 'sector_industry']].astype(float)

In [43]:
X = test_df.drop(['EventType', 'EventDate_string', 'yyyy', 'mm', 'Duration', 'EventDate', 'StartDate', 'datadate'], axis=1)  # Features
y = test_df['EventType']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
class_weights = {0:1, 1:100}
model = LGBMClassifier( class_weight = class_weights)
model.fit(X_train, y_train)
pred = model.predict(X_test)

[LightGBM] [Info] Number of positive: 3547, number of negative: 459833
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14058
[LightGBM] [Info] Number of data points in the train set: 463380, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.435464 -> initscore=-0.259591
[LightGBM] [Info] Start training from score -0.259591


In [48]:
print('Training accuracy {:.4f}'.format(model.score(X_train,y_train)))
print('Testing accuracy {:.4f}'.format(model.score(X_test,y_test)))

Training accuracy 0.9116
Testing accuracy 0.9102


In [49]:
print(metrics.classification_report(y_test,model.predict(X_test)))

              precision    recall  f1-score   support

         0.0       1.00      0.91      0.95    114928
         1.0       0.07      0.82      0.13       917

    accuracy                           0.91    115845
   macro avg       0.53      0.87      0.54    115845
weighted avg       0.99      0.91      0.95    115845

