In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [11]:
data = pd.read_csv('datasets/american_bankruptcy.csv')
data.head()

Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [12]:
# Data preprocessing
X = data.drop(['status_label','company_name', "year"], axis=1)  # Features
y = data['status_label']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train_scaled.shape)

(62945, 18)


In [13]:
model = LGBMClassifier()
model.fit(X_train_scaled, y_train)
 
pred = model.predict(X_test_scaled)
print(pred)

[LightGBM] [Info] Number of positive: 4152, number of negative: 58793
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 62945, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065962 -> initscore=-2.650433
[LightGBM] [Info] Start training from score -2.650433
['alive' 'alive' 'alive' ... 'alive' 'alive' 'alive']


In [14]:
print('Training accuracy {:.4f}'.format(model.score(X_train_scaled,y_train)))
print('Testing accuracy {:.4f}'.format(model.score(X_test_scaled,y_test)))

Training accuracy 0.9421
Testing accuracy 0.9348


In [15]:
print(metrics.classification_report(y_test,model.predict(X_test_scaled)))

              precision    recall  f1-score   support

       alive       0.93      1.00      0.97     14669
      failed       0.89      0.04      0.09      1068

    accuracy                           0.93     15737
   macro avg       0.91      0.52      0.53     15737
weighted avg       0.93      0.93      0.91     15737



<font size="6">Testing LightGBM model with filled_table</font>

In [17]:
from google.oauth2 import service_account
import pandas as pd
import pandas_gbq as gbq
credentials_path = 'data_cleaning/token.json'

# Authenticate with your credentials
credentials = service_account.Credentials.from_service_account_file(
    credentials_path, scopes=['https://www.googleapis.com/auth/bigquery'])

# Set the credentials for pandas_gbq
gbq.context.credentials = credentials

In [18]:
project_id = 'capstone-398012'
dataset_id = 'capstone'
table_id = "filled_table"

In [19]:
from pandas_gbq import read_gbq

query = f"""
SELECT *
FROM `{project_id}.{dataset_id}.{table_id}`

"""

# Authenticate and read data from BigQuery into a DataFrame
df = read_gbq(query, project_id=project_id, dialect='standard')
df

  record_batch = self.to_arrow(


Unnamed: 0,CompNo,yyyy,mm,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,...,nan_count,liqnonfinlevel_notNA,liqnonfintrend_notNA,dtdlevel_notNA,dtdtrend_notNA,DTDmedianNonFin_notNA,Sector_Number_notNA,DTDmedianFin_notNA,sigma_notNA,StkIndx_notNA
0,26995,1990,1,0.106263,0.020305,3.176331,-0.148469,0.280325,-0.146216,0.002395,...,3,1,1,0,0,0,1,1,1,1
1,26996,1990,1,0.106263,0.020305,4.401022,0.054801,0.951410,0.033574,0.002635,...,3,1,1,0,0,0,1,1,1,1
2,26996,1990,1,0.106263,0.020305,4.401022,0.054801,0.951410,0.033574,0.002635,...,3,1,1,0,0,0,1,1,1,1
3,27000,1990,1,0.106263,0.020305,2.908823,0.056226,-0.312616,-0.149733,0.004073,...,3,1,1,0,0,0,1,1,1,1
4,27000,1990,1,0.106263,0.020305,2.908823,0.056226,-0.312616,-0.149733,0.004073,...,3,1,1,0,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12334,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1
12335,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1
12336,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1
12337,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1


In [20]:
#drop na rows
test_df = df.dropna()
test_df

Unnamed: 0,CompNo,yyyy,mm,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,...,nan_count,liqnonfinlevel_notNA,liqnonfintrend_notNA,dtdlevel_notNA,dtdtrend_notNA,DTDmedianNonFin_notNA,Sector_Number_notNA,DTDmedianFin_notNA,sigma_notNA,StkIndx_notNA
0,26995,1990,1,0.106263,0.020305,3.176331,-0.148469,0.280325,-0.146216,0.002395,...,3,1,1,0,0,0,1,1,1,1
1,26996,1990,1,0.106263,0.020305,4.401022,0.054801,0.951410,0.033574,0.002635,...,3,1,1,0,0,0,1,1,1,1
2,26996,1990,1,0.106263,0.020305,4.401022,0.054801,0.951410,0.033574,0.002635,...,3,1,1,0,0,0,1,1,1,1
3,27000,1990,1,0.106263,0.020305,2.908823,0.056226,-0.312616,-0.149733,0.004073,...,3,1,1,0,0,0,1,1,1,1
4,27000,1990,1,0.106263,0.020305,2.908823,0.056226,-0.312616,-0.149733,0.004073,...,3,1,1,0,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12334,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1
12335,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1
12336,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1
12337,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1


In [21]:
#map event type
test_df['Event_type'] = test_df['Event_type'].map(lambda x : 0 if x ==2  else x)
test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Event_type'] = test_df['Event_type'].map(lambda x : 0 if x ==2  else x)


Unnamed: 0,CompNo,yyyy,mm,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,...,nan_count,liqnonfinlevel_notNA,liqnonfintrend_notNA,dtdlevel_notNA,dtdtrend_notNA,DTDmedianNonFin_notNA,Sector_Number_notNA,DTDmedianFin_notNA,sigma_notNA,StkIndx_notNA
0,26995,1990,1,0.106263,0.020305,3.176331,-0.148469,0.280325,-0.146216,0.002395,...,3,1,1,0,0,0,1,1,1,1
1,26996,1990,1,0.106263,0.020305,4.401022,0.054801,0.951410,0.033574,0.002635,...,3,1,1,0,0,0,1,1,1,1
2,26996,1990,1,0.106263,0.020305,4.401022,0.054801,0.951410,0.033574,0.002635,...,3,1,1,0,0,0,1,1,1,1
3,27000,1990,1,0.106263,0.020305,2.908823,0.056226,-0.312616,-0.149733,0.004073,...,3,1,1,0,0,0,1,1,1,1
4,27000,1990,1,0.106263,0.020305,2.908823,0.056226,-0.312616,-0.149733,0.004073,...,3,1,1,0,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12334,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1
12335,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1
12336,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1
12337,27058,2023,7,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,...,0,1,1,1,1,1,1,1,1,1


In [22]:
X = test_df.drop(['Event_type', 'yyyy', 'mm', 'EventDate'], axis=1)  # Features
y = test_df['Event_type']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
class_weights = {0:1, 1:100}
model = LGBMClassifier( class_weight = class_weights)
model.fit(X_train, y_train)
pred = model.predict(X_test)

[LightGBM] [Info] Number of positive: 727, number of negative: 9015
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4108
[LightGBM] [Info] Number of data points in the train set: 9742, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.889678 -> initscore=2.087452
[LightGBM] [Info] Start training from score 2.087452


In [32]:
print('Training accuracy {:.4f}'.format(model.score(X_train,y_train)))
print('Testing accuracy {:.4f}'.format(model.score(X_test,y_test)))

Training accuracy 0.7540
Testing accuracy 0.6773


In [33]:
print(metrics.classification_report(y_test,model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.96      0.68      0.80      2254
           1       0.14      0.67      0.24       182

    accuracy                           0.68      2436
   macro avg       0.55      0.67      0.52      2436
weighted avg       0.90      0.68      0.75      2436



push updated table to gbq, ie the table used for modeling

In [34]:
new_df = df.dropna()
new_df = new_df.drop(['yyyy', 'mm', 'EventDate'], axis=1,)
new_df['Event_type'] = new_df['Event_type'].map(lambda x : 0 if x ==2  else x)
new_df

Unnamed: 0,CompNo,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,ni2tatrend,sizelevel,...,nan_count,liqnonfinlevel_notNA,liqnonfintrend_notNA,dtdlevel_notNA,dtdtrend_notNA,DTDmedianNonFin_notNA,Sector_Number_notNA,DTDmedianFin_notNA,sigma_notNA,StkIndx_notNA
0,26995,0.106263,0.020305,3.176331,-0.148469,0.280325,-0.146216,0.002395,0.001367,0.666644,...,3,1,1,0,0,0,1,1,1,1
1,26996,0.106263,0.020305,4.401022,0.054801,0.951410,0.033574,0.002635,-0.000224,0.014110,...,3,1,1,0,0,0,1,1,1,1
2,26996,0.106263,0.020305,4.401022,0.054801,0.951410,0.033574,0.002635,-0.000224,0.014110,...,3,1,1,0,0,0,1,1,1,1
3,27000,0.106263,0.020305,2.908823,0.056226,-0.312616,-0.149733,0.004073,0.004625,1.248517,...,3,1,1,0,0,0,1,1,1,1
4,27000,0.106263,0.020305,2.908823,0.056226,-0.312616,-0.149733,0.004073,0.004625,1.248517,...,3,1,1,0,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12334,27058,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,0.000547,5.101596,...,0,1,1,1,1,1,1,1,1,1
12335,27058,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,0.000547,5.101596,...,0,1,1,1,1,1,1,1,1,1
12336,27058,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,0.000547,5.101596,...,0,1,1,1,1,1,1,1,1,1
12337,27058,0.110445,0.009928,6.832332,2.326057,0.673112,-0.089241,0.006030,0.000547,5.101596,...,0,1,1,1,1,1,1,1,1,1


In [37]:
new_df.to_gbq(destination_table=f'{project_id}.{dataset_id}.filled_table', project_id=project_id, if_exists='replace')