# Bankruptcy Predictions with Gradient Boosting Ensemble Models

#### Import libraries


In [3]:
# Library for Data Handling
import pandas as pd

# Library for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Library for splitting data
from sklearn.model_selection import train_test_split

# Library for Resampling Training Data
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Library for creating Pipeline
from sklearn.pipeline import Pipeline, make_pipeline

# Library for Imputing Missing values
from sklearn.impute import SimpleImputer

# Library for Model training
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): the original line here was truncated and used a doubled
# package name (`sklearn.sklearn`), which is a SyntaxError and broke the whole
# import cell. GridSearchCV and cross_val_score are assumed to be the intended
# names — confirm against the cells that tune/evaluate the model.
from sklearn.model_selection import GridSearchCV, cross_val_score

# Library for object Serialization
import pickle

In [9]:
# Load the cleaned dataset; index_col=0 turns the first CSV column into the row index
# NOTE(review): the recorded preview shows an 'Unnamed: 0' label — verify that
# no leftover row-id column is being kept among the features.
df = pd.read_csv(
    '../data/cleaned_dataframe.csv',
    index_col=0,
)

# Report the (rows, columns) dimensions, then preview the first records
print(df.shape)
df.head()

(43405, 66)


Unnamed: 0,year,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,A56,A57,A58,A59,A60,A61,A62,A63,A64,bankrupt
0,1,0.20055,0.37951,0.39641,2.0472,32.351,0.38825,0.24976,1.3305,1.1389,...,0.12196,0.39718,0.87804,0.001924,8.416,5.1372,82.658,4.4158,7.4277,False
1,1,0.20912,0.49988,0.47225,1.9447,14.786,0.0,0.25834,0.99601,1.6996,...,0.1213,0.42002,0.853,0.0,4.1486,3.2732,107.35,3.4,60.987,False
2,1,0.24866,0.69592,0.26713,1.5548,-1.1523,0.0,0.30906,0.43695,1.309,...,0.24114,0.81774,0.76599,0.69484,4.9909,3.951,134.27,2.7185,5.2078,False
3,1,0.081483,0.30734,0.45879,2.4928,51.952,0.14988,0.092704,1.8661,1.0571,...,0.054015,0.14207,0.94598,0.0,4.5746,3.6147,86.435,4.2228,5.5497,False
4,1,0.18732,0.61323,0.2296,1.4063,-7.3128,0.18732,0.18732,0.6307,1.1559,...,0.13485,0.48431,0.86515,0.12444,6.3985,4.3158,127.21,2.8692,7.898,False


#### Train-test split

In [12]:
# Separate the label column from the feature matrix
target = 'bankrupt'
y = df[target]
X = df.drop(columns=target)

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
# NOTE(review): the split is not stratified. Given the class imbalance on the
# target, consider stratify=y — confirm first, since it changes the recorded shapes.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the dimensions of each partition
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (34724, 65)
y_train shape: (34724,)
X_test shape: (8681, 65)
y_test shape: (8681,)


#### Resample

While exploring the dataset, I noticed a class imbalance in the target column. To balance the class distribution and reduce bias towards the majority class, I'll perform random oversampling on the training set only.

In [14]:
# Balance the classes by random over-sampling; only the training split is
# resampled, X_test / y_test are left untouched
over_sampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = over_sampler.fit_resample(
    X_train,
    y_train,
)

# Confirm the resampled feature matrix and labels have matching lengths
print("X_train_over shape:", X_train_over.shape)
print("y_train_over shape:", y_train_over.shape)

X_train_over shape: (66148, 65)
y_train_over shape: (66148,)


#### Build Model

In [15]:
# Create an instance of the model.
# random_state is pinned to 42, matching the train/test split and the
# over-sampler, so that repeated fits on the same data build the same trees.
clf = GradientBoostingClassifier(random_state=42)