In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the encoded bankruptcy dataset (path is relative to the notebook's
# working directory)
df = pd.read_csv('../dataset/encoded_bankruptcy_data.csv')

# Display a quick look at the data
print(df.head())
# DataFrame.info() writes its summary to stdout on its own and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
df.info()

  company_name  Financial_Year  Bankruptcy_Status  Current_Assets  \
0          C_1          1999.0                  0        511267.0   
1          C_1          2000.0                  0        485856.0   
2          C_1          2001.0                  0        436656.0   
3          C_1          2002.0                  0        396412.0   
4          C_1          2003.0                  0        432204.0   

   Cost_of_Goods_Sold  Depreciation_Amortization    EBITDA  Inventory  \
0          740998.000                   833107.0  180447.0   18373.00   
1             701.854                   713811.0  179987.0   18577.00   
2          710199.000                   526477.0  217699.0   22496.00   
3             686.621                   496747.0  164658.0   27172.00   
4             709.292                   523302.0  248666.0      26.68   

   Net_Income  Total_Receivables  ...  Group_78  Group_79  Group_80  Group_81  \
0    70658.00          89031.000  ...     False     False     Fal

In [4]:
# Drop the 'company_name' column as it's just an identifier.
# errors='ignore' keeps this cell re-runnable: after the first run the column
# is already gone from `df`, and a plain drop would then raise KeyError.
df = df.drop(columns=['company_name'], errors='ignore')

# Define features (X) and the target (y)
X = df.drop('Bankruptcy_Status', axis=1)
y = df['Bankruptcy_Status']

# Split the data into training and testing sets (80% train, 20% test).
# 'stratify=y' is important for imbalanced datasets to ensure both
# train and test sets have a similar proportion of bankrupt companies.
# NOTE(review): the dataset preview shows several yearly rows for the same
# company, so a row-wise split may place one company in both train and test —
# confirm whether a group-aware split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so the training features end up with mean 0 and standard deviation 1.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# 1. Random Forest baseline on the standardized training features.
#    fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
print("Random Forest model trained successfully.")

# 2. Gradient Boosting baseline, trained on the same data with the same seed.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train_scaled, y_train)
print("Gradient Boosting model trained successfully.")

Random Forest model trained successfully.
Gradient Boosting model trained successfully.


In [6]:
# Predict the held-out test labels with each fitted model
y_pred_rf = rf_model.predict(X_test_scaled)
y_pred_gb = gb_model.predict(X_test_scaled)

# Metric functions in report order: accuracy, precision, recall, F1
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

# --- Evaluate Random Forest ---
rf_accuracy, rf_precision, rf_recall, rf_f1 = [
    metric(y_test, y_pred_rf) for metric in metric_functions
]
print("\n--- Random Forest Classifier Metrics ---")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1-score: {rf_f1:.4f}")

# --- Evaluate Gradient Boosting ---
gb_accuracy, gb_precision, gb_recall, gb_f1 = [
    metric(y_test, y_pred_gb) for metric in metric_functions
]
print("\n--- Gradient Boosting Classifier Metrics ---")
print(f"Accuracy: {gb_accuracy:.4f}")
print(f"Precision: {gb_precision:.4f}")
print(f"Recall: {gb_recall:.4f}")
print(f"F1-score: {gb_f1:.4f}")


--- Random Forest Classifier Metrics ---
Accuracy: 0.9375
Precision: 0.9000
Recall: 0.0749
F1-score: 0.1383

--- Gradient Boosting Classifier Metrics ---
Accuracy: 0.9354
Precision: 0.7083
Recall: 0.0606
F1-score: 0.1117


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE  # <-- The new import

# Read the encoded dataset and discard the 'company_name' identifier column
# in a single chained expression
df = pd.read_csv('encoded_bankruptcy_data.csv').drop(columns=['company_name'])

In [8]:
pip install imblearn


Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn, imblearn

   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ------------


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE  # <-- The new import

# Reload the encoded dataset from the working directory, then remove the
# 'company_name' column, which only identifies the row's company
df = (
    pd.read_csv('encoded_bankruptcy_data.csv')
    .drop(columns=['company_name'])
)

In [12]:
# Separate the target column from the predictor columns
y = df['Bankruptcy_Status']
X = df.drop(columns='Bankruptcy_Status')

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Original training set shape: {X_train_scaled.shape}")

Original training set shape: (50231, 102)


In [13]:
# SMOTE oversampler with a fixed seed
smote = SMOTE(random_state=42)

# Oversample the scaled training split only; the test split is not passed in,
# so it keeps its original class balance
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Report the new training shape and the class counts before and after
print(
    f"Resampled (SMOTE) training set shape: {X_train_resampled.shape}",
    f"Original y_train distribution:\n{y_train.value_counts()}",
    f"Resampled y_train distribution:\n{y_train_resampled.value_counts()}",
    sep="\n",
)

Resampled (SMOTE) training set shape: (93738, 102)
Original y_train distribution:
Bankruptcy_Status
0    46869
1     3362
Name: count, dtype: int64
Resampled y_train distribution:
Bankruptcy_Status
0    46869
1    46869
Name: count, dtype: int64


In [14]:
# 1. Random Forest fitted on the SMOTE-resampled training data.
#    fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(random_state=42).fit(
    X_train_resampled, y_train_resampled
)
print("\nRandom Forest model trained successfully on SMOTE data.")

# 2. Gradient Boosting fitted on the same resampled data.
gb_model = GradientBoostingClassifier(random_state=42).fit(
    X_train_resampled, y_train_resampled
)
print("Gradient Boosting model trained successfully on SMOTE data.")


Random Forest model trained successfully on SMOTE data.
Gradient Boosting model trained successfully on SMOTE data.


In [15]:
# Predict the held-out test labels with the SMOTE-trained models
y_pred_rf = rf_model.predict(X_test_scaled)
y_pred_gb = gb_model.predict(X_test_scaled)

# Metric functions in report order: accuracy, precision, recall, F1
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

# --- Evaluate Random Forest (after SMOTE) ---
rf_accuracy, rf_precision, rf_recall, rf_f1 = [
    metric(y_test, y_pred_rf) for metric in metric_functions
]
print("\n--- Random Forest (with SMOTE) ---")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1-score: {rf_f1:.4f}")

# --- Evaluate Gradient Boosting (after SMOTE) ---
gb_accuracy, gb_precision, gb_recall, gb_f1 = [
    metric(y_test, y_pred_gb) for metric in metric_functions
]
print("\n--- Gradient Boosting (with SMOTE) ---")
print(f"Accuracy: {gb_accuracy:.4f}")
print(f"Precision: {gb_precision:.4f}")
print(f"Recall: {gb_recall:.4f}")
print(f"F1-score: {gb_f1:.4f}")


--- Random Forest (with SMOTE) ---
Accuracy: 0.9154
Precision: 0.3328
Recall: 0.2616
F1-score: 0.2929

--- Gradient Boosting (with SMOTE) ---
Accuracy: 0.7919
Precision: 0.1550
Recall: 0.4732
F1-score: 0.2335
