In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import pickle


In [3]:
# Location of the tab-separated dataset inside the Kaggle input directory.
DATASET_CSV = '/kaggle/input/newconnect-market-corporate-default-prediction/dataset_csv.csv'
df = pd.read_csv(DATASET_CSV, sep='\t')


In [4]:
# Render the loaded DataFrame as the cell output.
df

Unnamed: 0,obs_id,class,default,x1,x2,x3,x4,x5,x6,x7,...,x21,x22,x23,x24,x25,x26,time,year,testing_set,training_set
0,0,1406,0,0.548690,0.378213,0.516406,1.608842,0.602419,0.065173,0.558509,...,0.537593,0.001444,0.382384,0.033177,1.000000,0,1,2007,1,0
1,1,1406,0,0.547813,0.378383,0.516522,1.699176,0.601911,0.062662,0.557950,...,0.572315,0.001311,0.380498,0.028798,1.000000,0,2,2008,1,0
2,2,1406,0,0.549332,0.378818,0.514555,1.734131,0.603207,0.063617,0.559306,...,0.583454,0.001697,0.381134,0.030690,1.000000,0,3,2009,1,0
3,3,1406,0,0.552237,0.379319,0.512808,1.852136,0.605443,0.078868,0.565012,...,0.583256,0.001229,0.368001,0.062207,0.903143,0,4,2010,1,0
4,4,1406,0,0.550137,0.379200,0.515377,1.750650,0.603519,0.084289,0.560092,...,0.569079,0.001034,0.368143,0.073117,0.922740,0,5,2011,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4206,4206,54984,0,0.542723,0.373749,0.550308,0.392159,0.590448,0.069310,0.550825,...,0.597256,2.138156,1.115053,0.038556,0.943278,0,1,2015,1,0
4207,4207,54984,0,0.648290,0.384537,0.517096,0.406113,0.625153,0.078262,0.558577,...,0.512141,0.184817,0.365182,0.017708,0.965658,0,2,2016,1,0
4208,4208,54984,0,0.620559,0.393311,0.512871,0.408022,0.631175,0.085568,0.561392,...,0.584074,0.132024,0.365182,0.028232,0.945022,0,3,2017,1,0
4209,4209,55222,0,0.542723,0.403193,0.494717,0.381386,0.568214,0.110941,0.537031,...,0.567425,0.366094,0.375811,0.138021,1.000000,0,1,2015,0,1


In [4]:
# Descriptive labels for the generic feature columns x1..x26, listed in
# column order so that position i (1-based) corresponds to column "x{i}".
ratio_names = [
    "Return on Equity",
    "Assets Ratio",
    "Rate Debt Security",
    "EBITDA",
    "Return on Assets",
    "Working Capital Ratio",
    "Net Profit / Current Liabilities",
    "(Net Profit + Depreciation) / Total Liabilities",
    "1 / Debt Ratio",
    "(Total Liabilities – Cash and Cash Equivalents) / Profit on Sales",
    "Receivables Turnover / Profit on Sales",
    "EBITDA / Profit on Sales",
    "(Cost of Sales + General and Administrative Costs + Operating Expenses) / Profit",
    "Salaries / Net Profit",
    "Foreign Service / Net Profit",
    "Foreign Service / Profit on Sales",
    "Balance Sheet Change in Cash",
    "Profit on Sales / Non-current Assets",
    "Cash and Cash Equivalents / Current Liabilities",
    "Non-current Liabilities / Equity Shareholders of the Parent",
    "Cash Flow from Investing Activities / Equity Shareholders of the Parent",
    "Intangible Assets / Share Capital",
    "Trade Payables / Equity Shareholders of the Parent",
    "(Current Assets – Inventories) / Current Liabilities",
    "Operating Expenses / Current Liabilities",
    "Polish Accounting Standards (1 - Yes, 0 - No)",
]

# Map "x1" -> first label, "x2" -> second label, ... "x26" -> last label.
column_mapping = {
    f"x{position}": label
    for position, label in enumerate(ratio_names, start=1)
}

# Rename in place; columns that are not keys of the mapping are left as-is.
df.rename(columns=column_mapping, inplace=True)


In [10]:
# Preview the first 50 rows of the frame (columns carry the descriptive names).
df.head(50)

Unnamed: 0,obs_id,class,default,Return on Equity,Assets Ratio,Rate Debt Security,EBITDA,Return on Assets,Working Capital Ratio,Net Profit / Current Liabilities,...,Cash Flow from Investing Activities / Equity Shareholders of the Parent,Intangible Assets / Share Capital,Trade Payables / Equity Shareholders of the Parent,(Current Assets – Inventories) / Current Liabilities,Operating Expenses / Current Liabilities,"Polish Accounting Standards (1 - Yes, 0 - No)",time,year,testing_set,training_set
0,0,1406,0,0.54869,0.378213,0.516406,1.608842,0.602419,0.065173,0.558509,...,0.537593,0.001444,0.382384,0.033177,1.0,0,1,2007,1,0
1,1,1406,0,0.547813,0.378383,0.516522,1.699176,0.601911,0.062662,0.55795,...,0.572315,0.001311,0.380498,0.028798,1.0,0,2,2008,1,0
2,2,1406,0,0.549332,0.378818,0.514555,1.734131,0.603207,0.063617,0.559306,...,0.583454,0.001697,0.381134,0.03069,1.0,0,3,2009,1,0
3,3,1406,0,0.552237,0.379319,0.512808,1.852136,0.605443,0.078868,0.565012,...,0.583256,0.001229,0.368001,0.062207,0.903143,0,4,2010,1,0
4,4,1406,0,0.550137,0.3792,0.515377,1.75065,0.603519,0.084289,0.560092,...,0.569079,0.001034,0.368143,0.073117,0.92274,0,5,2011,1,0
5,5,1406,0,0.549124,0.378952,0.518033,1.72885,0.602687,0.087869,0.559899,...,0.561421,0.000874,0.367126,0.076073,0.911897,0,6,2012,1,0
6,6,1406,0,0.552528,0.379132,0.516231,1.950204,0.605176,0.089473,0.56421,...,0.570003,0.000829,0.368752,0.083709,0.909606,0,7,2013,1,0
7,7,1406,0,0.549654,0.379196,0.516934,1.832592,0.603282,0.099034,0.563838,...,0.576427,0.0011,0.36731,0.094833,0.871362,0,8,2014,1,0
8,8,1406,0,0.549135,0.379366,0.51626,1.807788,0.602996,0.100316,0.563515,...,0.580049,0.001086,0.367716,0.105076,0.8617,0,9,2015,1,0
9,9,1406,0,0.549027,0.379628,0.514962,1.799639,0.603081,0.065141,0.555303,...,0.583159,0.001143,0.373799,0.016522,0.960024,0,10,2016,1,0


In [5]:
import pandas as pd

# Boolean mask selecting the rows flagged as defaults (default == 1).
is_default = df["default"] == 1
default_ones = df[is_default]

# Plain-text dump of the defaulted rows.
print(default_ones)

# Also write the subset to disk, without the index column.
default_ones.to_csv("default_1_records.csv", index=False)


      obs_id  class  default  Return on Equity  Assets Ratio  \
36        36   2797        1          0.542723      0.370342   
40        40   3567        1          0.498116      0.370632   
46        46   4111        1          0.541216      0.373885   
60        60   5107        1          0.542723      0.355509   
73        73   7206        1          0.584670      0.384291   
...      ...    ...      ...               ...           ...   
4039    4039  46149        1          0.577616      0.399039   
4065    4065  46604        1          0.542723      0.369371   
4095    4095  48508        1          0.581610      0.407759   
4114    4114  49310        1          0.542723      0.366878   
4137    4137  50383        1          0.590292      0.376887   

      Rate Debt Security    EBITDA  Return on Assets  Working Capital Ratio  \
36              0.501332  0.388901          0.500874               0.052440   
40              0.313678  0.348045          0.578422               0.0525

In [None]:
from sklearn.decomposition import PCA

# Number of principal components to keep; defined once so the PCA object and
# the generated column labels cannot drift apart.
N_PCA_COMPONENTS = 15

# Build the feature matrix here so the cell does not depend on `X` having been
# created by a cell further down the notebook (it would raise NameError on a
# fresh Restart & Run All).
X = df.drop(columns=['default', 'training_set', 'testing_set'])

# PCA only centres the data; it does not rescale it. The columns are on very
# different scales (e.g. obs_id / class / year versus the ratio columns), so
# standardise first - otherwise the large-magnitude columns dominate the
# leading components.
X_standardized = StandardScaler().fit_transform(X)

pca = PCA(n_components=N_PCA_COMPONENTS)
X_pca = pca.fit_transform(X_standardized)

# Convert back to a DataFrame, keeping the original row index so rows stay
# traceable to `df`.
pca_columns = [f'PC{i+1}' for i in range(N_PCA_COMPONENTS)]
df_pca = pd.DataFrame(X_pca, columns=pca_columns, index=X.index)

# Add the target back positionally (row order is unchanged by the transform),
# so the assignment does not rely on index alignment.
df_pca['default'] = df['default'].to_numpy()

# Explained variance ratio
print("Explained variance per principal component:", pca.explained_variance_ratio_)
print("Cumulative explained variance:", np.cumsum(pca.explained_variance_ratio_))

In [None]:
# Smallest number of leading components whose cumulative explained variance
# reaches 90%.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
reaches_target = cumulative_variance >= 0.90

if reaches_target.any():
    # argmax on a boolean array returns the position of the first True.
    num_components = int(np.argmax(reaches_target)) + 1
    print(f"Number of components for 90% variance: {num_components}")
else:
    # Without this guard argmax would return 0 for an all-False mask and the
    # cell would wrongly report that a single component is enough.
    num_components = None
    print(
        f"90% variance is not reached with the {len(cumulative_variance)} fitted components "
        f"(maximum cumulative variance: {cumulative_variance[-1]:.4f})"
    )


In [None]:
# Target vector: the binary default flag.
y = df['default']

# Feature matrix: everything except the target and the two split-indicator columns.
non_feature_columns = ['default', 'training_set', 'testing_set']
X = df.drop(columns=non_feature_columns)

In [None]:
# Render the feature matrix as the cell output.
X

In [None]:
# Row masks taken from the split-indicator columns that ship with the dataset.
in_training = df['training_set'] == 1
in_testing = df['testing_set'] == 1

# Partition features and target with the same masks.
X_train, y_train = X[in_training], y[in_training]
X_test, y_test = X[in_testing], y[in_testing]

In [None]:
# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Step 4: Hyperparameter tuning with GridSearchCV
# Candidate values explored for each random-forest hyperparameter.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search, 5-fold cross-validation, ranked by ROC AUC, using all cores.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the estimator refit with the best-scoring parameter combination.
model = grid_search.best_estimator_

In [None]:
# Hard class predictions for the per-class precision / recall / F1 report.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# ROC AUC is a ranking metric: it must be computed from continuous scores
# (probability of the positive class), not from thresholded 0/1 labels, which
# would collapse the ROC curve to a single operating point.
y_score = model.predict_proba(X_test)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, y_score))

In [None]:
# Pair each feature column with the importance reported by the fitted model.
feature_names = X.columns
feature_importances = model.feature_importances_

# Horizontal bar chart: one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importances, y=feature_names, ax=ax)
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature Name")
ax.set_title("Feature Importance in RandomForest Model")
plt.show()

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Reload the raw data so this cell is self-contained.
df = pd.read_csv('/kaggle/input/newconnect-market-corporate-default-prediction/dataset_csv.csv', sep='\t')

# Rename columns
column_mapping = { "x1": "Return on Equity", "x2": "Assets Ratio", "x3": "Rate Debt Security", "x4": "EBITDA",
                   "x5": "Return on Assets", "x6": "Working Capital Ratio", "x7": "Net Profit / Current Liabilities",
                   "x8": "(Net Profit + Depreciation) / Total Liabilities", "x9": "1 / Debt Ratio",
                   "x10": "(Total Liabilities – Cash and Cash Equivalents) / Profit on Sales",
                   "x11": "Receivables Turnover / Profit on Sales", "x12": "EBITDA / Profit on Sales",
                   "x13": "(Cost of Sales + General and Administrative Costs + Operating Expenses) / Profit",
                   "x14": "Salaries / Net Profit", "x15": "Foreign Service / Net Profit",
                   "x16": "Foreign Service / Profit on Sales", "x17": "Balance Sheet Change in Cash",
                   "x18": "Profit on Sales / Non-current Assets", "x19": "Cash and Cash Equivalents / Current Liabilities",
                   "x20": "Non-current Liabilities / Equity Shareholders of the Parent",
                   "x21": "Cash Flow from Investing Activities / Equity Shareholders of the Parent",
                   "x22": "Intangible Assets / Share Capital", "x23": "Trade Payables / Equity Shareholders of the Parent",
                   "x24": "(Current Assets – Inventories) / Current Liabilities",
                   "x25": "Operating Expenses / Current Liabilities",
                   "x26": "Polish Accounting Standards (1 - Yes, 0 - No)" }

df.rename(columns=column_mapping, inplace=True)

# Continuous columns to impute and standardise: the labels of x1..x25, in
# order. x26 is a 0/1 indicator and is left untouched. Deriving the list from
# the mapping keeps the two from drifting apart.
num_cols = [label for key, label in column_mapping.items() if key != "x26"]

# Define features and target variable.
# NOTE(review): X still contains obs_id and class, which appear to be row /
# entity identifiers rather than financial ratios - confirm whether they
# should be model inputs.
X = df.drop(columns=['default', 'training_set', 'testing_set'])
y = df['default']

# Split into train and test sets (stratified so both keep the default rate).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Work on explicit copies: the frames returned by the split are written to
# below, and assigning into them directly can trigger SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Ensure X_train and X_test contain numerical columns
num_cols = [col for col in num_cols if col in X_train.columns]

# Impute missing values with the TRAINING median, computed once and reused for
# both partitions so no test-set statistics leak into preprocessing.
train_medians = X_train[num_cols].median()
X_train[num_cols] = X_train[num_cols].fillna(train_medians)
X_test[num_cols] = X_test[num_cols].fillna(train_medians)

# Standardize numerical features (statistics learned from the training rows only).
scaler = StandardScaler()
X_train.loc[:, num_cols] = scaler.fit_transform(X_train[num_cols])
X_test.loc[:, num_cols] = scaler.transform(X_test[num_cols])



In [10]:
# Render the training feature frame as the cell output.
X_train

Unnamed: 0,obs_id,class,Return on Equity,Assets Ratio,Rate Debt Security,EBITDA,Return on Assets,Working Capital Ratio,Net Profit / Current Liabilities,(Net Profit + Depreciation) / Total Liabilities,...,Cash and Cash Equivalents / Current Liabilities,Non-current Liabilities / Equity Shareholders of the Parent,Cash Flow from Investing Activities / Equity Shareholders of the Parent,Intangible Assets / Share Capital,Trade Payables / Equity Shareholders of the Parent,(Current Assets – Inventories) / Current Liabilities,Operating Expenses / Current Liabilities,"Polish Accounting Standards (1 - Yes, 0 - No)",time,year
1266,1266,30017,0.220464,-0.083348,0.073396,0.633505,0.195507,-0.032682,0.020399,0.091067,...,-0.176315,-0.006631,0.061766,-0.028476,-0.001209,-0.182918,0.074695,0,4,2012
2520,2520,35925,-0.484719,-0.127758,0.499948,-0.261956,-0.003642,-0.045324,0.018804,0.050407,...,-0.178326,0.082005,0.112727,0.268260,-0.078140,-0.298979,0.069541,0,8,2016
2661,2661,36599,0.014707,-0.119479,-0.057919,-0.879372,-0.895064,-0.042279,0.011647,-0.262881,...,-0.165044,-0.064196,0.034101,-0.310885,0.021485,-0.222364,0.074695,0,1,2009
2506,2506,35924,0.014707,-0.104035,-0.050925,-0.493811,0.025715,0.008869,-0.057297,-3.757609,...,-0.162520,-0.185694,0.095926,-0.370965,-0.078827,0.162470,0.047229,0,1,2011
2519,2519,35925,-0.236269,-0.125686,0.867555,-0.253693,0.054103,-0.042297,0.018000,0.047798,...,-0.177626,0.556739,-0.203551,0.343499,-0.076462,-0.336172,0.064560,0,7,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,290,22895,0.182993,-0.078869,-0.291637,-0.267344,0.214348,0.128449,0.036826,0.916188,...,0.667478,-0.185694,0.154495,-0.369776,-0.078749,0.682030,-0.083197,0,5,2011
1545,1545,31914,0.014707,-0.179614,-1.950675,-0.311351,0.119587,-0.040258,0.019707,0.050113,...,-0.178368,-0.185694,0.112727,-0.370965,-0.079131,-0.322820,0.074492,0,8,2017
4074,4074,47814,0.110844,-0.124709,0.353030,-0.184941,0.142894,-0.038256,0.019875,0.055481,...,-0.178368,0.668589,0.030415,2.411371,-0.079131,-0.294970,0.074695,0,3,2016
2995,2995,38170,-0.092713,-0.095977,-0.187046,-0.292233,0.098633,-0.034960,0.019361,0.032988,...,-0.102220,-0.185694,-0.494896,-0.352875,-0.023729,-0.051755,0.074695,0,2,2011


In [3]:
pip install xgboost scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed, trained on every feature column.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Left as the final expression so the notebook renders the fitted estimator.
rf_model.fit(X=X_train, y=y_train)


In [6]:
# Importance score per training column, as reported by the fitted random forest.
importances = rf_model.feature_importances_
feature_importance_dict = {
    column: weight for column, weight in zip(X_train.columns, importances)
}

# (feature, importance) pairs ordered from most to least important.
sorted_features = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report the ten highest-ranked features.
top_ten = sorted_features[:10]
for name, score in top_ten:
    print(f"{name}: {score:.4f}")


EBITDA: 0.0465
Rate Debt Security: 0.0444
Net Profit / Current Liabilities: 0.0443
1 / Debt Ratio: 0.0440
(Current Assets – Inventories) / Current Liabilities: 0.0435
Balance Sheet Change in Cash: 0.0406
Assets Ratio: 0.0405
Working Capital Ratio: 0.0401
(Total Liabilities – Cash and Cash Equivalents) / Profit on Sales: 0.0389
(Net Profit + Depreciation) / Total Liabilities: 0.0375


In [7]:
# Define threshold (e.g., keep features with importance >= 0.03)
threshold = 0.03
important_features = [feature for feature, importance in sorted_features if importance >= threshold]

# Filter dataset
X_train_selected = X_train[important_features]
X_test_selected = X_test[important_features]

print(f"Reduced from {X_train.shape[1]} to {X_train_selected.shape[1]} features.")


Reduced from 30 to 21 features.


In [7]:
# Refit the same estimator object on the reduced feature set (this replaces
# the previously fitted state), then measure accuracy on the held-out rows.
rf_model.fit(X_train_selected, y_train)
new_accuracy = rf_model.score(X=X_test_selected, y=y_test)
print(f"New Model Accuracy: {new_accuracy:.4f}")


New Model Accuracy: 0.9609


In [4]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# --- Baseline: random forest trained on every feature column ---------------
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
# AUC uses the predicted probability of the positive class (second column).
baseline_positive_proba = rf_model.predict_proba(X_test)[:, 1]
baseline_auc = roc_auc_score(y_test, baseline_positive_proba)

print(f"Baseline Accuracy: {baseline_accuracy:.4f}")
print(f"Baseline AUC-ROC: {baseline_auc:.4f}")

# --- Rank features by the baseline model's importance scores ---------------
importances = rf_model.feature_importances_
feature_importance_dict = {
    column: weight for column, weight in zip(X_train.columns, importances)
}
sorted_features = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

print("\nTop 10 Most Important Features:")
for name, score in sorted_features[:10]:
    print(f"{name}: {score:.4f}")

# --- Keep only features whose importance reaches the threshold -------------
threshold = 0.03
important_features = [name for name, score in sorted_features if score >= threshold]

X_train_selected = X_train.loc[:, important_features]
X_test_selected = X_test.loc[:, important_features]

print(f"\nReduced features from {X_train.shape[1]} to {X_train_selected.shape[1]}.")

# --- Refit the same estimator on the reduced set and re-evaluate -----------
# The baseline fit held by rf_model is overwritten here.
rf_model.fit(X_train_selected, y_train)

y_pred_selected = rf_model.predict(X_test_selected)
new_accuracy = accuracy_score(y_test, y_pred_selected)
selected_positive_proba = rf_model.predict_proba(X_test_selected)[:, 1]
new_auc = roc_auc_score(y_test, selected_positive_proba)

print(f"\nNew Model Accuracy: {new_accuracy:.4f}")
print(f"New Model AUC-ROC: {new_auc:.4f}")

# --- Compare the two runs on AUC -------------------------------------------
improvement = new_auc - baseline_auc
if not improvement > 0:
    print(f"\n⚠️ No improvement, consider adjusting the threshold or trying XGBoost.")
else:
    print(f"\n✅ Model improved by {improvement:.4f} AUC points!")


Baseline Accuracy: 0.9620
Baseline AUC-ROC: 0.7982

Top 10 Most Important Features:
EBITDA: 0.0465
Rate Debt Security: 0.0444
Net Profit / Current Liabilities: 0.0443
1 / Debt Ratio: 0.0440
(Current Assets – Inventories) / Current Liabilities: 0.0435
Balance Sheet Change in Cash: 0.0406
Assets Ratio: 0.0405
Working Capital Ratio: 0.0401
(Total Liabilities – Cash and Cash Equivalents) / Profit on Sales: 0.0389
(Net Profit + Depreciation) / Total Liabilities: 0.0375

Reduced features from 30 to 21.

New Model Accuracy: 0.9609
New Model AUC-ROC: 0.7747

⚠️ No improvement, consider adjusting the threshold or trying XGBoost.


In [12]:
import xgboost as xgb

# Gradient-boosted classifier with 100 rounds and a fixed seed, trained on all columns.
xgb_model = xgb.XGBClassifier(random_state=42, n_estimators=100)
xgb_model.fit(X_train, y_train)

# Importance score per training column, as reported by the fitted booster.
xgb_importances = xgb_model.feature_importances_
xgb_feature_importance_dict = {
    column: weight for column, weight in zip(X_train.columns, xgb_importances)
}

# Rank from most to least important and show the ten highest.
sorted_xgb_features = sorted(
    xgb_feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print("\nXGBoost Top 10 Features:")
for name, score in sorted_xgb_features[:10]:
    print(f"{name}: {score:.4f}")



XGBoost Top 10 Features:
Assets Ratio: 0.0885
time: 0.0862
Polish Accounting Standards (1 - Yes, 0 - No): 0.0604
Net Profit / Current Liabilities: 0.0502
Cash Flow from Investing Activities / Equity Shareholders of the Parent: 0.0479
(Cost of Sales + General and Administrative Costs + Operating Expenses) / Profit: 0.0404
year: 0.0374
EBITDA: 0.0363
Foreign Service / Profit on Sales: 0.0361
Cash and Cash Equivalents / Current Liabilities: 0.0347


In [16]:
import xgboost as xgb

# Same classifier configuration as before, this time trained on the
# oversampled training data.
# NOTE(review): X_train_resampled / y_train_resampled are created by the SMOTE
# cell that sits at the end of this notebook, so this cell fails on a
# top-to-bottom run unless that cell is moved above it.
xgb_model = xgb.XGBClassifier(random_state=42, n_estimators=100)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Importance scores are labelled with X_train's column names.
# NOTE(review): this presumes X_train_resampled has the same columns in the
# same order as X_train - confirm.
xgb_importances = xgb_model.feature_importances_
xgb_feature_importance_dict = {
    column: weight for column, weight in zip(X_train.columns, xgb_importances)
}

# Rank from most to least important and show the ten highest.
sorted_xgb_features = sorted(
    xgb_feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print("\nXGBoost Top 10 Features:")
for name, score in sorted_xgb_features[:10]:
    print(f"{name}: {score:.4f}")



XGBoost Top 10 Features:
Foreign Service / Net Profit: 0.1893
Return on Assets: 0.0792
time: 0.0525
Operating Expenses / Current Liabilities: 0.0506
year: 0.0453
Assets Ratio: 0.0438
Foreign Service / Profit on Sales: 0.0406
Net Profit / Current Liabilities: 0.0388
Rate Debt Security: 0.0369
Cash Flow from Investing Activities / Equity Shareholders of the Parent: 0.0350


In [23]:
# Define a threshold for important features
threshold = 0.03
important_xgb_features = [feature for feature, importance in sorted_xgb_features if importance >= threshold]

# Print the number of selected features
print(f"\nSelected {len(important_xgb_features)} features from {X_train_resampled.shape[1]}.")



Selected 12 features from 30.


In [24]:
# Show the names of the features that passed the importance threshold.
print(important_xgb_features)

['Foreign Service / Net Profit', 'Return on Assets', 'time', 'Operating Expenses / Current Liabilities', 'year', 'Assets Ratio', 'Foreign Service / Profit on Sales', 'Net Profit / Current Liabilities', 'Rate Debt Security', 'Cash Flow from Investing Activities / Equity Shareholders of the Parent', 'Trade Payables / Equity Shareholders of the Parent', '(Cost of Sales + General and Administrative Costs + Operating Expenses) / Profit']


In [25]:
# Keep only the selected columns: oversampled rows for training, untouched
# held-out rows for testing.
X_train_selected_xgb = X_train_resampled.loc[:, important_xgb_features]
X_test_selected_xgb = X_test.loc[:, important_xgb_features]


In [26]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Refit the existing classifier on the reduced, oversampled training data.
xgb_model.fit(X_train_selected_xgb, y_train_resampled)

# Hard predictions feed the accuracy figure.
y_pred_selected_xgb = xgb_model.predict(X_test_selected_xgb)
new_accuracy_xgb = accuracy_score(y_test, y_pred_selected_xgb)

# Positive-class probabilities (second column) feed the AUC figure.
positive_proba_xgb = xgb_model.predict_proba(X_test_selected_xgb)[:, 1]
new_auc_xgb = roc_auc_score(y_test, positive_proba_xgb)

print(f"\nNew Model Accuracy (XGBoost): {new_accuracy_xgb:.4f}")
print(f"New Model AUC-ROC (XGBoost): {new_auc_xgb:.4f}")



New Model Accuracy (XGBoost): 0.9276
New Model AUC-ROC (XGBoost): 0.7477


In [None]:
# Serialise the fitted scaler to disk.
with open("scaler.pkl", "wb") as scaler_file:
    scaler_file.write(pickle.dumps(scaler))

In [None]:
import pickle

# Save model
# Persist the fitted random-forest estimator itself. The previous version
# pickled y_pred_selected_xgb (an array of XGBoost predictions) under this
# file name, so loading "rf_model.pkl" did not give back a model at all.
# NOTE(review): rf_model holds whatever fit was run last on it (the most
# recent fit in this notebook uses the importance-selected columns) - confirm
# that this is the state you want to ship.
with open("rf_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)



In [22]:
import pickle

# Serialise the fitted XGBoost classifier to disk.
with open("xgb_model_new.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(xgb_model))


In [27]:
import pickle
from sklearn.preprocessing import StandardScaler

# Fit a fresh scaler on the selected-feature training frame.
# NOTE(review): X_train_selected_xgb is sliced from the SMOTE-resampled frame,
# which in turn is built from an X_train whose ratio columns were already
# standardised. The statistics learned here therefore may not describe the raw
# inputs, and the model itself was not trained on this scaler's output -
# confirm this is the transformation intended for inference.
scaler_new = StandardScaler().fit(X_train_selected_xgb)

# Serialise the new scaler to disk.
with open("scaler_news.pkl", "wb") as scaler_file:
    pickle.dump(scaler_new, scaler_file)


In [7]:
from imblearn.over_sampling import SMOTE

# Target is the default flag; features are every other column of df.
# NOTE(review): this rebinds X and y, and X keeps the training_set /
# testing_set indicator columns that the other cells drop - confirm intended.
y = df["default"]
X = df.drop(columns=["default"])

# Oversample the minority class across the whole frame.
# NOTE(review): resampling before the train/test split lets synthetic points
# derived from test rows reach training; the later cell that resamples
# X_train only looks like the safer variant - confirm whether this cell is
# still needed.
smote = SMOTE(random_state=42, sampling_strategy="auto")
X_resampled, y_resampled = smote.fit_resample(X, y)

In [11]:
from collections import Counter
from imblearn.over_sampling import SMOTE

# Class counts in the training target before oversampling.
counts_before = Counter(y_train)
print("Original class distribution:", counts_before)

# Oversample the training partition only, with a fixed seed.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts after oversampling.
counts_after = Counter(y_train_resampled)
print("Resampled class distribution:", counts_after)


Original class distribution: Counter({0: 3234, 1: 134})
Resampled class distribution: Counter({0: 3234, 1: 3234})
