# Code to Oversample the Data Using ADASYN to Balance the Classes

In [None]:
import pandas as pd

def handle_missing_values(df, threshold=70):
    """Drop mostly-empty columns and median-impute the remaining numeric ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    threshold : float, default 70
        Percentage of missing values (0-100). Columns whose missing
        percentage is strictly greater than this value are dropped.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the dropped columns, in which missing values
        of every numeric column are replaced by that column's median.
        Non-numeric columns are returned as-is, so they may still contain
        missing values.
    """
    # Mean of the boolean mask is the fraction of missing cells per column.
    missing_pct = df.isna().mean() * 100

    cols_to_drop = missing_pct[missing_pct > threshold].index
    # drop() returns a new frame, so the assignment below never touches `df`.
    df_cleaned = df.drop(columns=cols_to_drop)

    # 'number' covers every numeric dtype (float32, int32, ...), not only
    # float64/int64, so no numeric column is silently left with NaNs.
    numeric_cols = df_cleaned.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        medians = df_cleaned[numeric_cols].median()
        df_cleaned[numeric_cols] = df_cleaned[numeric_cols].fillna(medians)

    return df_cleaned

# Load the raw data and clean up its missing values.
df = pd.read_csv('path/to/your/data.csv')
df_cleaned = handle_missing_values(df)

# Separate the target label from the feature matrix.
y = df_cleaned['bad_flag']
X = df_cleaned.drop(columns='bad_flag')

# Report the class counts prior to oversampling.
print(f"Class distribution before ADASYN: {y.value_counts()}")

In [2]:
from imblearn.over_sampling import ADASYN
# Oversample only the minority class; the fixed seed keeps the synthetic
# samples reproducible across runs.
adasyn = ADASYN(sampling_strategy='minority', random_state=42)
X_res, y_res = adasyn.fit_resample(X, y)

# The sampler used here is ADASYN, so the label must say ADASYN (it
# previously said SMOTE). ADASYN generates samples adaptively, so the two
# class counts end up close to each other rather than exactly equal.
print(f"Class distribution after ADASYN: {pd.Series(y_res).value_counts()}")


Class distribution after SMOTE: bad_flag
1    95958
0    95434
Name: count, dtype: int64


In [4]:
# Stitch the resampled feature columns and the target column back together
# side by side into one DataFrame.
df_resampled = pd.concat(objs=[X_res, y_res], axis="columns")
# Show summary statistics for the combined, resampled data.
df_resampled.describe()

Unnamed: 0,account_number,onus_attribute_1,transaction_attribute_1,transaction_attribute_2,transaction_attribute_3,transaction_attribute_4,transaction_attribute_5,transaction_attribute_6,transaction_attribute_7,transaction_attribute_8,...,bureau_enquiry_42,bureau_enquiry_43,bureau_enquiry_44,bureau_enquiry_45,bureau_enquiry_46,bureau_enquiry_47,bureau_enquiry_48,bureau_enquiry_49,bureau_enquiry_50,bad_flag
count,191392.0,191392.0,191392.0,191392.0,191392.0,191392.0,191392.0,191392.0,191392.0,191392.0,...,191392.0,191392.0,191392.0,191392.0,191392.0,191392.0,191392.0,191392.0,191392.0,191392.0
mean,48296.677562,119072.4,3.863388,0.001181,1.814808,31.649907,0.002837,27.811889,0.030304,7.8e-05,...,1.361336,18.693204,0.343431,5.173802,0.396779,0.0,0.27413,0.040564,9.525786,0.501369
std,25748.233316,118286.6,925.925105,0.074491,184.834121,1938.487229,0.081218,1542.483093,8.686007,0.020825,...,2.192439,16.281588,0.950287,6.797203,1.100456,0.0,0.648623,0.314437,8.791079,0.499999
min,1.0,25000.0,0.0,0.0,0.0,-109800.4766,0.0,-3498.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27105.0,64000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,0.83979,0.0,0.0,0.0,0.0,3.0,0.0
50%,48376.5,100000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.435279,14.853675,0.0,3.0,0.0,0.0,0.0,0.0,7.0,1.0
75%,69506.25,120267.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,26.700795,0.0,7.321867,0.0,0.0,0.023574,0.0,13.999563,1.0
max,96806.0,2800000.0,398936.791,25.0,55000.0,358986.0,12.0,358986.0,3150.0,8.0,...,59.0,287.0,24.0,134.0,43.0,0.0,18.0,14.0,102.0,1.0


In [None]:
# Persist the resampled data set to CSV, leaving out the row index.
df_resampled.to_csv(path_or_buf='path/to/new/csv', index=False)