In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Read the prepared dataset once; keep the freshly loaded frame as an untouched
# reference and do all later work on an independent copy.
df_orig = pd.read_csv('final_data.csv')
df = df_orig.copy()

# Column / dtype / non-null overview of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207300 entries, 0 to 207299
Data columns (total 59 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   207300 non-null  int64  
 1   TARGET                       207300 non-null  int64  
 2   NAME_CONTRACT_TYPE           207300 non-null  object 
 3   FLAG_OWN_CAR                 207300 non-null  object 
 4   FLAG_OWN_REALTY              207300 non-null  object 
 5   CNT_CHILDREN                 207300 non-null  int64  
 6   AMT_INCOME_TOTAL             207300 non-null  float64
 7   AMT_ANNUITY                  207297 non-null  float64
 8   NAME_INCOME_TYPE             207300 non-null  object 
 9   NAME_EDUCATION_TYPE          207300 non-null  object 
 10  NAME_FAMILY_STATUS           207300 non-null  object 
 11  NAME_HOUSING_TYPE            207300 non-null  object 
 12  REGION_POPULATION_RELATIVE   207300 non-null  float64
 13 

In [3]:
# --- Numeric coercion --------------------------------------------------------
# Columns forced to a numeric dtype; values that cannot be parsed become NaN.
cols_to_float = [
    'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1',
    'YEARS_BEGINEXPLUATATION_AVG', 'FLOORSMAX_AVG', 'LIVINGAREA_AVG',
    'LIVINGAREA_MEDI', 'TOTALAREA_MODE', 'DEF_30_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE',
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_YEAR', 'DAYS_CREDIT_min',
    'DAYS_CREDIT_ENDDATE_mean', 'AMT_CREDIT_SUM_mean',
    'AMT_CREDIT_SUM_DEBT_mean'
]

for col in cols_to_float:
    if col in df.columns:  # tolerate columns absent from this extract
        df[col] = pd.to_numeric(df[col], errors='coerce')

# --- Zero-as-missing ---------------------------------------------------------
# For these columns a literal 0 is treated as "no data" and replaced by NaN.
cols_where_0_is_missing = [
    'DAYS_CREDIT_min', 'DAYS_CREDIT_ENDDATE_mean',
    'AMT_CREDIT_SUM_mean', 'AMT_CREDIT_SUM_DEBT_mean'
]

for col in cols_where_0_is_missing:
    if col in df.columns:
        df[col] = df[col].replace(0, np.nan)

# --- Categorical handling ----------------------------------------------------
# Reference list of the columns regarded as categorical.
# NOTE: nothing in this cell reads `cols_to_object`; only `binary_cols` below is
# actually converted, so TARGET keeps its numeric dtype.
cols_to_object = [
    'TARGET',  # Treat as categorical for WOE
    'NAME_CONTRACT_TYPE',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE',
    'FLAG_EMP_PHONE',         # binary (0/1)
    'FLAG_WORK_PHONE',        # binary
    'FLAG_CONT_MOBILE',       # binary
    'FLAG_EMAIL',             # binary
    'REG_REGION_NOT_WORK_REGION',  # binary
    'REG_CITY_NOT_LIVE_CITY',      # binary
    'REG_CITY_NOT_WORK_CITY',      # binary
]

# Flag columns that are re-encoded as strings so they count as categorical.
binary_cols = [
    'FLAG_EMP_PHONE',
    'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE',
    'FLAG_EMAIL',
    'REG_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY'
]

for col in binary_cols:
    if col in df.columns:  # convert only if the column exists
        was_missing = df[col].isna()
        # astype(str) can render NaN as the literal text 'nan', which the
        # fillna("Missing") pass below cannot detect; tag those cells here.
        df[col] = df[col].astype(str).astype('object').mask(was_missing, "Missing")

# Fill missing values for all categorical (object) columns with "Missing"
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    df[col] = df[col].fillna("Missing")


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207300 entries, 0 to 207299
Data columns (total 59 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   207300 non-null  int64  
 1   TARGET                       207300 non-null  int64  
 2   NAME_CONTRACT_TYPE           207300 non-null  object 
 3   FLAG_OWN_CAR                 207300 non-null  object 
 4   FLAG_OWN_REALTY              207300 non-null  object 
 5   CNT_CHILDREN                 207300 non-null  int64  
 6   AMT_INCOME_TOTAL             207300 non-null  float64
 7   AMT_ANNUITY                  207297 non-null  float64
 8   NAME_INCOME_TYPE             207300 non-null  object 
 9   NAME_EDUCATION_TYPE          207300 non-null  object 
 10  NAME_FAMILY_STATUS           207300 non-null  object 
 11  NAME_HOUSING_TYPE            207300 non-null  object 
 12  REGION_POPULATION_RELATIVE   207300 non-null  float64
 13 

In [4]:
def calculate_iv_all(df, target_col='TARGET', bins=10):
    """Score every feature of ``df`` by Information Value (IV) against the target.

    Each column other than ``target_col`` and ``'SK_ID_CURR'`` is bucketed:
    numeric columns (any numeric dtype except bool) into up to ``bins``
    quantile bins via ``pd.qcut(..., duplicates='drop')``, all other columns
    by their string value. Rows with a missing feature value form their own
    bucket. Per bucket, with events = sum of the target and non-events =
    non-null target count minus events::

        WOE = ln((%non_events + 1e-6) / (%events + 1e-6))
        IV  = (%non_events - %events) * WOE

    and the feature's IV is the sum over its buckets.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing ``target_col``. It is not modified.
    target_col : str, default 'TARGET'
        Name of the target column; it is coerced with ``pd.to_numeric`` and
        rows whose target cannot be parsed are excluded from all counts.
    bins : int, default 10
        Number of quantiles requested for numeric features.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Variable'`` and ``'IV'``, sorted by IV descending. Empty
        (but with both columns) when no feature could be scored.

    Notes
    -----
    A numeric column for which ``pd.qcut`` raises ``ValueError`` is reported
    with a printed message and left out of the result. No guard is applied
    when the target holds a single class (the shares then divide by zero).
    """
    smoothing = 1e-6  # keeps the log ratio finite when a bucket has a zero share
    features = [col for col in df.columns if col not in (target_col, 'SK_ID_CURR')]

    # Target-derived quantities are the same for every feature: compute once.
    target = pd.to_numeric(df[target_col], errors='coerce')
    total_events = target.sum()
    # Use the non-null target count (not len) so that the per-bucket
    # non-event shares add up to 1 even if some targets are missing.
    total_non_events = target.count() - total_events

    iv_summary = []
    for col in features:
        values = df[col]

        is_numeric = (
            pd.api.types.is_numeric_dtype(values)
            and not pd.api.types.is_bool_dtype(values)
        )
        if is_numeric:
            try:
                binned = pd.qcut(values, q=bins, duplicates='drop')
            except ValueError:
                print(f"Skipping {col} (not enough unique values to bin)")
                continue
            # Label bins as plain strings so the group keys are one type.
            bin_labels = binned.astype(str).where(binned.notna(), 'Missing')
        else:
            # "Missing" placeholders are folded in with real NaNs.
            bin_labels = values.replace('Missing', np.nan).astype(str)
            bin_labels = bin_labels.astype(object).fillna('Missing')

        # Group positionally (by array) so index quirks cannot misalign rows.
        per_bin = target.groupby(bin_labels.to_numpy()).agg(['count', 'sum'])
        events = per_bin['sum']
        non_events = per_bin['count'] - events

        pct_events = events / total_events
        pct_non_events = non_events / total_non_events

        woe = np.log((pct_non_events + smoothing) / (pct_events + smoothing))
        total_iv = ((pct_non_events - pct_events) * woe).sum()
        iv_summary.append({'Variable': col, 'IV': total_iv})

    # Explicit columns keep sort_values valid when nothing was scored.
    iv_df = pd.DataFrame(iv_summary, columns=['Variable', 'IV'])
    return iv_df.sort_values(by='IV', ascending=False)

In [5]:
# Rank all features by Information Value, requesting 20 quantile bins for the
# numeric ones, and print the resulting table.
n_quantile_bins = 20
iv_df = calculate_iv_all(df, target_col='TARGET', bins=n_quantile_bins)
print(iv_df)

                       Variable        IV
25                 EXT_SOURCE_1  0.156003
38                LOAN_TO_VALUE  0.091847
37          RATIO_LIFE_EMPLOYED  0.088309
17              OCCUPATION_TYPE  0.080222
40              DAYS_CREDIT_min  0.078573
41     DAYS_CREDIT_ENDDATE_mean  0.075526
24            ORGANIZATION_TYPE  0.068933
42        DAYS_ENDDATE_FACT_min  0.066649
6              NAME_INCOME_TYPE  0.050930
33       DAYS_LAST_PHONE_CHANGE  0.048573
7           NAME_EDUCATION_TYPE  0.048123
10   REGION_POPULATION_RELATIVE  0.040241
27                FLOORSMAX_AVG  0.039137
12              DAYS_ID_PUBLISH  0.037855
30               TOTALAREA_MODE  0.036709
29              LIVINGAREA_MEDI  0.033961
28               LIVINGAREA_AVG  0.033893
55         CREDIT_ACTIVE_Closed  0.033640
26  YEARS_BEGINEXPLUATATION_AVG  0.030477
5                   AMT_ANNUITY  0.029797
53         CREDIT_ACTIVE_Active  0.028759
23       REG_CITY_NOT_WORK_CITY  0.028133
11            DAYS_REGISTRATION  0

In [16]:
# Columns retained for modelling: the selected predictors plus the label.
important_vars = [
    'EXT_SOURCE_1',
    'LOAN_TO_VALUE',
    'RATIO_LIFE_EMPLOYED',
    'OCCUPATION_TYPE',
    'DAYS_CREDIT_min',
    'DAYS_CREDIT_ENDDATE_mean',
    'ORGANIZATION_TYPE',
    'DAYS_ENDDATE_FACT_min',
    'NAME_INCOME_TYPE',
    'DAYS_LAST_PHONE_CHANGE',
    'NAME_EDUCATION_TYPE',
    'REGION_POPULATION_RELATIVE',
    'FLOORSMAX_AVG',
    'DAYS_ID_PUBLISH',
    'TOTALAREA_MODE',
    'LIVINGAREA_MEDI',
    'TARGET'  # keeping target for supervised learning
]

# Everything present in the frame but absent from the keep-list is removed
# in place; the surviving columns keep their original order.
columns_to_discard = df.columns.difference(important_vars)
df.drop(columns=columns_to_discard, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207300 entries, 0 to 207299
Data columns (total 17 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   TARGET                      207300 non-null  int64  
 1   NAME_INCOME_TYPE            207300 non-null  object 
 2   NAME_EDUCATION_TYPE         207300 non-null  object 
 3   REGION_POPULATION_RELATIVE  207300 non-null  float64
 4   DAYS_ID_PUBLISH             207300 non-null  int64  
 5   OCCUPATION_TYPE             207300 non-null  object 
 6   ORGANIZATION_TYPE           207300 non-null  object 
 7   EXT_SOURCE_1                89268 non-null   float64
 8   FLOORSMAX_AVG               105336 non-null  float64
 9   LIVINGAREA_MEDI             104406 non-null  float64
 10  TOTALAREA_MODE              108361 non-null  float64
 11  DAYS_LAST_PHONE_CHANGE      207299 non-null  float64
 12  RATIO_LIFE_EMPLOYED         207300 non-null  float64
 13  LOAN_TO_VALUE 

In [18]:
# Write the reduced frame to disk without the row index.
output_path = "after_coarse_classing.csv"
df.to_csv(output_path, index=False)