In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
import re
import numpy as np
import pandas as pd
import scipy.stats as stats         # for zscore
from scipy.stats import ttest_ind
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import lightgbm as lgb
import warnings
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Suppress just the catastrophic cancellation warnings from scipy
warnings.filterwarnings(
    "ignore"
)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
df_train = pd.read_csv('/kaggle/input/sbi-data/HACKATHON_TRAINING_DATA/HACKATHON_TRAINING_DATA.CSV')
df_test = pd.read_csv('/kaggle/input/sbi-data/HACKATHON_TRAINING_DATA/HACKATHON_PREDICTION_DATA.CSV')

In [3]:
X = df_train.drop(columns=['TARGET'])
y = df_train['TARGET']

X_train, X_hold, y_train, y_hold = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [4]:
# === 2) Identify MCAR columns via t-test ===
num_cols = X_train.select_dtypes(include='number').columns

def is_mcar(df, col, alpha=0.05):
    mask = df[col].isnull()
    p_vals = []
    for feat in num_cols.drop(col, errors='ignore'):
        grp1 = df.loc[mask, feat].dropna()
        grp2 = df.loc[~mask, feat].dropna()
        if len(grp1) < 10 or len(grp2) < 10:
            continue
        _, p = ttest_ind(grp1, grp2, equal_var=False)
        p_vals.append(p)
    if not p_vals:
        return True
    signif = sum(p < alpha for p in p_vals)
    return (signif / len(p_vals)) < alpha

results = {}
for col in X_train.columns[X_train.isnull().any()]:
    if col in num_cols:
        results[col] = is_mcar(X_train, col)

mcar_cols = [c for c,v in results.items() if v]
mar_cols  = [c for c,v in results.items() if not v]

# === 3) Define zero_fill & knn_cols from MAR columns ===
zero_patterns = ['MNTH', 'NO_', 'PRI_', '_YR_', 'CRIFF', 'DEC_CRIFF']
zero_fill = [c for c in mar_cols if any(p in c for p in zero_patterns)]
knn_cols  = [c for c in mar_cols if c not in zero_fill]
cat_cols = X_train.select_dtypes(include='object').columns.drop(['TARGET','UNIQUE_ID'], errors='ignore')

In [5]:
# === 4) Preprocessor class ===
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, zero_fill_cols, mcar_cols, knn_cols,cat_cols,
                 frac_def=0.15, outlier_contam=0.01, corr_thresh=0.90):
        self.zero_fill_cols = zero_fill_cols
        self.mcar_cols      = mcar_cols
        self.knn_cols       = knn_cols
        self.frac_def       = frac_def
        self.outlier_contam = outlier_contam
        self.corr_thresh    = corr_thresh
        self.cat_cols       = cat_cols
    def fit(self, df, y=None):
        df0 = df.copy()
        print("Starting to fit data")
        # 1) Imputation
        df0[self.zero_fill_cols] = df0[self.zero_fill_cols].fillna(0)
        self._si_mcar = SimpleImputer(strategy='median')
        df0[self.mcar_cols] = self._si_mcar.fit_transform(df0[self.mcar_cols])
        # self._knn_imputer = KNNImputer(n_neighbors=5)
        # df0[self.knn_cols] = self._knn_imputer.fit_transform(df0[self.knn_cols])
        self._si_knn = SimpleImputer(strategy='median')
        df0[self.knn_cols] = self._si_knn.fit_transform(df0[self.knn_cols])
        self.si_cat = SimpleImputer(strategy='most_frequent')
        df0[self.cat_cols] = self.si_cat.fit_transform(df0[self.cat_cols])
        print("Imputation done")
        # 2) Flag encoding & duration parsing
        df0['SI_FLG']          = df0['SI_FLG'].map({'Y':1,'N':0})
        df0['LOCKER_HLDR_IND'] = df0['LOCKER_HLDR_IND'].map({'Y':1,'N':0})
        df0['UID_FLG']         = df0['UID_FLG'].map({'Y':1,'N':0})
        df0['INB_FLG']         = df0['INB_FLG'].map({'Y':1,'N':0})
        df0['EKYC_FLG']        = df0['EKYC_FLG'].map({'Y':1,'N':0})
        df0['KYC_FLG']         = df0['KYC_FLG'].replace({'N':0,'1':1,'2':2,'Y':2}).astype(int)

        def _parse(s):
            y,m = s.split('yrs ')
            return int(y)*12 + int(m.replace('mon',''))
        df0['AVERAGE_ACCT_AGE1']      = df0['AVERAGE_ACCT_AGE1'].apply(_parse)
        df0['CREDIT_HISTORY_LENGTH1'] = df0['CREDIT_HISTORY_LENGTH1'].apply(_parse)
        print("column parsing for those with years_months done")
        # 3) One‑hot encode remaining object cols
        cats = df0.select_dtypes(include='object').columns.tolist()
        if 'TARGET' in cats: cats.remove('TARGET')

        cats = df0.select_dtypes(include='object').columns.tolist()
        # 1) build train-side dummies and save their columns
        train_dummies = pd.get_dummies(df0[cats], drop_first=True, dtype=int)
        self._dummy_cols = train_dummies.columns.tolist()
        # 2) replace original cats with exactly those dummies
        df0 = pd.concat([df0.drop(columns=cats), train_dummies], axis=1)
        
        print("One Hot encoding done")
        # 4) Drop UNIQUE_ID
        df0 = df0.drop(columns=['UNIQUE_ID'], errors='ignore')

        # 5) Winsorize numeric tails
        num = [c for c in df0.select_dtypes(include=['int64','float64']).columns
               if c!='TARGET']
        self._winsor = {c: df0[c].quantile([0.01,0.99]).values for c in num}
        for c,(lo,hi) in self._winsor.items():
            df0[c] = df0[c].clip(lo,hi)
        print("Winzorization done")
        # 6) Yeo–Johnson on skewed continuous
        skew = df0[num].skew().abs().sort_values(ascending=False)
        cont = [c for c in skew[skew>2].index if df0[c].nunique()>10]
        self._pt = PowerTransformer(method='yeo-johnson', standardize=True)
        trans = self._pt.fit_transform(df0[cont])
        df0[cont] = np.clip(trans, -3, 3)
        
        print("Power Transformation of data done to reduce skewness")
        # 7) Drop highly correlated
        corr = df0[num].corr().abs()
        upper = corr.where(np.triu(np.ones(corr.shape),1).astype(bool))
        self._corr_drop = [c for c in upper.columns if (upper[c]>self.corr_thresh).any()]
        df0 = df0.drop(columns=self._corr_drop)

        print("Dropped columns which were highly correlated")
        # 8) Outlier flagging on TARGET==0, full coverage but fast
        nf = [c for c in df0.columns if df0[c].dtype in ['int64','float64'] and c!='TARGET']
        df_neg = df0[df0['TARGET']==0]
        
        # --- A) z-score on full negatives (cheap) ---
        z = np.abs(stats.zscore(df_neg[nf]))
        mask_z = (z > 3).any(axis=1)
        bad_z  = set(df_neg.index[mask_z])

        print("zscore calculation done to identify outliers")
        
        # --- B) IsolationForest trained on subsample, then predict on all ---
        iso = IsolationForest(contamination=self.outlier_contam, random_state=42)
        y_iso = iso.fit_predict(df_neg[nf])      # fit+predict on ~290k rows
        bad_iso = set(df_neg.index[y_iso == -1])
        print("Isolation forest calculation done to identify outliers")
        
        # --- Combine: require both methods to flag (2-of-2) ---
        drop0 = bad_z & bad_iso
        
        # --- Mix in a fraction of defaulters as before ---
        pos_idx = df0[df0['TARGET']==1].index
        n_def   = int(len(pos_idx)*self.frac_def)
        mix_idx = np.random.RandomState(42).choice(pos_idx, size=n_def, replace=False)
        
        self.outlier_idx_ = set(drop0).union(mix_idx)
        self.clean_idx_   = [i for i in df0.index if i not in self.outlier_idx_]
        self.features_    = [c for c in df0.columns if c!='TARGET']
        print("outliers cleaned")
        return self

    def transform(self, df, return_outliers=False):
        df0 = df.copy()

        # repeated steps 1–7 using fitted attributes
        df0[self.zero_fill_cols] = df0[self.zero_fill_cols].fillna(0)
        df0[self.mcar_cols]      = self._si_mcar.transform(df0[self.mcar_cols])
        # df0[self.knn_cols]       = self._knn_imputer.transform(df0[self.knn_cols])
        df0[self.knn_cols]       = self._si_knn.transform(df0[self.knn_cols])
        df0[self.cat_cols]       = self.si_cat.transform(df0[self.cat_cols])
        
        df0['SI_FLG']          = df0['SI_FLG'].map({'Y':1,'N':0})
        df0['LOCKER_HLDR_IND'] = df0['LOCKER_HLDR_IND'].map({'Y':1,'N':0})
        df0['UID_FLG']         = df0['UID_FLG'].map({'Y':1,'N':0})
        df0['INB_FLG']         = df0['INB_FLG'].map({'Y':1,'N':0})
        df0['EKYC_FLG']        = df0['EKYC_FLG'].map({'Y':1,'N':0})
        df0['KYC_FLG']         = df0['KYC_FLG'].replace({'N':0,'1':1,'2':2,'Y':2}).astype(int)
        df0['AVERAGE_ACCT_AGE1']      = df0['AVERAGE_ACCT_AGE1'].apply(lambda s: int(s.split('yrs ')[0])*12 + int(s.split('yrs ')[1].replace('mon','')))
        df0['CREDIT_HISTORY_LENGTH1'] = df0['CREDIT_HISTORY_LENGTH1'].apply(lambda s: int(s.split('yrs ')[0])*12 + int(s.split('yrs ')[1].replace('mon','')))

        cats = df0.select_dtypes(include='object').columns.tolist()
        test_dummies = pd.get_dummies(df0[cats], drop_first=True, dtype=int)

        # reindex to *exactly* the train-side columns, filling new/missing with 0
        test_dummies = test_dummies.reindex(
            columns=self._dummy_cols,
            fill_value=0
        )

        # drop the raw object-cols & concat our aligned dummies
        df0 = pd.concat(
            [df0.drop(columns=cats), test_dummies],
            axis=1
        )

        for c,(lo,hi) in self._winsor.items():
            df0[c] = df0[c].clip(lo,hi)

        cont = self._pt.feature_names_in_
        df0[cont] = np.clip(self._pt.transform(df0[cont]), -3, 3)

        df0 = df0.drop(columns=self._corr_drop, errors='ignore')

        X = df0[self.features_]

        if return_outliers and 'TARGET' in df0:
            return df0.loc[self.clean_idx_], df0.loc[list(self.outlier_idx_)]
        return X


In [6]:
# === 5) Fit & transform training data ===
prep = Preprocessor(
    zero_fill_cols=zero_fill,
    mcar_cols=mcar_cols,
    knn_cols=knn_cols,
    cat_cols=cat_cols,
    frac_def=0.15
).fit(pd.concat([X_train, y_train], axis=1))

Starting to fit data
Imputation done
column parsing for those with years_months done
One Hot encoding done
Winzorization done
Power Transformation of data done to reduce skewness
Dropped columns which were highly correlated
zscore calculation done to identify outliers
Isolation forest calculation done to identify outliers
outliers cleaned


In [7]:
df_tr_clean, df_tr_outliers = prep.transform(pd.concat([X_train, y_train], axis=1), return_outliers=True)
X_hold_prep = prep.transform(pd.concat([X_hold, y_hold], axis=1))
y_tr_clean         = df_tr_clean['TARGET']
X_tr_clean         = df_tr_clean.drop(columns=['TARGET'])
X_hold_clean       = X_hold_prep.copy()

# === 7) Summaries ===
print("Train clean shape:", df_tr_clean.shape)
print("Train outliers shape:", df_tr_outliers.shape)

Train clean shape: (255608, 109)
Train outliers shape: (6584, 109)


In [8]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

class StackingEnsembleLR:
    def __init__(self, n_folds=5, random_state=42):
        self.n_folds = n_folds
        self.random_state = random_state

    def fit(self, X, y):
        X = X.values if hasattr(X, 'values') else np.array(X)
        y = y.values if hasattr(y, 'values') else np.array(y)
        ratio = np.sum(y == 0) / np.sum(y == 1)

        self.base_models_ = [
            RandomForestClassifier(class_weight='balanced',
                                   n_estimators=200,
                                   random_state=self.random_state,
                                   n_jobs=-1),
            lgb.LGBMClassifier(class_weight='balanced',
                               n_estimators=200,
                               random_state=self.random_state,
                               verbose=-1),
            xgb.XGBClassifier(scale_pos_weight=ratio,
                              n_estimators=200,
                              use_label_encoder=False,
                              eval_metric='auc',
                              verbosity=0,
                              random_state=self.random_state)
        ]
        print("Base Models ready")
        n_models = len(self.base_models_)
        meta_train = np.zeros((X.shape[0], n_models))
        skf = StratifiedKFold(n_splits=self.n_folds,
                              shuffle=True,
                              random_state=self.random_state)
        print("Starting out of fold predictions")
        for i, model in enumerate(self.base_models_):
            oof = np.zeros(X.shape[0])
            for tr_idx, val_idx in skf.split(X, y):
                X_tr, y_tr = X[tr_idx], y[tr_idx]
                pos_idx = np.where(y_tr == 1)[0]
                neg_idx = np.where(y_tr == 0)[0]
                np.random.seed(self.random_state)
                neg_sample = np.random.choice(neg_idx, size=len(pos_idx), replace=False)
                keep = np.concatenate([pos_idx, neg_sample])
                model.fit(X_tr[keep], y_tr[keep])
                oof[val_idx] = model.predict_proba(X[val_idx])[:, 1]
            meta_train[:, i] = oof
        print("DONE training Stratified predictions model")

        # Logistic Regression
        self.meta_model_ = LogisticRegression(class_weight='balanced',
                                              max_iter=1000,
                                              random_state=self.random_state)
        self.meta_model_.fit(meta_train, y)
        print("TRAINED Logistic Regression")
        print("RETRAINING base models.....")
        for model in self.base_models_:
            model.fit(X, y)
        return self

    def predict_proba(self, X):
        X = X.values if hasattr(X, 'values') else np.array(X)
        meta_test = np.column_stack([
            model.predict_proba(X)[:, 1] for model in self.base_models_
        ])
        return self.meta_model_.predict_proba(meta_test)

    def predict(self, X):
        return (self.predict_proba(X)[:, 1] >= 0.5).astype(int)


In [9]:
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

# 0a) sanitize column names
safe_cols = {c: re.sub(r'[^0-9a-zA-Z_]', '_', c) for c in X_tr_clean.columns}
X_tr_clean   = X_tr_clean.rename(columns=safe_cols)
X_hold_clean = X_hold_clean.rename(columns=safe_cols)
train_feats = X_tr_clean.columns.tolist()

In [10]:
model = StackingEnsembleLR(n_folds=5, random_state=42)
model.fit(X_tr_clean, y_tr_clean)


Base Models ready
Starting out of fold predictions
DONE training Stratified predictions model
TRAINED Logistic Regression
RETRAINING base models.....


<__main__.StackingEnsembleLR at 0x7ce8ad5fba90>

In [11]:
X_hold_clean = X_hold_clean.reindex(columns=train_feats, fill_value=0)

In [12]:

probs = model.predict_proba(X_hold_clean)[:,1]
preds = model.predict(X_hold_clean)

print("AUC:", roc_auc_score(y_hold, probs))
print("Confusion matrix:\n", confusion_matrix(y_hold, preds))
print("Classification report:\n", classification_report(y_hold, preds))

AUC: 0.921400485662665
Confusion matrix:
 [[52884  5577]
 [ 1784  5304]]
Classification report:
               precision    recall  f1-score   support

           0       0.97      0.90      0.93     58461
           1       0.49      0.75      0.59      7088

    accuracy                           0.89     65549
   macro avg       0.73      0.83      0.76     65549
weighted avg       0.92      0.89      0.90     65549



In [14]:
import pandas as pd
df_test = pd.read_csv('/kaggle/input/sbi-data/HACKATHON_TRAINING_DATA/HACKATHON_PREDICTION_DATA.CSV')
df_test_work = df_test.copy()
X_test_clean = prep.transform(df_test_work)
X_test_clean = X_test_clean.rename(columns=safe_cols)
X_test_clean = X_test_clean.reindex(columns=train_feats, fill_value=0)
X_test_clean = X_test_clean.fillna(0)
df_test['pred'] = model.predict(X_test_clean)

df_test.to_csv('sbi_test_with_preds.csv', index=False)
df_test.to_parquet('sbi_test_with_preds.parquet', index=False)

In [15]:
df_test.head()

Unnamed: 0,ACCT_AGE,LIMIT,OUTS,ACCT_RESIDUAL_TENURE,LOAN_TENURE,INSTALAMT,SI_FLG,AGE,VINTAGE,KYC_SCR,...,CREDIT_HISTORY_LENGTH1,NO_OF_INQUIRIES1,INCOME_BAND1,AGREG_GROUP,PRODUCT_TYPE,LATEST_CR_DAYS,LATEST_DR_DAYS,TIME_PERIOD,UNIQUE_ID,pred
0,2.694,729200.0,541543.71,3.308,2192,15247.0,Y,38.915,18.765,110.0,...,4yrs 11mon,2.0,E,#Total Xpress Credit,PERSONAL LOAN,10.0,45715,FEB25,2202,0
1,5.652,980500.0,426219.82,2.349,2922,15836.0,Y,51.436,15.665,110.0,...,5yrs 6mon,0.0,F,#Total Auto Loan,AUTO LOAN,25.0,45715,FEB25,2209,0
2,5.737,980500.0,413595.82,2.265,2922,15836.0,Y,51.521,15.75,110.0,...,5yrs 6mon,0.0,F,#Total Auto Loan,AUTO LOAN,28.0,45746,MAR25,2211,0
3,6.479,735500.0,221620.79,1.607,2953,11996.0,Y,33.526,14.702,198.0,...,6yrs 4mon,0.0,F,#Total Auto Loan,AUTO LOAN,4.0,45746,MAR25,2217,0
4,6.394,735500.0,231762.79,1.692,2953,11996.0,Y,33.441,14.617,198.0,...,6yrs 4mon,0.0,F,#Total Auto Loan,AUTO LOAN,0.0,45715,FEB25,2218,0


In [17]:
df_test['pred'].value_counts()

pred
0    167031
1     24662
Name: count, dtype: int64