In [41]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier

from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MaxAbsScaler
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier

In [11]:
# Read the competition train/test tables from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/playground_s5e11/train.csv",
        "data/playground_s5e11/test.csv",
    )
)

In [81]:
# Number of worker threads handed to each gradient-boosting model.
cur_n_jobs = 2

# Keyword arguments for XGBClassifier.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",        # evaluation metric reported by XGBoost
    "learning_rate": 0.03,       # small step size for a more stable ranking
    "n_estimators": 3000,        # large round budget (no early stopping is set in this dict)
    "max_depth": 5,              # keep trees fairly shallow
    "min_child_weight": 3,       # guards against overfitting on rare patterns
    "gamma": 0.1,                # minimum loss reduction required to split
    "subsample": 0.8,            # row sampling fraction per tree
    "colsample_bytree": 0.8,     # feature sampling fraction per tree
    "reg_alpha": 0.1,            # L1 regularisation
    "reg_lambda": 5,             # L2 regularisation
    "scale_pos_weight": 1.0,     # 1 means no re-weighting of the positive class
    "random_state": 42,
    "n_jobs": cur_n_jobs,
    "tree_method": "hist",       # histogram-based split finding
    "enable_categorical": True,  # native categorical handling (original note: XGBoost >= 1.6)
}

# Keyword arguments for LGBMClassifier.
lgmc_tree_params = {
    "n_estimators": 3000,
    "learning_rate": 0.03,
    "num_leaves": 63,
    "max_depth": -1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 5,
    "random_state": 42,
    "n_jobs": cur_n_jobs,
}

# Keyword arguments for LogisticRegression.
log_params = dict(
    solver='lbfgs',
    penalty='l2',
    C=0.3,
    max_iter=500,
    class_weight='balanced',
    random_state=42,
)


In [83]:
# Dtype-based column selectors used by both ColumnTransformers below.
cat_sel = selector(dtype_include=['object', 'category', 'bool'])
num_sel = selector(dtype_include=['number'])

# Categorical branch: fill gaps with the mode, then one-hot encode (sparse output,
# categories unseen during fit are ignored at transform time).
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
])

# Numeric branch for the tree models: median imputation only, no scaling.
num_pipe_tree = Pipeline([('imputer', SimpleImputer(strategy='median'))])

# Numeric branch for the linear models: median imputation followed by max-abs scaling.
svc_num = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', MaxAbsScaler()),
])

# Preprocessor feeding the linear models (LinearSVC / LogisticRegression).
preprcesses_svc = ColumnTransformer(transformers=[
    ('cat', cat_pipe, cat_sel),
    ('num', svc_num, num_sel),
])

# Preprocessor feeding the tree models (XGBoost / LightGBM).
preprcesses_tree = ColumnTransformer(transformers=[
    ('cat', cat_pipe, cat_sel),
    ('num', num_pipe_tree, num_sel),
])




In [87]:
# Base learners. Each one is a Pipeline of (preprocessing -> estimator), so the
# preprocessing is fitted together with the model on whatever data `fit` receives.

# m1: XGBoost on the tree-oriented preprocessing.
m1 = Pipeline(steps=[
    ('prep', preprcesses_tree),
    ('model', XGBClassifier(**xgb_params))
])

# m2: LightGBM on the tree-oriented preprocessing.
m2 = Pipeline(steps=[
    ('prep', preprcesses_tree),
    ('model', LGBMClassifier(**lgmc_tree_params))
])

# m3: LinearSVC wrapped in sigmoid calibration so that the pipeline exposes
# predict_proba (LinearSVC alone does not provide it).
m3 = Pipeline(steps=[
    ('prep', preprcesses_svc),
    ('model', CalibratedClassifierCV(
        estimator=LinearSVC(C=0.5, random_state=42),
        method='sigmoid',
        cv=5
    ))
])

# m4: logistic regression on the scaled preprocessing.
# The parameter dict defined in this notebook is `log_params`; the previous
# reference to `logreg_params` raised NameError on a fresh kernel.
m4 = Pipeline(steps=[
    ('prep', preprcesses_svc),
    ('model', LogisticRegression(**log_params))
])

# Meta learner for stacking: a calibrated LinearSVC.
# Alternative tried earlier: LGBMClassifier(max_depth=3, learning_rate=0.5, n_estimators=200)
meta_model = CalibratedClassifierCV(
    estimator=LinearSVC(C=0.1, random_state=42),
    cv=5, method='sigmoid'
)

In [64]:
# Target column and feature matrix (row id and target removed from the features).
y = train['loan_paid_back']
X = train.drop(columns=['id', 'loan_paid_back'])

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Fit the XGBoost pipeline on copies of the training split.
m1.fit(X=X_train.copy(), y=y_train.copy())

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,True


In [37]:
# Fit the LightGBM pipeline on copies of the training split.
m2.fit(X=X_train.copy(), y=y_train.copy())

[LightGBM] [Info] Number of positive: 379692, number of negative: 95503
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1385
[LightGBM] [Info] Number of data points in the train set: 475195, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799024 -> initscore=1.380203
[LightGBM] [Info] Start training from score 1.380203


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.03
,n_estimators,3000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [65]:
# Fit the calibrated LinearSVC pipeline on copies of the training split.
m3.fit(X=X_train.copy(), y=y_train.copy())

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True

0,1,2
,estimator,LinearSVC(C=0...ndom_state=42)
,method,'sigmoid'
,cv,5
,n_jobs,
,ensemble,'auto'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,0.5
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [89]:
# Fit the logistic-regression pipeline on copies of the training split.
m4.fit(X=X_train.copy(), y=y_train.copy())


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.3
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,500


In [71]:
# Hold-out scores for every base model: column 1 of predict_proba.
# All four prediction lines must be active: the AUC lines below read
# m1_pred..m3_pred, which were previously commented out and therefore only
# existed as leftover kernel state (NameError after Restart & Run All).
m1_pred = m1.predict_proba(X_val)[:, 1]
m2_pred = m2.predict_proba(X_val)[:, 1]
m3_pred = m3.predict_proba(X_val)[:, 1]
m4_pred = m4.predict_proba(X_val)[:, 1]

# ROC AUC of each model on the validation split.
m1_auc = roc_auc_score(y_val, m1_pred)
m2_auc = roc_auc_score(y_val, m2_pred)
m3_auc = roc_auc_score(y_val, m3_pred)
m4_auc = roc_auc_score(y_val, m4_pred)

print(f"XGBC: {m1_auc}")
print(f"LightGBM: {m2_auc}")
print(f"SVC: {m3_auc}")
print(f"Logistic regression: {m4_auc}")

XGBC: 0.9226398859901933
LightGBM: 0.9232522685371819
SVC: 0.9115697769711211
Logistic regression: 0.9117134478416623


In [96]:
# Full training table (no hold-out) for the final stacked model.
y = train["loan_paid_back"]
X_dummies = train.drop(columns=["id", "loan_paid_back"])

# Named base pipelines whose predict_proba outputs feed the meta model.
base_estimators = [
    ("xgb", m1),
    ("lgm", m2),
    ("svc", m3),
    ("log_reg", m4),
]

stack = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=4,
    stack_method="predict_proba",
    n_jobs=4,
    passthrough=False,  # meta model is trained on base predictions only
)
stack.fit(X_dummies, y)

[LightGBM] [Info] Number of positive: 355871, number of negative: 89625
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.125709 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1384
[LightGBM] [Info] Number of data points in the train set: 445496, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798820 -> initscore=1.378934
[LightGBM] [Info] Start training from score 1.378934
[LightGBM] [Info] Number of positive: 355871, number of negative: 89625
[LightGBM] [Info] Number of positive: 355870, number of negative: 89625
[LightGBM] [Info] Number of positive: 355870, number of negative: 89625
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.167890 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




0,1,2
,estimators,"[('xgb', ...), ('lgm', ...), ...]"
,final_estimator,CalibratedCla...dom_state=42))
,cv,4
,stack_method,'predict_proba'
,n_jobs,4
,passthrough,False
,verbose,0

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,True

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.03
,n_estimators,3000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True

0,1,2
,estimator,LinearSVC(C=0...ndom_state=42)
,method,'sigmoid'
,cv,5
,n_jobs,
,ensemble,'auto'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,0.5
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.3
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,500

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,0.1
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [193]:
# Feature selector driven by random-forest importances: with threshold="median",
# features whose importance is at or above the median importance are kept.
c_f = SelectFromModel(
    RandomForestClassifier(
        n_estimators=200,
        n_jobs=-1,
        random_state=42
    ),
    threshold="median"
)

# Display the selector. The cell previously ended with the undefined name `c`.
c_f

SelectFromModel(estimator=RandomForestClassifier(n_estimators=200, n_jobs=-1,
                                                 random_state=42),
                threshold='median')


In [99]:
# Score the test set with the stacked model: column 1 of predict_proba.
X_res = test.drop(columns=['id'])
pred_prob = stack.predict_proba(X_res)[:, 1]

# Two-column submission frame: the test ids plus the predicted probability.
submission = test[["id"]].assign(loan_paid_back=pred_prob)

submission.to_csv("submission.csv", index=False)



In [100]:
# Upload submission.csv to the playground-series-s5e11 competition through the Kaggle CLI.
# NOTE: every execution of this cell sends a new submission with the message "Fourth try".
!kaggle competitions submit -c playground-series-s5e11 -f submission.csv -m "Fourth try"


100%|███████████████████████████████████████| 6.33M/6.33M [00:21<00:00, 307kB/s]
Successfully submitted to Predicting Loan Payback

In [115]:
# Pull individual fitted base estimators out of the stacking model by position.
# NOTE(review): indices 0, 1, 2 presumably follow the order of the `estimators`
# list passed to StackingClassifier (xgb, lgm, svc) — confirm.
model1 = stack.estimators_[0]
model2 = stack.estimators_[1]
model3 = stack.estimators_[2]