In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e10/sample_submission.csv
/kaggle/input/playground-series-s4e10/train.csv
/kaggle/input/playground-series-s4e10/test.csv
/kaggle/input/loan-approval-prediction/credit_risk_dataset.csv


In [2]:
from scipy.stats import norm, skew
from scipy import stats
import pandas as pd
import numpy as np
import seaborn as sns
color=sns.color_palette()
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import svm

import warnings
warnings.filterwarnings('ignore')


In [None]:
train=pd.read_csv('../input/playground-series-s4e10/train.csv')
test=pd.read_csv('../input/playground-series-s4e10/test.csv')

In [None]:
print("train_size:{}".format(train.shape))
print("test_size:{}".format(test.shape))

In [None]:
train.head(20)

In [None]:
train.info()

In [None]:
test.info()

In [None]:
test.head(20)

In [None]:
original = pd.read_csv('/kaggle/input/loan-approval-prediction/credit_risk_dataset.csv')

In [None]:
original.head(20)

In [None]:
original.info()

I noticed that loan_int_rate is strongly correlated with loan_grade, so I fill each missing loan_int_rate with the mean rate of its loan_grade.

In [None]:
# Impute missing interest rates with the mean rate of the row's loan_grade.
original['loan_int_rate'] = pd.to_numeric(original['loan_int_rate'], errors='coerce')
# transform('mean') broadcasts every grade's mean back onto its own rows, so a single
# vectorised fillna replaces the per-grade loop. Rows whose loan_grade is itself
# missing get no group mean and therefore stay NaN, exactly as before.
grade_mean_rate = original.groupby('loan_grade')['loan_int_rate'].transform('mean')
original['loan_int_rate'] = original['loan_int_rate'].fillna(grade_mean_rate)
print(original)
original.to_csv('filled_credit_risk_dataset.csv', index=False)

# Work on the re-loaded copy from here on; `original` keeps its unfilled
# person_emp_length column.
new_original = pd.read_csv('filled_credit_risk_dataset.csv')
new_original.head(100)

In [None]:
# Sanity check: show any rows whose loan_int_rate is still missing after the per-grade fill.
missing_int_rate = new_original[new_original['loan_int_rate'].isna()]
print(missing_int_rate)


I tried to find the correlation between person_emp_length and person_age

In [None]:
# Fit a straight line through a small random sample to eyeball the
# age / employment-length trend.
sampled_data = original.sample(n=12, random_state=42)
# person_emp_length is never imputed in `original` and may still contain missing
# values; np.polyfit cannot produce a usable fit from NaN inputs, so incomplete
# rows are dropped before fitting and plotting.
data = sampled_data[['person_age', 'person_emp_length']].dropna()
plt.figure(figsize=(10, 6))
sns.scatterplot(x='person_age', y='person_emp_length', data=data)
slope, intercept = np.polyfit(data['person_age'], data['person_emp_length'], 1)
line = slope * data['person_age'] + intercept
plt.plot(data['person_age'], line, color='red', label='Regression Line')
# Axes-relative coordinates keep the equation in the top-left corner regardless of data range.
plt.text(0.05, 0.95, f'Line: y = {slope:.2f}x + {intercept:.2f}', 
         transform=plt.gca().transAxes, fontsize=12, verticalalignment='top')

plt.title('Relationship between Person Age and Employment Length')
plt.xlabel('Person Age')
plt.ylabel('Employment Length (Years)')
plt.legend()

plt.grid(True)

plt.show()

In [None]:
# Scatter a larger random sample to look for any relationship between age and
# employment length.
sampled_data = original.sample(n=1000, random_state=42)
data = sampled_data[['person_age', 'person_emp_length']]

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='person_age', y='person_emp_length', ax=ax)

ax.set_title('Relationship between Person Age and Employment Length (Sampled Data)')
ax.set_xlabel('Person Age')
ax.set_ylabel('Employment Length (Years)')
ax.grid(True)

plt.show()

I did not find a clear correlation between person_emp_length and person_age, so I simply fill the missing person_emp_length values with the column mean.

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated in pandas 2.x and does not
# update the parent DataFrame when copy-on-write is enabled.
new_original['person_emp_length'] = new_original['person_emp_length'].fillna(
    new_original['person_emp_length'].mean()
)

In [None]:
new_original.isnull().sum()

In [7]:
# Categorical (string-valued) input columns.
obj_cols = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]

# Numeric input columns.
num_cols = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]

# Full feature list: categorical columns first, then numeric ones.
features = [*obj_cols, *num_cols]

In [None]:
# Adversarial-validation target: 0 marks a competition row, 1 marks a row from the
# original credit-risk dataset.
target = 'label'
train[target] = 0
new_original[target] = 1

# Columns carried into the adversarial split: every feature plus the source label.
all_cols = [*features, target]

In [None]:
train[all_cols].shape, new_original[all_cols].shape

In [None]:
def create_adversarial_data(train, new_original, cols, N_val=50000, random_state=42):
    """Pool two datasets and split the pool into adversarial train / hold-out sets.

    Parameters
    ----------
    train, new_original : pandas.DataFrame
        The two datasets to compare; both must contain every column in ``cols``.
    cols : list of str
        Columns to keep in the pooled data.
    N_val : int, default 50000
        Number of pooled rows sampled (without replacement) into the hold-out set.
    random_state : int or None, default 42
        Seed for the hold-out sample so the split is reproducible; pass ``None``
        for a different split on every call.

    Returns
    -------
    (adversarial_train, adversarial_test) : tuple of pandas.DataFrame
        Disjoint frames that together contain every pooled row exactly once.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when ``N_val`` exceeds the pooled row count.
    """
    # ignore_index=True gives every pooled row a unique label. Without it both inputs
    # keep their own 0..n labels, and dropping the sampled labels below would also
    # remove the same-numbered rows of the other dataset from the training part.
    combined_data = pd.concat([train[cols], new_original[cols]], axis=0, ignore_index=True)
    adversarial_test = combined_data.sample(N_val, replace=False, random_state=random_state)
    adversarial_train = combined_data.drop(adversarial_test.index)
    return adversarial_train, adversarial_test

# Split the pooled competition + original rows using the default hold-out size,
# then display the shapes of both parts.
adversarial_train, adversarial_test = create_adversarial_data(train, new_original, all_cols)
adversarial_train.shape, adversarial_test.shape

In [None]:
import catboost
from catboost import Pool, CatBoostClassifier

def _adversarial_pool(frame):
    """Wrap one adversarial split in a CatBoost Pool (features, source label, categorical columns)."""
    return Pool(
        data=frame[features],
        label=frame[target],
        cat_features=obj_cols
    )


train_data = _adversarial_pool(adversarial_train)
holdout_data = _adversarial_pool(adversarial_test)

# Small, quick model: the only goal is to see whether the two sources can be told apart.
catboost_params = dict(
    iterations=100,
    eval_metric='AUC',
    od_type='Iter',
    od_wait=50,
    random_seed=50,
    verbose=0,
)

model = CatBoostClassifier(**catboost_params)
_ = model.fit(train_data, eval_set=holdout_data)

In [None]:
# Set up ROC Curve plot
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, log_loss
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

def plot_roc(y_trues, y_preds, labels, x_max=1.0):
    """Draw one ROC curve per (y_true, y_pred) pair on a single new figure.

    Parameters
    ----------
    y_trues : sequence of array-like
        True binary labels, one entry per curve.
    y_preds : sequence of array-like
        Predicted scores, aligned with ``y_trues``.
    labels : sequence of str
        Legend text per curve; the curve's AUC is appended to it.
    x_max : float, default 1.0
        Upper limit of the false-positive-rate axis.
    """
    fig, ax = plt.subplots()

    for idx, (truth, scores) in enumerate(zip(y_trues, y_preds)):
        false_pos, true_pos, _ = roc_curve(truth, scores)
        auc_value = roc_auc_score(truth, scores)
        legend_text = f'{labels[idx]}; AUC={auc_value:.3f}'
        ax.plot(false_pos, true_pos, label=legend_text, marker='o', markersize=1)

    # Dashed diagonal: the ROC curve of a classifier that guesses at random.
    diagonal = np.linspace(0, 1, 20)
    ax.plot(diagonal, diagonal, linestyle='--', color='grey')

    ax.legend()
    ax.grid()
    ax.set_title('ROC curve')
    ax.set_xlim([-0.01, x_max])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    
# Plot the ROC curve of the baseline adversarial classifier on the hold-out pool.
# An AUC close to 0.5 would mean the two data sources cannot be told apart.
plot_roc(
    y_trues=[holdout_data.get_label()],
    y_preds=[model.predict_proba(holdout_data)[:,1]],
    labels=['Baseline']
)

In [None]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

import shap
shap.initjs()

# Define function to plot feature importance
def plot_importances(model, holdout_data, features):
    """Show a bar chart of SHAP feature importances for a fitted CatBoost model.

    Parameters
    ----------
    model : fitted CatBoost model
        Must support ``get_feature_importance(..., type='ShapValues')``.
    holdout_data : catboost.Pool
        Data on which the SHAP values are computed.
    features : list of str
        Feature names shown on the plot, in the Pool's column order.
    """
    shap_values = model.get_feature_importance(holdout_data, type='ShapValues')
    # The last column holds the expected (base) value rather than a feature
    # contribution, so it is dropped before plotting.
    shap_values = shap_values[:, :-1]
    shap.summary_plot(shap_values, holdout_data, feature_names=features, plot_type='bar')
    
# Plot feature importance: the top features are the ones that best separate
# competition rows from original-dataset rows.
plot_importances(model, holdout_data, features)

In [None]:
# Retrain the adversarial classifier with 'person_income' excluded, to see how much
# of the train-vs-original separation that single feature explains.
updated_params = {**catboost_params, 'ignored_features': ['person_income']}
model2 = CatBoostClassifier(**updated_params)
model2.fit(train_data, eval_set=holdout_data, plot=True, verbose=False)

In [None]:
# Plot updated ROC Curve plot: baseline versus the model retrained without
# 'person_income'. Both are scored on the same hold-out labels, hence the repeated list.
plot_roc(
    y_trues=[holdout_data.get_label()]*2,
    y_preds=[model.predict_proba(holdout_data)[:, 1], model2.predict_proba(holdout_data)[:, 1]],
    labels=['Baseline', "Removing 'person_income'"] 
)

In [None]:
# Retrain the adversarial classifier with 'loan_int_rate' excluded. Note that this
# rebinds `model2`, replacing the previously trained comparison model.
updated_params = {**catboost_params, 'ignored_features': ['loan_int_rate']}
model2 = CatBoostClassifier(**updated_params)
model2.fit(train_data, eval_set=holdout_data, plot=True, verbose=False)

In [None]:
# Plot updated ROC Curve plot: baseline versus the model retrained without
# 'loan_int_rate' (`model2` as rebound by the retraining cell just above).
plot_roc(
    y_trues=[holdout_data.get_label()]*2,
    y_preds=[model.predict_proba(holdout_data)[:, 1], model2.predict_proba(holdout_data)[:, 1]],
    labels=['Baseline', "Removing 'loan_int_rate'"] 
)

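The adversarial classifier reaches an AUC of about 0.6 to 0.7. An AUC close to 0.5 would mean the classifier cannot tell the two datasets apart, i.e. they share the same distribution. Here the AUC is clearly above 0.5 (although far from 1), so the train data and the original data do not have the same distribution. I therefore drop the original data and continue with the competition data only.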

In [3]:
train=pd.read_csv('../input/playground-series-s4e10/train.csv')
test=pd.read_csv('../input/playground-series-s4e10/test.csv')

In [None]:
def preprocess(df):
    """Add engineered features to ``df`` in place and cast features to categoricals.

    New columns: ``loan_to_income``, ``age_income_interaction``,
    ``loan_to_emp_length_ratio``, ``monthly_debt``, ``dti_ratio`` and ``risk_flag``.
    The engineered columns, and several existing ones, are converted to string-backed
    categoricals so that every distinct value becomes its own category level.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan data with the raw competition columns. Modified in place.

    Returns
    -------
    None
    """
    def as_category(values):
        """Cast via string so each distinct value becomes one category level."""
        return values.astype('string').astype('category')

    # Missing values arrive from read_csv as NaN, not as the string 'None', so
    # replacing 'None' alone never imputed anything. Any literal 'None' is mapped to
    # NaN and the gaps are then filled with the column's own mean, which also removes
    # the dependency on a global `original` DataFrame being defined.
    emp_length = df['person_emp_length'].replace({'None': np.nan}).astype('float')
    emp_length = emp_length.fillna(emp_length.mean())
    int_rate = df['loan_int_rate'].replace({'None': np.nan}).astype('float')
    int_rate = int_rate.fillna(int_rate.mean())

    df['loan_to_income'] = as_category((df['loan_amnt'] / df['person_income']) - df['loan_percent_income'])
    df['age_income_interaction'] = as_category(df['person_age'] * df['person_income'])
    df['loan_to_emp_length_ratio'] = as_category(df['loan_amnt'] / emp_length)

    monthly_income = df['person_income'] / 12
    # NOTE(review): the rate is added to 1 as-is; if loan_int_rate is expressed in
    # percent this overstates the debt - confirm the intended unit.
    df['monthly_debt'] = df['loan_amnt'] * (1 + int_rate) / 12
    df['dti_ratio'] = as_category(df['monthly_debt'] / monthly_income)
    df['monthly_debt'] = as_category(df['monthly_debt'])

    # Flag borrowers with a previous default on file AND a loan grade of C, D or E.
    df['risk_flag'] = (np.where((df['cb_person_default_on_file'] == 'Y') & (df['loan_grade'].isin(['C', 'D', 'E'])), 1, 0))
    df['risk_flag'] = df['risk_flag'].astype('category')

    for col in ('person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'):
        df[col] = df[col].astype('category')

    # The raw columns keep whatever missing values they had; only the derived
    # features above use the imputed copies.
    df['person_emp_length'] = as_category(df['person_emp_length'])
    df['loan_int_rate'] = as_category(df['loan_int_rate'] * 100)
    df['loan_percent_income'] = as_category(df['loan_percent_income'] * 100)

# preprocess() modifies each frame in place and returns nothing.
preprocess(test)
preprocess(train)
# Split the target off so that `train` holds features only.
y = train.pop('loan_status')

In [None]:
import optuna
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from optuna.samplers import TPESampler

def objective(trial):
    """Optuna objective: mean 5-fold validation ROC-AUC of a CatBoost classifier.

    Reads the notebook-level ``train`` feature frame and ``y`` target. Learning rate,
    tree depth and L2 leaf regularisation are sampled from ``trial``; every other
    setting is fixed.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the tuned hyper-parameters.

    Returns
    -------
    float
        Mean ROC-AUC over the five stratified validation folds.
    """
    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'iterations': 1000,
        'depth': trial.suggest_int('depth', 5, 10),
        'random_strength': 0,
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-1, 1.0, log=True),
        'task_type': 'GPU',
        'random_seed': 50,
        'verbose': False
    }

    model = CatBoostClassifier(**params)
    cv = StratifiedKFold(5, shuffle=True, random_state=0)
    # Every column of `train` is passed to CatBoost as a categorical feature.
    cat_cols = train.columns.values
    scores = []

    # Cross-validation loop
    for train_idx, val_idx in cv.split(train, y):
        train_train_fold, train_val_fold = train.iloc[train_idx], train.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        train_train_pool = Pool(train_train_fold, y_train_fold, cat_features=cat_cols)
        train_valid_pool = Pool(train_val_fold, y_val_fold, cat_features=cat_cols)

        # fit() takes the training data as its first positional argument; it has no
        # `train_pool` keyword, so passing one raises TypeError.
        model.fit(train_train_pool, eval_set=train_valid_pool, verbose=False, early_stopping_rounds=200)
        val_pred = model.predict_proba(train_valid_pool)[:, 1]
        scores.append(roc_auc_score(y_val_fold, val_pred))

    return np.mean(scores)

# Optuna
sqlite_db = "sqlite:///catboost.db"
study_name = "catboost"
# Set to True to re-run the (slow) hyper-parameter search; False reuses the values
# recorded below.
optimize = False

# Settings that are not tuned. study.best_params only contains the sampled
# hyper-parameters, so these must be merged back in; otherwise the final model would
# silently fall back to library defaults for eval_metric, iterations, seed, etc.
fixed_catboost_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'iterations': 1000,
    'random_strength': 0,
    'task_type': 'GPU',
    'random_seed': 50,
    'verbose': False
}

if optimize:
    study = optuna.create_study(storage=sqlite_db, study_name=study_name, 
                                sampler=TPESampler(n_startup_trials=35, multivariate=True, seed=0),
                                direction="maximize", load_if_exists=True)

    study.optimize(objective, n_trials=100)
    print(f"Best optimized roc-auc: {study.best_value:0.5f}")
    print(f"Best hyperparameters: {study.best_params}")
    catboost_params = {**fixed_catboost_params, **study.best_params}
else:
    # Tuned values hard-coded in the notebook (learning rate, depth, L2 regularisation).
    catboost_params = {
        **fixed_catboost_params,
        'learning_rate': 0.08114394459649094,
        'depth': 6,
        'l2_leaf_reg': 0.7047064221215757,
    }

In [None]:
# Force CPU training. The override is applied to a copy so that `catboost_params` is
# left untouched and this cell gives the same result when re-run.
cpu_params = {**catboost_params, 'task_type': 'CPU'}

cv = StratifiedKFold(5, shuffle=True, random_state=0)
cv_splits = cv.split(train, y)
scores = []
test_preds = []

# Every column is passed to CatBoost as a categorical feature.
cat_cols = train.columns.values
train_test_pool = Pool(test, cat_features=cat_cols)

# Cross-validation loop
for i, (train_idx, val_idx) in enumerate(cv_splits):
    # cv.split yields positional indices, so select with .iloc; .loc would only be
    # correct while the frame happens to carry a default 0..n-1 index.
    train_train_fold, train_val_fold = train.iloc[train_idx], train.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create Pool objects for training and validation
    train_train_pool = Pool(train_train_fold, y_train_fold, cat_features=cat_cols)
    train_valid_pool = Pool(train_val_fold, y_val_fold, cat_features=cat_cols)
    
    # A fresh classifier per fold, using the CPU parameter set
    model = CatBoostClassifier(**cpu_params)
    
    # Fit the model with early stopping on the validation fold
    model.fit(train_train_pool, eval_set=train_valid_pool, verbose=False, early_stopping_rounds=200)
    
    # Predict probabilities and calculate ROC AUC score
    val_pred = model.predict_proba(train_valid_pool)[:, 1]
    score = roc_auc_score(y_val_fold, val_pred)
    
    # Store validation scores and test set predictions
    scores.append(score)
    test_pred = model.predict_proba(train_test_pool)[:, 1]
    test_preds.append(test_pred)
    
    print(f'Fold {i + 1} roc_auc_score: {score:.3f}')

print(f'Cross-validated roc_auc_score: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')
print(f'Max roc_auc_score score: {np.max(scores):.3f}')
print(f'Min roc_auc_score score: {np.min(scores):.3f}')

In [None]:
# Fill the sample submission with the per-row mean of the fold models' test
# probabilities and write it out.
submission = pd.read_csv('../input/playground-series-s4e10/sample_submission.csv')
submission['loan_status'] = np.mean(test_preds, axis=0)
submission.to_csv('submission.csv', index=False)
submission.head()

another try

In [8]:
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold, cross_val_score, cross_val_predict, RepeatedKFold
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score

In [10]:
# Categorical (string-valued) input columns.
obj_cols = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]

# Numeric input columns.
num_cols = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]

def preprocess_data(df, obj_cols, train=False):
    """Set column dtypes and cap outliers on ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan data; modified in place.
    obj_cols : list of str
        Columns to convert to the pandas ``category`` dtype.
    train : bool, default False
        When True, ``df`` must contain ``loan_status``, which is cast to int8.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in obj_cols:
        df[name] = df[name].astype('category')

    # 32-bit integers are ample for ages and credit-history lengths;
    # age is additionally capped at 80.
    df['person_age'] = df['person_age'].astype('int32').clip(upper=80)
    df['cb_person_cred_hist_length'] = df['cb_person_cred_hist_length'].astype('int32')
    # Employment length is capped at 65 years.
    df['person_emp_length'] = df['person_emp_length'].clip(upper=65)

    if not train:
        return df

    # The target only takes the values 0 and 1, so the smallest integer type suffices.
    df['loan_status'] = df['loan_status'].astype('int8')
    return df
        
# Start from freshly loaded raw data. The `preprocess` cell earlier in the notebook
# turns these frames into string categoricals and pops `loan_status` out of `train`,
# so on a top-to-bottom run preprocess_data cannot operate on the existing frames.
train = preprocess_data(pd.read_csv('../input/playground-series-s4e10/train.csv'), obj_cols, train=True)
test = preprocess_data(pd.read_csv('../input/playground-series-s4e10/test.csv'), obj_cols)

RandomForestClassifier model

In [11]:
# Separate the target from the predictors.
Y = train['loan_status']
X = train.drop(columns=['loan_status'])

# Five stratified folds, a single repeat, fixed seed.
skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)


def _one_hot(frame):
    """One-hot encode the categorical columns of ``frame``, dropping each first level."""
    return pd.get_dummies(frame[obj_cols], drop_first=True, dtype='int8')


# Left-align on the training columns: dummy columns missing from the test set are
# added as zeros, and test-only dummy columns are discarded.
train_dummies, test_dummies = _one_hot(X).align(_one_hot(test), join='left', axis=1, fill_value=0)

# Non-categorical columns plus the dummy columns form the random-forest inputs.
RF_train = pd.concat([X.drop(columns=obj_cols), train_dummies], axis=1)
RF_test = pd.concat([test.drop(columns=obj_cols), test_dummies], axis=1)

In [None]:
%%time

from sklearn.base import clone

# Randomised hyper-parameter search for the random forest.
param_dist = {
    'n_estimators': [100, 150],
    'max_depth': [10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# The grid only contains 2 * 2 * 2 * 2 = 16 combinations, so asking for 50 iterations
# merely triggers a warning; cap n_iter at the grid size. The forest is seeded so the
# search is reproducible.
n_candidates = int(np.prod([len(values) for values in param_dist.values()]))
random_search = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=-1, random_state=1),
    param_dist,
    n_iter=min(50, n_candidates),
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=1,
)
random_search.fit(RF_train, Y)

print("Best parameters: ", random_search.best_params_)

# best_estimator_ has been refit on ALL training rows. It is used as a parameter
# template for the folds and for the feature importances, never for fold scoring.
best_rf_md = random_search.best_estimator_

scores, rf_oof_preds, rf_test_preds = [], [], []

for i, (train_index, test_index) in enumerate(skf.split(RF_train, Y)):
    print(f"Fold {i} ")
    
    X_train, X_test = RF_train.iloc[train_index], RF_train.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    
    # Train an unfitted copy on this fold's training part only. Scoring the model
    # that was refit on every row would evaluate it on data it has already seen and
    # report an in-sample, not out-of-fold, ROC-AUC.
    fold_rf_md = clone(best_rf_md).fit(X_train, y_train)
    preds = fold_rf_md.predict_proba(X_test)[:, 1]
    
    oof_preds = pd.DataFrame({'y': y_test.values, 'rf_preds': preds, 'fold': i})
    rf_oof_preds.append(oof_preds)
    
    score = roc_auc_score(y_test, preds)
    print(f"The oof ROC-AUC score is {score}")
    scores.append(score)
    
    test_preds = pd.DataFrame({'rf_preds': fold_rf_md.predict_proba(RF_test)[:, 1], 'fold': i})
    rf_test_preds.append(test_preds)

rf_oof_score = np.mean(scores)
rf_std = np.std(scores)
print(f"The average oof ROC-AUC score of the RandomForest model is {rf_oof_score}")
print(f"The std oof ROC-AUC score of the RandomForest model is {rf_std}")

# Top-10 impurity-based importances of the model fitted on all rows. A dedicated name
# is used so the notebook-level `features` list is not overwritten.
importances = best_rf_md.feature_importances_
rf_feature_names = RF_train.columns
important_features = pd.Series(importances, index=rf_feature_names).sort_values(ascending=False)
print(important_features.head(10))  


LightGBM model

In [16]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier

# Candidate values for the random search. The random candidates are drawn from a
# seeded generator so the search space is identical on every run.
# NOTE(review): the "Best parameters found" output recorded with this cell came from
# unseeded draws, so a re-run will not reproduce those exact reg_alpha / reg_lambda values.
search_rng = np.random.default_rng(42)
lgb_param_dist = {
    'learning_rate': np.linspace(0.09, 0.1, 100),   # 100 evenly spaced values in [0.09, 0.1]
    'n_estimators': np.arange(50, 201, 10),         # 50 to 200 in steps of 10
    'max_depth': np.arange(10, 21, 1),              # integers 10 to 20
    'reg_alpha': search_rng.uniform(0, 0.5, 100),   # 100 random draws from [0, 0.5)
    'reg_lambda': search_rng.uniform(0, 0.5, 100),  # 100 random draws from [0, 0.5)
    'num_leaves': np.arange(20, 41, 1),             # integers 20 to 40
    'colsample_bytree': np.linspace(0.3, 0.7, 100)  # 100 evenly spaced values in [0.3, 0.7]
}

# Random search: evaluate 50 randomly chosen parameter combinations with 3-fold CV.
random_search = RandomizedSearchCV(
    LGBMClassifier(n_jobs=-1, verbose=-1),
    lgb_param_dist,
    n_iter=50,  # number of random combinations to try
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    random_state=42
)

# Run the search
random_search.fit(X, Y)

# Show the best parameters
print("Best parameters found: ", random_search.best_params_)


Best parameters found:  {'reg_lambda': 0.4024415902150582, 'reg_alpha': 0.3457054016562055, 'num_leaves': 21, 'n_estimators': 180, 'max_depth': 13, 'learning_rate': 0.09898989898989899, 'colsample_bytree': 0.5343434343434343}


In [17]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

X = train.drop(columns=['loan_status'], axis=1)
Y = train['loan_status']

# Hyper-parameters hard-coded from a previous random search.
best_lgb_params = {
    'learning_rate': 0.09898989898989899,
    'n_estimators': 180,
    'max_depth': 13,
    'reg_alpha': 0.3457054016562055,
    'reg_lambda': 0.4024415902150582,
    'num_leaves': 21,
    'colsample_bytree': 0.5,
    'verbose': -1,
    'n_jobs': -1
}

scores, lgb_oof_preds, lgb_test_preds = list(), list(), list()

for i, (train_index, test_index) in enumerate(skf.split(X, Y)):
    print(f" Fold {i} ")
    
    # skf.split yields positional indices, so features and target are both selected
    # with .iloc; plain Y[train_index] is label-based and only matches while Y keeps
    # a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    
    lgb_md = LGBMClassifier(**best_lgb_params)
    
    # NOTE(review): early stopping monitors the same fold that is scored below, so
    # the reported out-of-fold AUC is slightly optimistic.
    lgb_md.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],  
        eval_metric='auc',            
        callbacks=[early_stopping(10), log_evaluation(0)]  
    )

    preds = lgb_md.predict_proba(X_test)[:, 1]
    
    # Out-of-fold predictions for this fold, tagged with the fold number.
    oof_preds = pd.DataFrame()
    oof_preds['y'] = y_test.values
    oof_preds['lgb_preds'] = preds
    oof_preds['fold'] = i
    lgb_oof_preds.append(oof_preds)
    
    score = roc_auc_score(y_test, preds)
    print(f"The oof ROC-AUC score is {score}")
    scores.append(score)
    
    # Test-set predictions of this fold's model.
    test_preds = pd.DataFrame()
    test_preds['lgb_preds'] = lgb_md.predict_proba(test)[:, 1]
    test_preds['fold'] = i
    lgb_test_preds.append(test_preds)

lgb_oof_score = np.mean(scores)
lgb_std = np.std(scores)
print(f"The average oof ROC-AUC score of the LGBM model is {lgb_oof_score}")
print(f"The std oof ROC-AUC score of the LGBM model is {lgb_std}")


 Fold 0 
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[110]	valid_0's auc: 0.95714	valid_0's binary_logloss: 0.146933
The oof ROC-AUC score is 0.9571397021048864
 Fold 1 
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[155]	valid_0's auc: 0.955991	valid_0's binary_logloss: 0.148183
The oof ROC-AUC score is 0.9559909706385022
 Fold 2 
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[180]	valid_0's auc: 0.958587	valid_0's binary_logloss: 0.143322
The oof ROC-AUC score is 0.9585867930110549
 Fold 3 
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[177]	valid_0's auc: 0.957352	valid_0's binary_logloss: 0.151542
The oof ROC-AUC score is 0.9573515063520438
 Fold 4 
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[

CatBoost

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
X = train.drop(columns=['loan_status'], axis=1)
Y = train['loan_status']

# Columns with the pandas category dtype are passed to CatBoost as categorical features.
cat_features = X.select_dtypes(include=['category']).columns.tolist()


# Candidate values for the CatBoost random search. The random candidates are drawn
# from a seeded generator so the search space is identical on every run.
search_rng = np.random.default_rng(42)
cat_param_dist = {
    'learning_rate': np.linspace(0.09, 0.1, 100),     # 100 evenly spaced values in [0.09, 0.1]
    'depth': np.arange(8, 16, 1),                     # integers 8 to 15
    'l2_leaf_reg': search_rng.uniform(0, 5, 100),     # 100 random draws from [0, 5)
    'iterations': np.arange(100, 201, 50),            # 100, 150, 200
    'border_count': np.arange(32, 129, 32),           # 32, 64, 96, 128
    'random_strength': search_rng.uniform(1, 3, 100)  # 100 random draws from [1, 3)
}

# Random search: evaluate 50 randomly chosen parameter combinations with 3-fold CV.
random_search = RandomizedSearchCV(
    estimator=CatBoostClassifier(verbose=0),  
    param_distributions=cat_param_dist,
    n_iter=50,  # number of random combinations to try
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1
)

# Fit, passing the list of categorical columns through to CatBoost
random_search.fit(X, Y, cat_features=cat_features)

# Show the best parameters
print("Best parameters: ", random_search.best_params_)


In [20]:
# Hard-coded CatBoost hyper-parameters.
# NOTE(review): several values lie outside the ranges searched in the cell above
# (e.g. depth 5 vs. 8-15, iterations 300 vs. 100-200), so they presumably come from
# a different search run - confirm.
best_cat_params = {
    'random_strength': 1.781134720536659,
    'learning_rate': 0.07818181818181819,
    'l2_leaf_reg': 2.3636726248162034,
    'iterations': 300,
    'depth': 5,
    'border_count': 96,
    'verbose': 0,  # suppress training output
}

# Per-fold scores, out-of-fold predictions and test predictions
scores, cat_oof_preds, cat_test_preds = list(), list(), list()

# Cross-validation
for i, (train_index, test_index) in enumerate(skf.split(X, Y)):
    print(f"Fold {i} ")
    
    # skf.split yields positional indices, so features and target are both selected
    # with .iloc; plain Y[train_index] is label-based and only matches while Y keeps
    # a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    
    # Fresh model per fold with the fixed parameters
    cat_md = CatBoostClassifier(**best_cat_params)
    
    # Train with early stopping on the validation fold
    cat_md.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],  
        cat_features=cat_features,  # categorical column names
        early_stopping_rounds=10,   # stop after 10 rounds without improvement
        use_best_model=True         # keep the best iteration
    )
    
    # Predict on the validation fold
    preds = cat_md.predict_proba(X_test)[:, 1]
    
    # Store the out-of-fold predictions
    oof_preds = pd.DataFrame()
    oof_preds['y'] = y_test.values
    oof_preds['cat_preds'] = preds
    oof_preds['fold'] = i
    cat_oof_preds.append(oof_preds)
    
    # ROC-AUC on the validation fold
    score = roc_auc_score(y_test, preds)
    print(f"The oof ROC-AUC score is {score}")
    scores.append(score)
    
    # Store this fold's test-set predictions as well
    test_preds = pd.DataFrame()
    test_preds['cat_preds'] = cat_md.predict_proba(test)[:, 1]
    test_preds['fold'] = i
    cat_test_preds.append(test_preds)

# Mean and standard deviation of the fold scores. The fold count is taken from the
# number of scores collected instead of a hard-coded "10", which did not match the
# splitter actually used.
cat_oof_score = np.mean(scores)
cat_std = np.std(scores)
print(f"The {len(scores)}-fold average oof ROC-AUC score of the CatBoost model is {cat_oof_score}")
print(f"The {len(scores)}-fold std oof ROC-AUC score of the CatBoost model is {cat_std}")


Fold 0 
The oof ROC-AUC score is 0.9509657690285996
Fold 1 
The oof ROC-AUC score is 0.9491100114117128
Fold 2 
The oof ROC-AUC score is 0.9479355634094174
Fold 3 
The oof ROC-AUC score is 0.9478491272748271
Fold 4 
The oof ROC-AUC score is 0.9422702462655959
The 10-fold average oof ROC-AUC score of the CatBoost model is 0.9476261434780306
The 10-fold std oof ROC-AUC score of the CatBoost model is 0.0029046879446977883


In [None]:
X = train.drop(columns=['loan_status'], axis=1)
Y = train['loan_status']

# Columns with the pandas category dtype are passed to CatBoost as categorical features.
cat_features = X.select_dtypes(include=['category']).columns.tolist()

# Discrete candidate values for the random search.
cat_param_dist = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10],             
    'l2_leaf_reg': [1, 3, 5, 7, 9],     
    'iterations': [100, 200, 300],      
    'border_count': [32, 64, 128],      
    'random_strength': [1, 2, 3]       
}

random_search = RandomizedSearchCV(
    estimator=CatBoostClassifier(verbose=0),  
    param_distributions=cat_param_dist,
    n_iter=15,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X, Y, cat_features=cat_features)

print("Best parameters: ", random_search.best_params_)

# Re-evaluate the best configuration with the notebook's fold splitter.
best_cat_params = random_search.best_params_
scores, cat_oof_preds, cat_test_preds = list(), list(), list()

for i, (train_index, test_index) in enumerate(skf.split(X, Y)):
    print(f"Fold {i} ")
    
    # skf.split yields positional indices, so features and target are both selected
    # with .iloc; plain Y[train_index] is label-based and only matches while Y keeps
    # a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    
    cat_md = CatBoostClassifier(**best_cat_params, verbose=0)
    cat_md.fit(X_train, y_train, eval_set=[(X_test, y_test)], cat_features=cat_features, early_stopping_rounds=10, use_best_model=True)
    
    preds = cat_md.predict_proba(X_test)[:, 1]
    
    # Out-of-fold predictions for this fold, tagged with the fold number.
    oof_preds = pd.DataFrame()
    oof_preds['y'] = y_test.values
    oof_preds['cat_preds'] = preds
    oof_preds['fold'] = i
    cat_oof_preds.append(oof_preds)
    
    score = roc_auc_score(y_test, preds)
    print(f"The oof ROC-AUC score is {score}")
    scores.append(score)
    
    # Test-set predictions of this fold's model.
    test_preds = pd.DataFrame()
    test_preds['cat_preds'] = cat_md.predict_proba(test)[:, 1]
    test_preds['fold'] = i
    cat_test_preds.append(test_preds)

# The fold count is taken from the number of scores collected instead of a
# hard-coded "10", so the message always matches the splitter actually used.
cat_oof_score = np.mean(scores)
cat_std = np.std(scores)
print(f"The {len(scores)}-fold average oof ROC-AUC score of the CatBoost model is {cat_oof_score}")
print(f"The {len(scores)}-fold std oof ROC-AUC score of the CatBoost model is {cat_std}")


In [None]:
# Split the target from the features
X = train.drop(columns=['loan_status'], axis=1)
Y = train['loan_status']

# Columns with the pandas category dtype are passed to CatBoost as categorical features
cat_features = X.select_dtypes(include=['category']).columns.tolist()

# Discrete candidate values for the CatBoost random search
cat_param_dist = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10],             # tree depth
    'l2_leaf_reg': [1, 3, 5, 7, 9],     # L2 regularisation term
    'iterations': [100, 200, 300],      # number of boosting iterations
    'border_count': [32, 64, 128],      # number of bins
    'random_strength': [1, 2, 3]        # strength of the randomness
}

# 10-fold stratified splitter. This rebinds the notebook-level `skf`, so earlier
# cells that are re-run afterwards will also use 10 folds.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Random search (15 sampled combinations, 3-fold CV)
random_search = RandomizedSearchCV(
    estimator=CatBoostClassifier(verbose=0),  # verbose=0 suppresses log output
    param_distributions=cat_param_dist,
    n_iter=15,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1
)

# Fit, passing the categorical columns through to CatBoost
random_search.fit(X, Y, cat_features=cat_features)

# Show the best parameters
print("Best parameters found: ", random_search.best_params_)

# Re-train CatBoost with the best parameters on every fold
best_cat_params = random_search.best_params_
scores, cat_oof_preds, cat_test_preds = list(), list(), list()

for i, (train_index, test_index) in enumerate(skf.split(X, Y)):
    print(f"------------ Working on Fold {i} ------------")
    
    # skf.split yields positional indices, so features and target are both selected
    # with .iloc; plain Y[train_index] is label-based and only matches while Y keeps
    # a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    
    # Fresh CatBoostClassifier per fold
    cat_md = CatBoostClassifier(**best_cat_params, verbose=0)
    cat_md.fit(X_train, y_train, eval_set=[(X_test, y_test)], cat_features=cat_features, early_stopping_rounds=10, use_best_model=True)
    
    preds = cat_md.predict_proba(X_test)[:, 1]
    
    # Out-of-fold predictions for this fold, tagged with the fold number
    oof_preds = pd.DataFrame()
    oof_preds['y'] = y_test.values
    oof_preds['cat_preds'] = preds
    oof_preds['fold'] = i
    cat_oof_preds.append(oof_preds)
    
    score = roc_auc_score(y_test, preds)
    print(f"The oof ROC-AUC score is {score}")
    scores.append(score)
    
    # Test-set predictions of this fold's model
    test_preds = pd.DataFrame()
    test_preds['cat_preds'] = cat_md.predict_proba(test)[:, 1]
    test_preds['fold'] = i
    cat_test_preds.append(test_preds)

cat_oof_score = np.mean(scores)
cat_std = np.std(scores)
print(f"The 10-fold average oof ROC-AUC score of the CatBoost model is {cat_oof_score}")
print(f"The 10-fold std oof ROC-AUC score of the CatBoost model is {cat_std}")
