In [1]:
pip install optuna



In [2]:
pip install optuna-integration[lightgbm]



In [3]:
pip install category_encoders



In [4]:
pip install imbalanced-learn



In [5]:
pip install lightgbm --upgrade



In [6]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import zipfile
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import optuna
from optuna.integration import LightGBMPruningCallback
import category_encoders as ce
from imblearn.over_sampling import SMOTE

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [7]:
# Locations used by the extraction step: destination folder and source archive
extract_dir = '/content/playground-series-s4e10/'
zip_file_path = '/content/playground-series-s4e10.zip'

In [8]:
# Unpack every member of the competition archive into the extraction directory
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

In [9]:
# Read the three competition CSVs from the extraction directory
train_df, test_df, sample_submission_df = (
    pd.read_csv(os.path.join(extract_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


In [10]:
# Use the 'id' column as the index of both datasets.
# The guard makes this cell safe to re-run: once 'id' has become the index it
# is no longer a column, and an unguarded set_index('id') would raise KeyError.
if 'id' in train_df.columns:
    train_df = train_df.set_index('id')
if 'id' in test_df.columns:
    test_df = test_df.set_index('id')

In [11]:
# Column names grouped by type: categorical columns first, then numerical ones
categorical_features = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]
numerical_features = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]


In [12]:
# Feature Engineering: Create new features
def feature_engineering(df):
    """Return a new DataFrame equal to ``df`` plus four ratio features.

    The input frame is left untouched, so re-running cells that display or
    reuse the original frame stays safe.

    Added columns (in this order):
      - ``income_loan_ratio``: ``person_income / loan_amnt``
      - ``emp_age_ratio``: ``person_emp_length / person_age``
      - ``int_rate_income_ratio``: ``loan_int_rate / person_income``
      - ``cred_hist_age_ratio``: ``cb_person_cred_hist_length / person_age``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the six source columns referenced above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four ratio columns appended (or overwritten if
        they already exist).

    Notes
    -----
    Zero denominators are not special-cased; the result for such rows is
    whatever pandas division produces.
    """
    return df.assign(
        # Income to loan amount ratio
        income_loan_ratio=df['person_income'] / df['loan_amnt'],
        # Employment length to age ratio
        emp_age_ratio=df['person_emp_length'] / df['person_age'],
        # Interest rate to income ratio
        int_rate_income_ratio=df['loan_int_rate'] / df['person_income'],
        # Credit history length to age ratio
        cred_hist_age_ratio=df['cb_person_cred_hist_length'] / df['person_age'],
    )

In [13]:
# Add the engineered ratio columns to both the training and the test frame
train_df, test_df = (feature_engineering(frame) for frame in (train_df, test_df))

In [14]:
# Register the engineered columns as numerical features.
# Only names not already present are appended, so re-running this cell does
# not create duplicate entries (duplicates would break the later
# `df[numerical_features] = ...` scaling assignments).
engineered_features = ['income_loan_ratio', 'emp_age_ratio', 'int_rate_income_ratio', 'cred_hist_age_ratio']
numerical_features.extend(
    name for name in engineered_features if name not in numerical_features
)


In [15]:
# Split the training frame into the target column and the remaining feature columns
y = train_df['loan_status']
X = train_df.drop(columns=['loan_status'])


In [16]:
# Target encoder restricted to the categorical columns; it is (re)fitted inside each CV fold
target_enc = ce.TargetEncoder(
    cols=categorical_features,
)


In [17]:
# 5-fold stratified splitter, shuffled with a fixed seed so the folds are repeatable
skf = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)


In [18]:
# Zero-initialised buffers: one out-of-fold prediction per training row,
# and one (fold-averaged) prediction per test row
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(test_df))

In [21]:
# Define objective function for Optuna
def objective(trial):
    """Optuna objective: mean validation ROC-AUC of LightGBM across the CV folds.

    For every split produced by ``skf`` the categorical columns are
    target-encoded (encoder fitted on the training part only), the training
    part is oversampled with SMOTE, the numerical columns are standardised
    with a scaler fitted on the resampled training data, and a LightGBM model
    is trained with early stopping on the validation part.

    The test set is intentionally not transformed here: it is not needed to
    score a trial, and encoding/scaling it in every fold of every trial was
    pure overhead.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and, through the pruning
        callback, to report intermediate validation AUC values.

    Returns
    -------
    float
        Mean of the per-fold validation ROC-AUC scores.

    Raises
    ------
    optuna.TrialPruned
        Propagated from ``LightGBMPruningCallback`` when the pruner stops the trial.
    """
    # Hyperparameters are sampled once per trial; they do not depend on the fold.
    # NOTE(review): LightGBM documents that bagging ('subsample') only takes
    # effect when 'subsample_freq' / 'bagging_freq' is non-zero. None is set
    # here, so the 'subsample' dimension is probably inert -- confirm.
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
        'random_state': 42,
        'verbosity': -1,
        'n_jobs': -1
    }

    aucs = []
    for train_index, valid_index in skf.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_index], X.iloc[valid_index]
        y_train_fold, y_valid_fold = y.iloc[train_index], y.iloc[valid_index]

        # Target Encoding (fit on the training part only to avoid target leakage)
        X_train_fold = target_enc.fit_transform(X_train_fold, y_train_fold)
        X_valid_fold = target_enc.transform(X_valid_fold)

        # Handle class imbalance with SMOTE (training part only)
        sm = SMOTE(random_state=42)
        X_resampled, y_resampled = sm.fit_resample(X_train_fold, y_train_fold)

        # Scale numerical features with statistics from the resampled training data
        scaler = StandardScaler()
        X_resampled[numerical_features] = scaler.fit_transform(X_resampled[numerical_features])
        X_valid_fold[numerical_features] = scaler.transform(X_valid_fold[numerical_features])

        lgb_train = lgb.Dataset(X_resampled, y_resampled)
        lgb_valid = lgb.Dataset(X_valid_fold, y_valid_fold, reference=lgb_train)

        # Use early_stopping and log_evaluation as callbacks.
        # NOTE(review): the pruning callback reports per boosting iteration, and
        # the same iteration numbers recur in every fold -- verify how Optuna
        # treats repeated steps within one trial.
        pruning_callback = LightGBMPruningCallback(trial, 'auc')
        early_stopping_callback = lgb.early_stopping(stopping_rounds=100, verbose=False)
        log_eval_callback = lgb.log_evaluation(period=0)  # Suppress logging during optimization

        callbacks = [pruning_callback, early_stopping_callback, log_eval_callback]

        # Include valid_names to ensure the validation dataset is named 'valid_0'
        gbm = lgb.train(
            param,
            lgb_train,
            num_boost_round=10000,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=['training', 'valid_0'],  # Specify names here
            callbacks=callbacks
        )

        # Score the fold at the best iteration found by early stopping
        y_valid_pred = gbm.predict(X_valid_fold, num_iteration=gbm.best_iteration)
        auc = roc_auc_score(y_valid_fold, y_valid_pred)
        aucs.append(auc)

    return np.mean(aucs)

In [22]:
# Optimize hyperparameters using Optuna.
# The sampler is seeded so that the sequence of suggested hyperparameters is
# repeatable across kernel restarts (TPE is also Optuna's default sampler, so
# only the seed is new here).
study = optuna.create_study(
    direction='maximize',
    study_name='lgbm_classifier',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
[I 2024-10-11 22:24:51,029] Trial 36 finished with value: 0.9551187395563778 and parameters: {'learning_rate': 0.04172052179784612, 'num_leaves': 156, 'max_depth': 16, 'min_child_samples': 43, 'subsample': 0.9226285430581957, 'colsample_bytree': 0.6356060730375682, 'reg_alpha': 0.0038021734543391508, 'reg_lambda': 0.00791883717970021}. Best is trial 3 with value: 0.9567422863786295.
[I 2024-10-11 22:24:52,355] Trial 37 pruned. Trial was pruned at iteration 0.
[I 2024-10-11 22:24:54,374] Trial 38 pruned. Trial was pruned at iteration 0.
[I 2024-10-11 22:24:55,143] Trial 39 pruned. Trial was pruned at iteration 0.
[I 2024-10-11 22:24:55,915] Trial 40 pruned. Trial was pruned at iteration 0.
[I 2024-10-11 22:24:56,701] Trial 41 pruned. Trial was pruned at iteration 1.
[I 2024-10-11 22:25:52,112] Trial 42 finished with value: 0.9560642270484383 and parameters: {'learning_rate': 0.08612813964889202, 'num_leaves': 142, 'max_depth': 8, 'min_child_sampl

In [23]:
# Combine the tuned hyperparameters with the fixed LightGBM settings used during the search
best_params = {
    **study.best_params,
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'random_state': 42,
    'verbosity': -1,
    'n_jobs': -1,
}

In [24]:
# Show the header and the parameter dict on separate lines
print('Best Hyperparameters:', best_params, sep='\n')


Best Hyperparameters:
{'learning_rate': 0.023842940611212175, 'num_leaves': 153, 'max_depth': 8, 'min_child_samples': 28, 'subsample': 0.7773974847368514, 'colsample_bytree': 0.5835053697097471, 'reg_alpha': 0.0016183153093089716, 'reg_lambda': 1.7117443986908192, 'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt', 'random_state': 42, 'verbosity': -1, 'n_jobs': -1}


In [25]:
# Train the model with best hyperparameters and make predictions.
# The prediction buffers are (re)created here because test predictions are
# accumulated with `+=`: without the reset, re-running this cell would add a
# second set of fold averages on top of the first.
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(test_df.shape[0])

for fold, (train_index, valid_index) in enumerate(skf.split(X, y)):
    print(f'Fold {fold + 1}')
    X_train_fold, X_valid_fold = X.iloc[train_index], X.iloc[valid_index]
    y_train_fold, y_valid_fold = y.iloc[train_index], y.iloc[valid_index]

    # Target Encoding (fit on the training part, applied to validation and test)
    X_train_fold = target_enc.fit_transform(X_train_fold, y_train_fold)
    X_valid_fold = target_enc.transform(X_valid_fold)
    X_test_enc = target_enc.transform(test_df)

    # Handle class imbalance with SMOTE (training part only)
    sm = SMOTE(random_state=42)
    X_resampled, y_resampled = sm.fit_resample(X_train_fold, y_train_fold)

    # Scale numerical features with statistics from the resampled training data
    scaler = StandardScaler()
    X_resampled[numerical_features] = scaler.fit_transform(X_resampled[numerical_features])
    X_valid_fold[numerical_features] = scaler.transform(X_valid_fold[numerical_features])
    X_test_enc[numerical_features] = scaler.transform(X_test_enc[numerical_features])

    lgb_train = lgb.Dataset(X_resampled, y_resampled)
    lgb_valid = lgb.Dataset(X_valid_fold, y_valid_fold, reference=lgb_train)

    # Use early_stopping and log_evaluation as callbacks
    early_stopping_callback = lgb.early_stopping(stopping_rounds=100, verbose=False)
    log_eval_callback = lgb.log_evaluation(period=100)

    callbacks = [early_stopping_callback, log_eval_callback]

    # Include valid_names here as well
    gbm = lgb.train(
        best_params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['training', 'valid_0'],
        callbacks=callbacks
    )

    # Predict on validation set; each training row is filled exactly once across folds
    y_valid_pred = gbm.predict(X_valid_fold, num_iteration=gbm.best_iteration)
    oof_preds[valid_index] = y_valid_pred

    # Predict on test set and accumulate the equally weighted fold average
    test_fold_pred = gbm.predict(X_test_enc, num_iteration=gbm.best_iteration)
    test_preds += test_fold_pred / skf.n_splits

Fold 1
[100]	training's auc: 0.989793	valid_0's auc: 0.92822
[200]	training's auc: 0.992893	valid_0's auc: 0.93758
[300]	training's auc: 0.995105	valid_0's auc: 0.944483
[400]	training's auc: 0.996285	valid_0's auc: 0.948415
[500]	training's auc: 0.997053	valid_0's auc: 0.950445
[600]	training's auc: 0.997659	valid_0's auc: 0.951527
[700]	training's auc: 0.998109	valid_0's auc: 0.951977
[800]	training's auc: 0.998484	valid_0's auc: 0.952172
[900]	training's auc: 0.998774	valid_0's auc: 0.952331
Fold 2
[100]	training's auc: 0.988854	valid_0's auc: 0.938867
[200]	training's auc: 0.992139	valid_0's auc: 0.946552
[300]	training's auc: 0.994439	valid_0's auc: 0.95355
[400]	training's auc: 0.995824	valid_0's auc: 0.957456
[500]	training's auc: 0.996869	valid_0's auc: 0.959781
[600]	training's auc: 0.997488	valid_0's auc: 0.960796
[700]	training's auc: 0.997901	valid_0's auc: 0.961197
[800]	training's auc: 0.998214	valid_0's auc: 0.961409
Fold 3
[100]	training's auc: 0.989352	valid_0's auc: 0

In [26]:
# Score the stitched out-of-fold predictions against the true training labels
roc_auc = roc_auc_score(y_true=y, y_score=oof_preds)
print(f'Overall ROC-AUC Score: {roc_auc}')

Overall ROC-AUC Score: 0.956608760409393


In [27]:
# Write the fold-averaged test predictions to a CSV keyed by the test ids
submission_file_path = '/content/loan_approval_submission_optimized.csv'
submission_df = pd.DataFrame(
    {
        'id': test_df.index,
        'loan_status': test_preds,
    }
)
submission_df.to_csv(submission_file_path, index=False)

In [28]:
# Print a preview of the leading rows of the submission table
submission_preview = submission_df.head()
print(submission_preview)

      id  loan_status
0  58645     0.993406
1  58646     0.014894
2  58647     0.673531
3  58648     0.010579
4  58649     0.074370
