In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e10/sample_submission.csv
/kaggle/input/playground-series-s4e10/train.csv
/kaggle/input/playground-series-s4e10/test.csv


In [2]:
import pandas as pd

train_data = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

# Display the first few rows of the train dataset
train_data.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [3]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 2: Load the Data
#train_data = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
#test_data = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

# Step 3: Data Preprocessing
# List of categorical columns
categorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']

# Encode categorical variables using Label Encoding
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    test_data[col] = le.transform(test_data[col])
    label_encoders[col] = le

# List of numerical columns to be scaled
numerical_columns = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 
                     'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']

# Standardize the numerical features
scaler = StandardScaler()
train_data[numerical_columns] = scaler.fit_transform(train_data[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

# Step 4: Split the Training Data
X = train_data.drop(columns=['id', 'loan_status'])
y = train_data['loan_status']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)




In [4]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import time  # Importing the time module

# Step 3: Define a Function for Model Training with Hyperparameter Tuning using GridSearchCV
def train_model_with_grid_search(model, param_grid, X_train, y_train, model_name):
    search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        verbose=1,
        n_jobs=-1  # Utilize all CPU cores
    )
    
    start_time = time.time()  # Start timing
    search.fit(X_train, y_train)
    end_time = time.time()  # End timing
    
    print(f'Best parameters for {model_name}: {search.best_params_}')
    print(f'Time taken for {model_name}: {end_time - start_time:.2f} seconds')  # Print time taken
    return search.best_estimator_

# Example Parameter Grids for GridSearchCV
xgb_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1]
}

lgb_param_grid = {
    'n_estimators': [50, 100, 150],
    'num_leaves': [20, 30, 40],
    'learning_rate': [0.01, 0.05, 0.1]
}

cat_param_grid = {
    'iterations': [50, 100, 150],
    'depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Step 4: Train Models with GridSearchCV
best_models = {}

# XGBoost Classifier
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
best_models['XGBoost'] = train_model_with_grid_search(xgb_model, xgb_param_grid, X_train, y_train, 'XGBoost')

# LightGBM Classifier
lgb_model = LGBMClassifier(random_state=42)
best_models['LightGBM'] = train_model_with_grid_search(lgb_model, lgb_param_grid, X_train, y_train, 'LightGBM')

# CatBoost Classifier
cat_model = CatBoostClassifier(random_state=42, verbose=0)
best_models['CatBoost'] = train_model_with_grid_search(cat_model, cat_param_grid, X_train, y_train, 'CatBoost')

# Step 5: Evaluate All Tuned Models
for model_name, model in best_models.items():
    model_pred = model.predict(X_val)
    model_accuracy = accuracy_score(y_val, model_pred)
    print(f'{model_name} Accuracy after Hyperparameter Tuning: {model_accuracy:.2%}')


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150}
Time taken for XGBoost: 18.32 seconds
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Number of positive: 4472, number of negative: 26805
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 876
[LightGBM] [Info] Number of data points in the train set: 31277, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142980 -> initscore=-1.790753
[LightGBM] [Info] Start training from score -1.790753
[LightGBM] [Info] Number of positive: 4472, number of negative: 26805
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022974 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bin

In [5]:
# Train models
best_models = {}
best_models['XGBoost'] = train_model_with_grid_search(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), xgb_param_grid, X_train, y_train, 'XGBoost')
best_models['LightGBM'] =train_model_with_grid_search(LGBMClassifier(random_state=42), lgb_param_grid, X_train, y_train, 'LightGBM')
best_models['CatBoost'] = train_model_with_grid_search(CatBoostClassifier(random_state=42, verbose=0), cat_param_grid, X_train, y_train, 'CatBoost')

# Step 5: Evaluate All Tuned Models
xgb_pred = best_models['XGBoost'].predict(X_val)
lgb_pred = best_models['LightGBM'].predict(X_val)
cat_pred = best_models['CatBoost'].predict(X_val)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150}
Time taken for XGBoost: 43.70 seconds
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Number of positive: 4472, number of negative: 26805
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007941 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 876
[LightGBM] [Info] Number of data points in the train set: 31277, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142980 -> initscore=-1.790753
[LightGBM] [Info] Start training from score -1.790753
[LightGBM] [Info] Number of positive: 4472, number of negative: 26805
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bin

In [6]:
# Step 6: Blend Predictions
# List to store the results
best_accuracy = 0
best_weights = (0, 0, 0)

# Iterate over a range of weights for XGBoost, LightGBM, and CatBoost
for w1 in np.arange(0, 1.1, 0.1):  # Weights for XGBoost
    for w2 in np.arange(0, 1.1 - w1, 0.1):  # Weights for LightGBM
        w3 = 1.0 - w1 - w2  # Weight for CatBoost to ensure weights sum to 1
        
        # Check if w3 is valid (should be non-negative)
        if w3 < 0:
            continue  # Skip this iteration if w3 is negative
        
        # Generate the blended predictions
        blend_pred = (w1 * xgb_pred) + (w2 * lgb_pred) + (w3 * cat_pred)
        blend_pred = np.round(blend_pred).astype(int)
        
        # Calculate the accuracy of the blended predictions
        blend_accuracy = accuracy_score(y_val, blend_pred)
        
        # Update the best weights if the current accuracy is higher
        if blend_accuracy > best_accuracy:
            best_accuracy = blend_accuracy
            best_weights = (w1, w2, w3)

        # Print the accuracy score at each step for monitoring
        print(f'Weights: XGBoost={w1:.1f}, LightGBM={w2:.1f}, CatBoost={w3:.1f} -> Accuracy: {blend_accuracy:.4f}')

# Print the best weights and the corresponding accuracy
print(f'\nBest Weights: XGBoost={best_weights[0]:.2f}, LightGBM={best_weights[1]:.2f}, CatBoost={best_weights[2]:.2f}')
print(f'Best Blended Model Accuracy: {best_accuracy:.4f}')

Weights: XGBoost=0.0, LightGBM=0.0, CatBoost=1.0 -> Accuracy: 0.9516
Weights: XGBoost=0.0, LightGBM=0.1, CatBoost=0.9 -> Accuracy: 0.9516
Weights: XGBoost=0.0, LightGBM=0.2, CatBoost=0.8 -> Accuracy: 0.9516
Weights: XGBoost=0.0, LightGBM=0.3, CatBoost=0.7 -> Accuracy: 0.9516
Weights: XGBoost=0.0, LightGBM=0.4, CatBoost=0.6 -> Accuracy: 0.9516
Weights: XGBoost=0.0, LightGBM=0.5, CatBoost=0.5 -> Accuracy: 0.9519
Weights: XGBoost=0.0, LightGBM=0.6, CatBoost=0.4 -> Accuracy: 0.9534
Weights: XGBoost=0.0, LightGBM=0.7, CatBoost=0.3 -> Accuracy: 0.9534
Weights: XGBoost=0.0, LightGBM=0.8, CatBoost=0.2 -> Accuracy: 0.9534
Weights: XGBoost=0.0, LightGBM=0.9, CatBoost=0.1 -> Accuracy: 0.9534
Weights: XGBoost=0.0, LightGBM=1.0, CatBoost=0.0 -> Accuracy: 0.9534
Weights: XGBoost=0.1, LightGBM=0.0, CatBoost=0.9 -> Accuracy: 0.9516
Weights: XGBoost=0.1, LightGBM=0.1, CatBoost=0.8 -> Accuracy: 0.9516
Weights: XGBoost=0.1, LightGBM=0.2, CatBoost=0.7 -> Accuracy: 0.9516
Weights: XGBoost=0.1, LightGBM=0.3

In [7]:
# Step 7: Make Final Predictions on Test Data
# Assuming test_data is preprocessed the same way as the training data
X_test = test_data.drop(columns=['id'])  # Drop 'id' if it exists
# Generate predictions using the best models
xgb_test_pred = best_models['XGBoost'].predict(X_test)
lgb_test_pred = best_models['LightGBM'].predict(X_test)
cat_test_pred = best_models['CatBoost'].predict(X_test)

# Blend predictions for the test set using the best weights found
final_blend_pred = (best_weights[0] * xgb_test_pred) + (best_weights[1] * lgb_test_pred) + (best_weights[2] * cat_test_pred)
final_blend_pred = np.round(final_blend_pred).astype(int)  # Corrected line

# Step 8: Create Submission File
# Load the IDs from the test data
submission_df = pd.DataFrame({
    'id': test_data['id'],  # Make sure to include the 'id' column
    'loan_status': final_blend_pred
})

# Display the first few rows of the submission DataFrame
print(submission_df.head())

# Save the DataFrame to a CSV file
submission_df.to_csv('final_submission.csv', index=False)


      id  loan_status
0  58645            1
1  58646            0
2  58647            0
3  58648            0
4  58649            0
