In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os

folder_path = '/content/drive/My Drive/home-credit-default-risk'
os.listdir(folder_path)

Mounted at /content/drive


['HomeCredit_columns_description.csv',
 'POS_CASH_balance.csv',
 'application_test.csv',
 'application_train.csv',
 'bureau.csv',
 'bureau_balance.csv',
 'credit_card_balance.csv',
 'installments_payments.csv',
 'previous_application.csv',
 'sample_submission.csv']

In [3]:
application_train = pd.read_csv(os.path.join(folder_path, 'application_train.csv'))
application_test = pd.read_csv(os.path.join(folder_path, 'application_test.csv'))
bureau = pd.read_csv(os.path.join(folder_path, 'bureau.csv'))
bureau_balance = pd.read_csv(os.path.join(folder_path, 'bureau_balance.csv'))
POS_CASH_balance = pd.read_csv(os.path.join(folder_path, 'POS_CASH_balance.csv'))
credit_card_balance = pd.read_csv(os.path.join(folder_path, 'credit_card_balance.csv'))
previous_application = pd.read_csv(os.path.join(folder_path, 'previous_application.csv'))
installments_payments = pd.read_csv(os.path.join(folder_path, 'installments_payments.csv'))

In [4]:
print('Size of application_train data', application_train.shape)
print('Size of application_test data', application_test.shape)
print('Size of POS_CASH_balance data', POS_CASH_balance.shape)
print('Size of bureau_balance data', bureau_balance.shape)
print('Size of previous_application data', previous_application.shape)
print('Size of installments_payments data', installments_payments.shape)
print('Size of credit_card_balance data', credit_card_balance.shape)
print('Size of bureau data', bureau.shape)

Size of application_train data (307511, 122)
Size of application_test data (48744, 121)
Size of POS_CASH_balance data (10001358, 8)
Size of bureau_balance data (27299925, 3)
Size of previous_application data (1670214, 37)
Size of installments_payments data (13605401, 8)
Size of credit_card_balance data (3840312, 23)
Size of bureau data (1716428, 17)


In [5]:
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder

le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in application_train:
    if application_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(application_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(application_train[col])
            # Transform both training and testing data
            application_train[col] = le.transform(application_train[col])
            application_test[col] = le.transform(application_test[col])

            # Keep track of how many columns were label encoded
            le_count += 1

print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [6]:
# one-hot encoding of categorical variables
application_train = pd.get_dummies(application_train)
application_test = pd.get_dummies(application_test)

print('Training Features shape: ', application_train.shape)
print('Testing Features shape: ', application_test.shape)

Training Features shape:  (307511, 243)
Testing Features shape:  (48744, 239)


In [7]:
# Make a new dataframe for polynomial features
poly_features = application_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET']]
poly_features_test = application_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]

# Import SimpleImputer for handling missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

# Separate the target variable
poly_target = poly_features['TARGET']

# Drop the target column from the feature set
poly_features = poly_features.drop(columns=['TARGET'])

# Impute missing values
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

# Import PolynomialFeatures and create the polynomial object
from sklearn.preprocessing import PolynomialFeatures
poly_transformer = PolynomialFeatures(degree=3)

In [8]:
# Train the polynomial features
poly_transformer.fit(poly_features)

# Transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)

Polynomial Features shape:  (307511, 35)


In [9]:
# Generate the feature names for the polynomial features
feature_names = poly_transformer.get_feature_names_out(input_features=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])

# Display the first 15 feature names
print(feature_names[:15])

['1' 'EXT_SOURCE_1' 'EXT_SOURCE_2' 'EXT_SOURCE_3' 'DAYS_BIRTH'
 'EXT_SOURCE_1^2' 'EXT_SOURCE_1 EXT_SOURCE_2' 'EXT_SOURCE_1 EXT_SOURCE_3'
 'EXT_SOURCE_1 DAYS_BIRTH' 'EXT_SOURCE_2^2' 'EXT_SOURCE_2 EXT_SOURCE_3'
 'EXT_SOURCE_2 DAYS_BIRTH' 'EXT_SOURCE_3^2' 'EXT_SOURCE_3 DAYS_BIRTH'
 'DAYS_BIRTH^2']


In [10]:
app_train_domain = application_train.copy()
app_test_domain = application_test.copy()

app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain['AMT_CREDIT'] / app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['CREDIT_TERM'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_CREDIT']
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['DAYS_EMPLOYED'] / app_train_domain['DAYS_BIRTH']

In [11]:
app_test_domain['CREDIT_INCOME_PERCENT'] = app_test_domain['AMT_CREDIT'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['ANNUITY_INCOME_PERCENT'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['CREDIT_TERM'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_CREDIT']
app_test_domain['DAYS_EMPLOYED_PERCENT'] = app_test_domain['DAYS_EMPLOYED'] / app_test_domain['DAYS_BIRTH']

In [12]:
train_labels = application_train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
application_train, application_test = application_train.align(application_test, join = 'inner', axis = 1)

# Add the target back in
application_train['TARGET'] = train_labels

print('Training Features shape: ', application_train.shape)
print('Testing Features shape: ', application_test.shape)

Training Features shape:  (307511, 240)
Testing Features shape:  (48744, 239)


In [13]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Dropping the target from the training data
if 'TARGET' in application_train:
    train = application_train.drop(columns=['TARGET'])
else:
    train = application_train.copy()

# Feature names
features = list(train.columns)

# Copy of the testing data
test = application_test.copy()

# Median imputation of missing values
imputer = SimpleImputer(strategy='median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the training data
imputer.fit(train)

# Transform both training and testing data
train = imputer.transform(train)
test = imputer.transform(test)

# Repeat with the scaler
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (307511, 239)
Testing data shape:  (48744, 239)


In [14]:
from sklearn.model_selection import train_test_split
# Ensure train_labels is aligned with the processed train data
train_labels = application_train['TARGET']

# Randomly sample 100,000 rows from the training dataset
sampled_indices = pd.Series(range(train.shape[0])).sample(n=10000, random_state=42)
train_sampled = train[sampled_indices]
labels_sampled = train_labels.iloc[sampled_indices]

# Perform an 80-20 split on the sampled data
X_train, X_valid, y_train, y_valid = train_test_split(train_sampled, labels_sampled, test_size=0.2, random_state=42)

print("Sampled Training Data Shape: ", X_train.shape)
print("Sampled Validation Data Shape: ", X_valid.shape)

Sampled Training Data Shape:  (8000, 239)
Sampled Validation Data Shape:  (2000, 239)


**Baseline Model AUC Scores**

1. Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
# Train the logistic regression model on the training split
log_reg = LogisticRegression(C=1)
log_reg.fit(X_train, y_train)

# Predict probabilities for the validation set
y_valid_pred = log_reg.predict_proba(X_valid)[:, 1]

# Calculate AUC on the validation set
auc = roc_auc_score(y_valid, y_valid_pred)
print(f'Validation AUC: {auc:.4f}')

Validation AUC: 0.7339


2. Logistic Regression with Lasso (L1) Regularization

In [16]:
# Logistic Regression with Lasso (L1 Regularization)
lasso_model = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', random_state=42)
lasso_model.fit(X_train, y_train)

# Predictions
lasso_predictions = lasso_model.predict_proba(X_valid)[:, 1]

# Calculate AUC
lasso_auc = roc_auc_score(y_valid, lasso_predictions)
print(f'Lasso Logistic Regression AUC: {lasso_auc:.4f}')

Lasso Logistic Regression AUC: 0.7348


3. Logistic Regression with Ridge (L2) Regularization

In [17]:
# Logistic Regression with Ridge (L2 Regularization)
ridge_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear', random_state=42)
ridge_model.fit(X_train, y_train)

# Predictions
ridge_predictions = ridge_model.predict_proba(X_valid)[:, 1]

# Calculate AUC
ridge_auc = roc_auc_score(y_valid, ridge_predictions)
print(f'Ridge Logistic Regression AUC: {ridge_auc:.4f}')

Ridge Logistic Regression AUC: 0.7336


4. Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
# Create the Random Forest model
random_forest = RandomForestClassifier(n_estimators=100, random_state=50, verbose=1, n_jobs=-1)

# Train on the training split
random_forest.fit(X_train, y_train)

# Predict probabilities for the validation set
y_valid_pred = random_forest.predict_proba(X_valid)[:, 1]

# Calculate AUC on the validation set
auc = roc_auc_score(y_valid, y_valid_pred)
print(f'\nValidation AUC: {auc:.4f}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s



Validation AUC: 0.6770


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [19]:
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 500, 1000],       # Number of trees
    'max_depth': [10, 20, 30, None],            # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],            # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],              # Minimum samples required at each leaf node
    'max_features': ['sqrt', 'log2', None]      # Number of features to consider when looking for the best split
}

# Randomized search with 3-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=50, n_jobs=-1),
    param_distributions=param_grid,
    n_iter=20,             # Number of parameter combinations to try
    cv=3,                  # 3-fold cross-validation
    scoring='roc_auc',     # Use AUC as the scoring metric
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# Fit the random search model
random_search.fit(X_train, y_train)

# Print the best parameters and AUC
print(f'Best Parameters: {random_search.best_params_}')
print(f'Best AUC from cross-validation: {random_search.best_score_:.4f}')

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10}
Best AUC from cross-validation: 0.6928


In [20]:
# Get the best model
best_model = random_search.best_estimator_

# Predict probabilities on the validation set
y_valid_pred = best_model.predict_proba(X_valid)[:, 1]

# Calculate AUC on the validation set
auc = roc_auc_score(y_valid, y_valid_pred)
print(f'Validation AUC with the best model: {auc:.4f}')

Validation AUC with the best model: 0.7239


5. SVM

In [21]:
from sklearn.svm import SVC
# Create the SVM model with probability outputs enabled
svm_model = SVC(kernel='rbf', probability=True, random_state=42)  # RBF kernel for non-linear SVM
svm_model.fit(X_train, y_train)

# Predict probabilities for the validation set
svm_predictions = svm_model.predict_proba(X_valid)[:, 1]

# Calculate AUC
svm_auc = roc_auc_score(y_valid, svm_predictions)
print(f'SVM AUC: {svm_auc:.4f}')

SVM AUC: 0.6677


In [22]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['rbf']
}

# Create the SVM model
svm = SVC(probability=True, random_state=42)

# Randomized search with 3-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_grid,
    n_iter=10,               # Number of parameter combinations to try
    scoring='roc_auc',       # Use AUC as the scoring metric
    cv=3,                    # 3-fold cross-validation
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# Fit the random search model
random_search.fit(X_train, y_train)

# Best parameters and AUC
print(f'Best Parameters: {random_search.best_params_}')


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'kernel': 'rbf', 'gamma': 0.1, 'C': 1}


In [23]:
# Evaluate on validation set
best_model = random_search.best_estimator_
svm_predictions = best_model.predict_proba(X_valid)[:, 1]
svm_auc = roc_auc_score(y_valid, svm_predictions)
print(f'Validation AUC with the best model: {svm_auc:.4f}')

Validation AUC with the best model: 0.6735


XGBoost

In [24]:
from xgboost import XGBClassifier

# Create the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='auc', random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict probabilities for the validation set
xgb_predictions = xgb_model.predict_proba(X_valid)[:, 1]

# Calculate AUC
xgb_auc = roc_auc_score(y_valid, xgb_predictions)
print(f'XGBoost AUC: {xgb_auc:.4f}')

Parameters: { "use_label_encoder" } are not used.



XGBoost AUC: 0.6740


In [25]:
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Randomized search with 3-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='auc', random_state=42),
    param_distributions=param_grid,
    n_iter=20,  # Number of parameter combinations to try
    scoring='roc_auc',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the random search model
random_search.fit(X_train, y_train)

# Best parameters and AUC
print(f'Best Parameters: {random_search.best_params_}')


Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'subsample': 0.6, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 1.0}


In [26]:
# Evaluate the best model on the validation set
best_model = random_search.best_estimator_
xgb_predictions = best_model.predict_proba(X_valid)[:, 1]
xgb_auc = roc_auc_score(y_valid, xgb_predictions)
print(f'Validation AUC with the best model: {xgb_auc:.4f}')

Validation AUC with the best model: 0.7334


AdaBoost

In [28]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Create the AdaBoost model with a Decision Tree base estimator
estimator = DecisionTreeClassifier(max_depth=1, random_state=42)  # Weak learner
adaboost_model = AdaBoostClassifier(estimator=estimator, random_state=42)

# Train the model
adaboost_model.fit(X_train, y_train)

# Predict probabilities for the validation set
adaboost_predictions = adaboost_model.predict_proba(X_valid)[:, 1]

# Calculate AUC
adaboost_auc = roc_auc_score(y_valid, adaboost_predictions)
print(f'AdaBoost AUC: {adaboost_auc:.4f}')



AdaBoost AUC: 0.7004


In [29]:
# Define parameter grid for AdaBoost
param_grid = {
    'n_estimators': [50, 100, 200, 500],  # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.5, 1.0],  # Shrinks contribution of each base estimator
    'estimator__max_depth': [1, 2, 3]  # Depth of the decision tree (weak learner)
}

# Randomized search with 3-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=42), random_state=42),
    param_distributions=param_grid,
    n_iter=20,  # Number of parameter combinations to try
    scoring='roc_auc',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the random search model
random_search.fit(X_train, y_train)

# Print the best parameters
print(f'Best Parameters: {random_search.best_params_}')

Fitting 3 folds for each of 20 candidates, totalling 60 fits




Best Parameters: {'n_estimators': 50, 'learning_rate': 0.1, 'estimator__max_depth': 1}


In [30]:
# Evaluate the best model on the validation set
best_model = random_search.best_estimator_
adaboost_predictions = best_model.predict_proba(X_valid)[:, 1]
adaboost_auc = roc_auc_score(y_valid, adaboost_predictions)
print(f'Validation AUC with the best model: {adaboost_auc:.4f}')

Validation AUC with the best model: 0.7310
