In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


# Task
Analyze the Home Credit Default Risk dataset to predict loan defaults using Logistic Regression and CatBoost models, define business costs for false positives and false negatives, and optimize the decision threshold to minimize total business cost.

## Data loading and initial exploration

### Subtask:
Load the necessary datasets and perform an initial exploration to understand the data structure, identify missing values, and understand the features.


**Reasoning**:
Load the train and test datasets, display the head and shape of both dataframes, and then get information and descriptive statistics for the training data, and finally check for missing values in the training data.



In [5]:
import pandas as pd
import numpy as np

# Load the datasets
try:
    application_train = pd.read_csv('/content/drive/MyDrive/credit_risk_dataset.csv')
except FileNotFoundError:
    print("Make sure the datasets are in the correct directory: '/content/drive/MyDrive/HomeCredit/'")
    application_train = None
    application_test = None

if application_train is not None and application_test is not None:
    # Display the first 5 rows of the application_train DataFrame
    print("First 5 rows of application_train:")
    display(application_train.head())

    # Print the shape of both DataFrames
    print("\nShape of application_train:", application_train.shape)
    print("Shape of application_test:", application_test.shape)

    # Use .info() on application_train
    print("\nInfo of application_train:")
    application_train.info()

    # Use .describe() on application_train
    print("\nDescription of application_train:")
    display(application_train.describe())

    # Check for the percentage of missing values in each column of application_train
    missing_values = application_train.isnull().sum()
    missing_values_percentage = (missing_values / len(application_train)) * 100

    # Display columns with a significant percentage of missing values (e.g., > 20%)
    significant_missing_values = missing_values_percentage[missing_values_percentage > 20].sort_values(ascending=False)
    print("\nColumns with significant missing values in application_train (> 20%):")
    display(significant_missing_values)

## Data Preprocessing

### Subtask:
Clean and preprocess the data. This will likely involve handling missing values, encoding categorical features, and scaling numerical features. Merge any necessary dataframes.

**Reasoning**:
First, I'll address missing values by imputing or removing them. Then, I'll identify categorical and numerical features to apply appropriate encoding and scaling techniques. Finally, I'll merge any additional dataframes that are relevant for the analysis based on the dataset documentation if applicable.

In [7]:
# Handle missing values
# For numerical features, we can impute with the median or mean
# For categorical features, we can impute with the mode or a placeholder

# Identify numerical and categorical columns
numerical_cols = application_train.select_dtypes(include=np.number).columns
categorical_cols = application_train.select_dtypes(include='object').columns

# Create copies to avoid SettingWithCopyWarning
application_train_processed = application_train.copy()

# Impute missing values in numerical columns with the median
for col in numerical_cols:
    if application_train_processed[col].isnull().sum() > 0:
        median_val = application_train_processed[col].median()
        application_train_processed[col] = application_train_processed[col].fillna(median_val)

# Impute missing values in categorical columns with the mode
for col in categorical_cols:
    if application_train_processed[col].isnull().sum() > 0:
        mode_val = application_train_processed[col].mode()[0]
        application_train_processed[col] = application_train_processed[col].fillna(mode_val)


# Verify that there are no more missing values
print("\nMissing values after imputation:")
display(application_train_processed.isnull().sum().sum())

# Encode categorical features
application_train_processed = pd.get_dummies(application_train_processed, columns=categorical_cols, drop_first=True)

# Scale numerical features (excluding the target variable if it's in numerical_cols)
# Assuming 'loan_status' is the target variable
numerical_cols_to_scale = [col for col in numerical_cols if col != 'loan_status']

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
application_train_processed[numerical_cols_to_scale] = scaler.fit_transform(application_train_processed[numerical_cols_to_scale])

print("\nData after preprocessing:")
display(application_train_processed.head())

# Update the original dataframe variable name to reflect the processing
application_train = application_train_processed


Missing values after imputation:


np.int64(0)


Data after preprocessing:


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
0,-0.903374,-0.114143,28.926614,4.019404,1.625921,1,3.931411,-0.691554,False,False,...,False,True,False,False,False,True,False,False,False,True
1,-1.060904,-0.911147,0.056763,-1.35865,0.04231,0,-0.657458,-0.938167,False,True,...,False,False,False,True,False,False,False,False,False,False
2,-0.430783,-0.911147,-0.921876,-0.646849,0.603713,1,3.74411,-0.691554,False,False,...,True,False,False,False,True,False,False,False,False,False
3,-0.745843,-0.009274,-0.187897,4.019404,1.369558,1,3.369508,-0.938167,False,False,...,True,False,False,False,True,False,False,False,False,False
4,-0.588313,-0.188358,0.790742,4.019404,1.058028,1,3.556809,-0.444942,False,False,...,True,False,False,False,True,False,False,False,False,True


## Feature Engineering

### Subtask:
Create new features that could improve the model's performance. This might include aggregating information from related tables or creating interaction terms.

**Reasoning**:
Based on the dataset documentation, I will create new features such as ratios and interaction terms from existing columns that could be indicative of loan default risk.

In [8]:
# Example Feature Engineering: Create a ratio of loan amount to income
application_train['loan_income_ratio'] = application_train['loan_amnt'] / application_train['person_income']

# Example Feature Engineering: Create an interaction term for age and employment length
application_train['age_emp_interaction'] = application_train['person_age'] * application_train['person_emp_length']

# Display the new features
print("Data with new features:")
display(application_train[['loan_income_ratio', 'age_emp_interaction']].head())

Data with new features:


Unnamed: 0,loan_income_ratio,age_emp_interaction
0,-35.213667,-26.131546
1,1.491143,-0.06022
2,0.709928,0.397128
3,-433.386139,0.140142
4,-21.339151,-0.465204


## Model Training

### Subtask:
Train binary classification models (e.g., Logistic Regression, CatBoost) on the preprocessed data.

**Reasoning**:
I will split the data into training and testing sets and then train a Logistic Regression model and a CatBoost model.

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Define features (X) and target (y)
X = application_train.drop('loan_status', axis=1)
y = application_train['loan_status']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Logistic Regression model
log_reg = LogisticRegression(solver='liblinear', random_state=42)
log_reg.fit(X_train, y_train)

# Train CatBoost model
catboost = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, random_state=42, verbose=0)
catboost.fit(X_train, y_train)

print("Logistic Regression model trained.")
print("CatBoost model trained.")

Logistic Regression model trained.
CatBoost model trained.
