<a href="https://colab.research.google.com/github/techwithlik/LoanPrediction/blob/main/LoanPrediction_Stripped.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost



In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer

warnings.filterwarnings('ignore')

In [None]:
# Read the labelled training set and split it into predictors and target.
TARGET_COLUMN = 'class'
data = pd.read_csv('credit_data_train.csv', encoding='utf8')
# Predictors: every column other than the target. `drop` returns a new frame,
# so `data` itself is left untouched.
X = data.drop(TARGET_COLUMN, axis=1)
# Target: the label column.
y = data[TARGET_COLUMN]

In [None]:
# Integer codes assigned to each label of the features treated as ordinal.
# A label that is absent from its mapping (or a missing value) becomes NaN
# after `.map`.
# NOTE(review): 'employment' ranks 'Unskilled' (1) below 'Unemployed' (2), and
# 'current_account_balance' ranks '< 0 GBP' below 'No current account' --
# confirm these orderings are intended.
# NOTE(review): three 'credit_history' labels share code 3, i.e. they are
# deliberately (?) merged into a single level -- confirm.
ordinal_mappings = {
    'current_account_balance': {'< 0 GBP': 1, 'No current account': 2, '0 - 160 GBP': 3, '>= 160 GBP': 4},
    'credit_history': {'Outstanding credits existing': 1, 'Delay in paying off in the past': 2, 'Existing credits paid back': 3, 'No credits taken/All credits paid back': 3, 'All credits at this bank paid back': 3},
    'savings_account_balance': {'Unknown/No savings account': 1, '< 80 GBP': 2, '80 - 400 GBP': 3, '400 - 800 GBP': 4, '>= 800 GBP': 5},
    'length_of_employment': {'Unemployed': 1, '< 1 year': 2, '1-4 years': 3, '4-7 years': 4, '>= 7 years': 5},
    'most_valuable_asset': {'No assets': 1, 'Savings Account/Life Insurance': 2, 'Car or Other': 3, 'Real Estate': 4},
    'employment': {'Unskilled': 1, 'Unemployed': 2, 'Professionals': 3, 'Experienced professionals': 4},
}

# Encode from the untouched `data` frame rather than from `X` itself so that
# this cell is idempotent: re-running it must not push already-encoded
# integers through the label mapping again (that would turn every value
# into NaN). `X` was derived from `data`, so both share the same row index.
for feature, mapping in ordinal_mappings.items():
    X[feature] = data[feature].map(mapping)

In [None]:
# Split the columns by dtype: numeric columns (this includes the ordinal
# features, which hold integer codes at this point) versus object/bool columns.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
other_features = list(X.select_dtypes(include=['object', 'bool']).columns)

# Numeric columns: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Remaining columns: fill missing values with the most frequent label, then
# one-hot encode. Labels unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# The numerical block is listed first, so in the transformed array the column
# position of a numeric feature equals its position in `numerical_features`.
# sparse_threshold=0 forces a dense ndarray: the one-hot encoder emits a
# sparse matrix, and with the default threshold the combined output may be
# sparse, which breaks the ndarray copy / slice-assignment done in later cells.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, other_features)
    ],
    sparse_threshold=0,
)

# Fit the preprocessing on the training features and transform them.
# The result is a plain array; column names are not carried over.
X_preprocessed = preprocessor.fit_transform(X)

In [None]:
# One quantile-based discretizer per continuous feature; each keeps its own
# fitted bin edges so the same binning can be re-applied to other data.
binning_age = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=6)
binning_loan_duration = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=7)
binning_loan_value = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=7)

# Work on a copy so that X_preprocessed keeps the un-binned values.
X_binned = np.copy(X_preprocessed)
for column_name, discretizer in [
    ('age', binning_age),
    ('loan_duration', binning_loan_duration),
    ('loan_value', binning_loan_value),
]:
    # Position of the feature among the numerical columns of the array.
    column_position = numerical_features.index(column_name)
    # The discretizer expects a 2-D (n_samples, 1) input.
    raw_column = X_preprocessed[:, column_position].reshape(-1, 1)
    # Overwrite the raw values with their ordinal bin index.
    X_binned[:, column_position] = discretizer.fit_transform(raw_column).ravel()

# Balance the classes by randomly duplicating minority-class rows.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_binned, y)

In [None]:
# Rank the features with an ExtraTrees forest and keep the 25 highest-ranked.
et_clf = ExtraTreesClassifier(random_state=18)

# threshold=-np.inf disables the importance cut-off, so the selection is
# driven purely by max_features.
sfm = SelectFromModel(et_clf, threshold=-np.inf, max_features=25)

# SelectFromModel.fit trains its own clone of `et_clf`; fitting `et_clf`
# separately beforehand would only train the identical forest a second time.
sfm.fit(X_resampled, y_resampled)

# Keep `et_clf` pointing at a fitted forest (the clone trained by the selector).
et_clf = sfm.estimator_

# Reduce the training matrix to the selected feature columns.
X_selected = sfm.transform(X_resampled)

In [None]:
# Hyper-parameters obtained from tuning.
param = {
    'iterations': 100,
    'depth': 6,
    'learning_rate': 0.12,
    'l2_leaf_reg': 7,
    'subsample': 0.9
}

# Final model: trained on the full oversampled, feature-selected training set.
# This is the model used for the test-set predictions.
model_cb = CatBoostClassifier(**param, silent=True, random_state=18)
model_cb.fit(X_selected, y_resampled)

# Cross-validated estimate of generalisation performance.
# Oversampling and feature selection are placed INSIDE the pipeline so they are
# re-fitted on the training part of every fold only. Scoring on data that was
# oversampled before splitting puts duplicated minority rows into both the
# training and the validation fold and inflates the ROC AUC.
cv_pipeline = ImbPipeline(steps=[
    ('oversample', RandomOverSampler(random_state=0)),
    ('select', SelectFromModel(ExtraTreesClassifier(random_state=18),
                               threshold=-np.inf, max_features=25)),
    ('model', CatBoostClassifier(**param, silent=True, random_state=18)),
])

# Explicit stratified 5-fold split (no shuffling), evaluated on the original,
# un-resampled rows. Expect a lower but more honest score than one computed
# on pre-oversampled data.
score_cb = cross_val_score(cv_pipeline, X_binned, y, scoring='roc_auc',
                           cv=StratifiedKFold(n_splits=5)).mean()
print(f"CatBoost's ROC AUC score: {score_cb:.2%}")

CatBoost's ROC AUC score: 94.31%


In [None]:
# Read the unlabelled test set; the first CSV column becomes the index.
X_test = pd.read_csv('credit_data_test.csv', encoding='utf8', index_col=0)

# Encode the ordinal features with the same label -> code tables as in training.
for feature, mapping in ordinal_mappings.items():
    X_test[feature] = X_test[feature].map(mapping)

# Re-apply the already-fitted imputation / one-hot encoding.
X_test_preprocessed = preprocessor.transform(X_test)

# Re-apply the already-fitted discretizers (transform only, no refit) and
# overwrite the raw values in a copy of the preprocessed matrix.
X_test_binned = np.copy(X_test_preprocessed)
for column_name, discretizer in [
    ('age', binning_age),
    ('loan_duration', binning_loan_duration),
    ('loan_value', binning_loan_value),
]:
    column_position = numerical_features.index(column_name)
    raw_column = X_test_preprocessed[:, column_position].reshape(-1, 1)
    X_test_binned[:, column_position] = discretizer.transform(raw_column).ravel()

# Keep only the feature columns chosen by the fitted selector.
X_test_selected = sfm.transform(X_test_binned)

# Predict class labels with the trained CatBoost model.
test_results = model_cb.predict(X_test_selected)

# Fill the sample submission with the predictions and write it out.
# The predictions are assigned positionally, i.e. the rows of the sample
# submission are taken to be in the same order as the test set.
submission_df = pd.read_csv("./sample_submission.csv", index_col="id")
submission_df['class'] = test_results
submission_df.to_csv('my_submission.csv', index=True)

submission_df.head()

Unnamed: 0_level_0,class
id,Unnamed: 1_level_1
0,1
1,1
2,0
3,1
4,1
