![Credit card being held in hand](credit_card.jpg)

Commercial banks receive _a lot_ of applications for credit cards. Many of them get rejected for many reasons, like high loan balances, low income levels, or too many inquiries on an individual's credit report, for example. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with the power of machine learning and pretty much every commercial bank does so nowadays. In this workbook, you will build an automatic credit card approval predictor using machine learning techniques, just like real banks do.

### The Data

The data is a small subset of the Credit Card Approval dataset from the UCI Machine Learning Repository showing the credit card applications a bank receives. This dataset has been loaded as a `pandas` DataFrame called `cc_apps`. The last column in the dataset is the target value.

In [64]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


# Load the dataset
cc_apps = pd.read_csv("cc_approvals.data", header=None) 
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,g,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,g,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,g,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,g,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,s,0,+


In [65]:
cc_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    int64  
 13  13      690 non-null    object 
dtypes: float64(2), int64(2), object(10)
memory usage: 75.6+ KB


In [66]:
cc_apps.describe()

Unnamed: 0,2,7,10,12
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [67]:
# Create a DF copy
cc_apps_copy = cc_apps

In [68]:
# Identify the categorical columns
categorical_columns = cc_apps.select_dtypes(include=['object']).columns

# Identify the numerical columns
numeric_columns = cc_apps.select_dtypes(include=['float64','int64']).columns

In [69]:
# Initialize the MinMaxScaler
scaler = StandardScaler()

# Scale the numeric columns
cc_apps_copy[numeric_columns] = scaler.fit_transform(cc_apps[numeric_columns])

# Display the first few rows of the updated DataFrame
cc_apps_copy.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,b,30.83,-0.956613,u,g,w,v,-0.291083,t,t,-0.288101,g,-0.195413,+
1,a,58.67,-0.060051,u,g,q,h,0.24419,t,t,0.74083,g,-0.087852,+
2,a,24.5,-0.856102,u,g,q,h,-0.216324,t,f,-0.493887,g,-0.037144,+
3,b,27.83,-0.647038,u,g,w,v,0.456505,t,t,0.535044,g,-0.194837,+
4,b,20.17,0.174141,u,g,w,v,-0.153526,t,f,-0.493887,s,-0.195413,+


In [70]:
cc_apps_copy.describe()

Unnamed: 0,2,7,10,12
count,690.0,690.0,690.0,690.0
mean,2.0595440000000003e-17,8.238177000000001e-17,-2.0595440000000003e-17,-2.0595440000000003e-17
std,1.000725,1.000725,1.000725,1.000725
min,-0.9566132,-0.6648767,-0.4938866,-0.1954133
25%,-0.7555902,-0.6155359,-0.4938866,-0.1954133
50%,-0.4037999,-0.3658414,-0.4938866,-0.194453
75%,0.4922602,0.1200908,0.1234717,-0.1194481
max,4.672031,7.857628,13.29378,19.01199


In [71]:
# Apply one-hot encoding to the categorical columns
cc_apps_encoded = pd.get_dummies(cc_apps_copy, columns=categorical_columns, drop_first=True)
cc_apps_encoded.head()

Unnamed: 0,2,7,10,12,0_a,0_b,1_15.17,1_15.75,1_15.83,1_15.92,1_16.00,1_16.08,1_16.17,1_16.25,1_16.33,1_16.50,1_16.92,1_17.08,1_17.25,1_17.33,1_17.42,1_17.50,1_17.58,1_17.67,1_17.83,1_17.92,1_18.00,1_18.08,1_18.17,1_18.25,1_18.33,1_18.42,1_18.50,1_18.58,1_18.67,1_18.75,1_18.83,1_18.92,1_19.00,1_19.17,...,1_71.58,1_73.42,1_74.83,1_76.75,1_80.25,1_?,3_l,3_u,3_y,4_g,4_gg,4_p,5_aa,5_c,5_cc,5_d,5_e,5_ff,5_i,5_j,5_k,5_m,5_q,5_r,5_w,5_x,6_bb,6_dd,6_ff,6_h,6_j,6_n,6_o,6_v,6_z,8_t,9_t,11_p,11_s,13_-
0,-0.956613,-0.291083,-0.288101,-0.195413,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0
1,-0.060051,0.24419,0.74083,-0.087852,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0
2,-0.856102,-0.216324,-0.493887,-0.037144,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,-0.647038,0.456505,0.535044,-0.194837,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0
4,0.174141,-0.153526,-0.493887,-0.195413,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0


In [72]:
# Target variable
target = cc_apps_encoded.iloc[:, -1]

# Feature variables
features = cc_apps_encoded.iloc[:, :-1]

print("Features shape:", features.shape)
print("Target shape:", target.shape)

Features shape: (690, 388)
Target shape: (690,)


In [73]:
# Convert column names to strings
features.columns = features.columns.astype(str)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)



In [74]:
# Define the parameter grid for Logistic Regression
param_grid_log_reg = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# Define the parameter grid for Decision Tree
param_grid_decision_tree = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the parameter grid for XGBoost
param_grid_xgboost = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9]
}

# Initialize the models
log_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier(random_state=42)
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Initialize GridSearchCV for each model
grid_log_reg = GridSearchCV(estimator=log_reg, param_grid=param_grid_log_reg, cv=5, scoring='accuracy', n_jobs=-1)
grid_decision_tree = GridSearchCV(estimator=decision_tree, param_grid=param_grid_decision_tree, cv=5, scoring='accuracy', n_jobs=-1)
grid_xgboost = GridSearchCV(estimator=xgboost, param_grid=param_grid_xgboost, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the models
grid_log_reg.fit(X_train, y_train)
grid_decision_tree.fit(X_train, y_train)
grid_xgboost.fit(X_train, y_train)

# Get the best parameters and the best score for each model
best_params_log_reg = grid_log_reg.best_params_
best_score_log_reg = grid_log_reg.best_score_

best_params_decision_tree = grid_decision_tree.best_params_
best_score_decision_tree = grid_decision_tree.best_score_

best_params_xgboost = grid_xgboost.best_params_
best_score_xgboost = grid_xgboost.best_score_

# Print the best parameters and scores
print("Best Logistic Regression Params:", best_params_log_reg)
print("Best Logistic Regression Accuracy:", best_score_log_reg)

print("Best Decision Tree Params:", best_params_decision_tree)
print("Best Decision Tree Accuracy:", best_score_decision_tree)

print("Best XGBoost Params:", best_params_xgboost)
print("Best XGBoost Accuracy:", best_score_xgboost)

# Determine which model has the best score
best_model_name = None
best_model_score = 0

if best_score_log_reg > best_model_score:
    best_model_name = 'Logistic Regression'
    best_model_score = best_score_log_reg

if best_score_decision_tree > best_model_score:
    best_model_name = 'Decision Tree'
    best_model_score = best_score_decision_tree

if best_score_xgboost > best_model_score:
    best_model_name = 'XGBoost'
    best_model_score = best_score_xgboost

print(f"The best model is {best_model_name} with an accuracy of {best_model_score:.4f}")

Best Logistic Regression Params: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Logistic Regression Accuracy: 0.8912858312858314
Best Decision Tree Params: {'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best Decision Tree Accuracy: 0.8495823095823095
Best XGBoost Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best XGBoost Accuracy: 0.8804586404586404
The best model is Logistic Regression with an accuracy of 0.8913


In [75]:
# Best score
best_score = best_model_score
print(best_score)

0.8912858312858314
