# Credit card applications
Commercial banks receive a lot of applications for credit cards. Many of them get rejected for many reasons, like high loan balances, low income levels, or too many inquiries on an individual's credit report, for example. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with the power of machine learning and pretty much every commercial bank does so nowadays. In this notebook, we will build an automatic credit card approval predictor using machine learning techniques, just like the real banks do.

The dataset used in this project is the Credit Card Approval dataset from the UCI Machine Learning Repository.
http://archive.ics.uci.edu/dataset/27/credit+approval

http://rstudio-pubs-static.s3.amazonaws.com/73039_9946de135c0a49daa7a0a9eda4a67a72.html

In [43]:
import numpy as np
import pandas as pd

In [44]:
cc_data = pd.read_csv("cc_approvals.data")
cc_data.head()


Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


In [45]:
#approach 1
# Load dataset
cc_data = pd.read_csv("cc_approvals.data", header=None)

# Inspect data
cc_data.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [48]:
#approach 2
header_names=['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P']
df =  pd.read_csv("cc_approvals.data", names=header_names)
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


# EDA

In [49]:
cc_data.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [50]:

# Print DataFrame information
cc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    object 
 13  13      690 non-null    object 
 14  14      690 non-null    int64  
 15  15      690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [51]:
# Print summary statistics
cc_data.describe()


Unnamed: 0,2,7,10,14
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [None]:
'?', np.nan, NULL, NaN

In [52]:
# for each of the categorical columns, lets see the unique values
for i in cc_data.columns:
    #print(i)
    if cc_data[i].dtype==object:
        print(cc_data[i].unique())
    print('---------------')

['b' 'a' '?']
---------------
['30.83' '58.67' '24.50' '27.83' '20.17' '32.08' '33.17' '22.92' '54.42'
 '42.50' '22.08' '29.92' '38.25' '48.08' '45.83' '36.67' '28.25' '23.25'
 '21.83' '19.17' '25.00' '47.75' '27.42' '41.17' '15.83' '47.00' '56.58'
 '57.42' '42.08' '29.25' '42.00' '49.50' '36.75' '22.58' '27.25' '23.00'
 '27.75' '54.58' '34.17' '28.92' '29.67' '39.58' '56.42' '54.33' '41.00'
 '31.92' '41.50' '23.92' '25.75' '26.00' '37.42' '34.92' '34.25' '23.33'
 '23.17' '44.33' '35.17' '43.25' '56.75' '31.67' '23.42' '20.42' '26.67'
 '36.00' '25.50' '19.42' '32.33' '34.83' '38.58' '44.25' '44.83' '20.67'
 '34.08' '21.67' '21.50' '49.58' '27.67' '39.83' '?' '37.17' '25.67'
 '34.00' '49.00' '62.50' '31.42' '52.33' '28.75' '28.58' '22.50' '28.50'
 '37.50' '35.25' '18.67' '54.83' '40.92' '19.75' '29.17' '24.58' '33.75'
 '25.42' '37.75' '52.50' '57.83' '20.75' '39.92' '24.75' '44.17' '23.50'
 '47.67' '22.75' '34.42' '28.42' '67.75' '47.42' '36.25' '32.67' '48.58'
 '33.58' '18.83' '26.92' 

Handling the missing values

In [53]:
cc_data = cc_data.replace('?',np.nan)


In [54]:
cc_data.isnull().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [55]:
def fix_missing_mean(temp,col):
    ''' This function takes a data frame as input 
    replaces the missing values of a particular column with it's mean value
    '''
    #replace missing values with mean 
    temp[col] = pd.to_numeric(temp[col], errors = 'coerce')
    print(col, temp[col].mean())
    temp[col].fillna(temp[col].mean(), inplace = True)    

def fix_missing_mode(temp, col):
    print(col, temp[col].mode())
    temp[col].fillna(temp[col].mode().iloc[0], inplace = True)    
    

In [56]:
df=cc_data.copy()

In [57]:
#part 1
fix_missing_mode(df,0)
fix_missing_mode(df,3)
fix_missing_mode(df,4)
fix_missing_mode(df,5)
fix_missing_mode(df,6)

fix_missing_mean(df,1)
fix_missing_mean(df,13)

0 0    b
Name: 0, dtype: object
3 0    u
Name: 3, dtype: object
4 0    g
Name: 4, dtype: object
5 0    c
Name: 5, dtype: object
6 0    v
Name: 6, dtype: object
1 31.56817109144543
13 184.01477104874445


In [58]:
#part 1
df.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

# Splitting the dataset

In [59]:
# Separate target from features
y = df[15]
features = df.drop([15], axis=1)
# Preview features
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0


Convert categorical column into numerical

In [60]:
from sklearn.preprocessing import OrdinalEncoder
# List of categorical columns
object_cols = [0,1,3,4,5,6,8,9,11,12]

# ordinal-encode categorical columns
X = features.copy()
ordinal_encoder = OrdinalEncoder()
X[object_cols] = ordinal_encoder.fit_transform(features[object_cols])

# Preview the ordinal-encoded features
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,156.0,0.0,1.0,0.0,12.0,7.0,1.25,1.0,1.0,1,0.0,0.0,202.0,0
1,0.0,329.0,4.46,1.0,0.0,10.0,3.0,3.04,1.0,1.0,6,0.0,0.0,43.0,560
2,0.0,89.0,0.5,1.0,0.0,10.0,3.0,1.5,1.0,0.0,0,0.0,0.0,280.0,824
3,1.0,125.0,1.54,1.0,0.0,12.0,7.0,3.75,1.0,1.0,5,1.0,0.0,100.0,3
4,1.0,43.0,5.625,1.0,0.0,12.0,7.0,1.71,1.0,0.0,0,0.0,2.0,120.0,0


In [61]:
df[15].value_counts()

15
-    383
+    307
Name: count, dtype: int64

In [62]:
from sklearn.model_selection import train_test_split

xTrain, xTest, yTrain, yTest = train_test_split(X, y,
                                               test_size=0.25,
                                               random_state=2)


In [63]:
xTrain.shape, xTest.shape, yTrain.shape, yTest.shape

((517, 15), (173, 15), (517,), (173,))

# Model Developmment

In [64]:
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0,1))
rescaledxTrain = scaler.fit_transform(xTrain)
rescaledxTest = scaler.fit_transform(xTest)



In [70]:
rescaledxTrain[5]

array([1.        , 0.12931034, 0.26785714, 0.        , 0.        ,
       0.61538462, 0.875     , 0.05263158, 1.        , 1.        ,
       0.01492537, 0.        , 0.        , 0.08      , 0.00457926])

In [68]:
yTrain

637    -
315    -
165    +
370    -
322    +
      ..
534    -
584    +
493    +
527    -
168    +
Name: 15, Length: 517, dtype: object

In [66]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression 
# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(rescaledxTrain, yTrain)

LogisticRegression()

In [67]:
from sklearn.metrics import confusion_matrix
# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledxTest)
y_pred1 = logreg.predict(rescaledxTrain)

# Get the accuracy score of logreg model and print it
print("Test: Accuracy = ", logreg.score(rescaledxTest,yTest))
print("Train: Accuracy = ", logreg.score(rescaledxTrain,yTrain))

# Print the confusion matrix of the logreg model
confusion_matrix(yTest,y_pred)


Test: Accuracy =  0.8670520231213873
Train: Accuracy =  0.8588007736943907


array([[75,  9],
       [14, 75]], dtype=int64)