In [1]:
#Importing neccessary libraries.
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [2]:
#Read data using pandas read_csv and 1st five rows.
cc_app = pd.read_csv("/home/syedasamreen/syeda/cc_approvals.data",header=None)
cc_app.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [3]:
# Basic discriptive statistics information using describe.
cc_app.describe()

Unnamed: 0,2,7,10,14
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [4]:
#Data type information
cc_app.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    object 
 13  13      690 non-null    object 
 14  14      690 non-null    int64  
 15  15      690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [8]:
#replacing ? by nan
cc_app = cc_app.replace('?',np.nan)
# check for nan counts
cc_app[4].value_counts()

g     519
p     163
gg      2
Name: 4, dtype: int64

In [11]:
#Fill na value using mean of values and sum the null values.
cc_app.fillna(cc_app.mean(),inplace=True)
print(cc_app.isnull().sum())

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64


  cc_app.fillna(cc_app.mean(),inplace=True)


In [12]:
#iterate over each column and check the null values
for col in cc_app.columns:
    if cc_app[col].dtype == 'object':
        cc_app = cc_app.fillna(cc_app[col].value_counts().index[0])

print(cc_app.isnull().sum())   

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


In [13]:
# Normalize the label using label encoder and apply fit transfer method.
le = LabelEncoder()
for col in cc_app.columns.to_numpy():
    if cc_app[col].dtypes == 'object':
        cc_app[col]=le.fit_transform(cc_app[col])

In [14]:
# Split the data into train and test sets using train_test_aplit method.
cc_app = cc_app.drop([11,13],axis =1)
cc_app = cc_app.to_numpy()
X,y = cc_app[:,0:13], cc_app[:,13]
X_train,x_test,Y_train,y_test = train_test_split(X,y,test_size = 0.33,random_state=42)


In [15]:
#Scale the feature values to a uniform range using a mimimaxscalar
scaler = MinMaxScaler(feature_range=(0,1))
scaledx_train = scaler.fit_transform(X_train)
scaledx_test = scaler.transform(x_test)

In [16]:
#Fitting a logistic regression model to the train set
log_reg = LogisticRegression()
log_reg.fit(scaledx_train,Y_train)

LogisticRegression()

In [17]:
# Making predictions and evaluating performance
y_pred = log_reg.predict(scaledx_test)
print("Accuracy of logistic regression",log_reg.score(scaledx_test,y_test))
confusion_matrix(y_test,y_pred)

Accuracy of logistic regression 0.8421052631578947


array([[94,  9],
       [27, 98]])

In [18]:
#perform a grid search of the model parameters to improve the model's ability to predict credit card approvals.
# Define the grid of values for tol and max_iter
tol = [0.01, 0.001 ,0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are the corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)

In [19]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5)

# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

# Fit grid_model to the data
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.850725 using {'max_iter': 100, 'tol': 0.01}


In [20]:
"Result: Best_score: 0.850725 using {'max_iter': 100, 'tol': 0.01}"

"Result: Best_score: 0.850725 using {'max_iter': 100, 'tol': 0.01}"