In [1]:
#STANDARD LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#MACHINE LEARNING LIBRARIES
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
#IMPORT DATASETS
df = pd.read_csv('../input/credi-card-approval/cc_approvals.data')
print(df)

     b  30.83       0  u  g   w   v  1.25  t t.1  01  f g.1  00202  0.1  +
0    a  58.67   4.460  u  g   q   h  3.04  t   t   6  f   g  00043  560  +
1    a  24.50   0.500  u  g   q   h  1.50  t   f   0  f   g  00280  824  +
2    b  27.83   1.540  u  g   w   v  3.75  t   t   5  t   g  00100    3  +
3    b  20.17   5.625  u  g   w   v  1.71  t   f   0  f   s  00120    0  +
4    b  32.08   4.000  u  g   m   v  2.50  t   f   0  t   g  00360    0  +
..  ..    ...     ... .. ..  ..  ..   ... ..  ..  .. ..  ..    ...  ... ..
684  b  21.08  10.085  y  p   e   h  1.25  f   f   0  f   g  00260    0  -
685  a  22.67   0.750  u  g   c   v  2.00  f   t   2  t   g  00200  394  -
686  a  25.25  13.500  y  p  ff  ff  2.00  f   t   1  t   g  00200    1  -
687  b  17.92   0.205  u  g  aa   v  0.04  f   f   0  f   g  00280  750  -
688  b  35.00   3.375  u  g   c   h  8.29  f   f   0  t   g  00000    0  -

[689 rows x 16 columns]


In [4]:
header_list=[]

for col in range(len(df.columns)):
    c = "CC{}".format(col+1)
    header_list.append(c)

df = pd.read_csv('../input/credi-card-approval/cc_approvals.data', header=None, names=header_list)
print(df)

    CC1    CC2     CC3 CC4 CC5 CC6 CC7   CC8 CC9 CC10  CC11 CC12 CC13   CC14  \
0     b  30.83   0.000   u   g   w   v  1.25   t    t     1    f    g  00202   
1     a  58.67   4.460   u   g   q   h  3.04   t    t     6    f    g  00043   
2     a  24.50   0.500   u   g   q   h  1.50   t    f     0    f    g  00280   
3     b  27.83   1.540   u   g   w   v  3.75   t    t     5    t    g  00100   
4     b  20.17   5.625   u   g   w   v  1.71   t    f     0    f    s  00120   
..   ..    ...     ...  ..  ..  ..  ..   ...  ..  ...   ...  ...  ...    ...   
685   b  21.08  10.085   y   p   e   h  1.25   f    f     0    f    g  00260   
686   a  22.67   0.750   u   g   c   v  2.00   f    t     2    t    g  00200   
687   a  25.25  13.500   y   p  ff  ff  2.00   f    t     1    t    g  00200   
688   b  17.92   0.205   u   g  aa   v  0.04   f    f     0    f    g  00280   
689   b  35.00   3.375   u   g   c   h  8.29   f    f     0    t    g  00000   

     CC15 CC16  
0       0    +  
1    

In [5]:
#EXPLORATORY DATA ANALYSIS
print(df.info(), "\n")

print(df.describe(), "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   CC1     690 non-null    object 
 1   CC2     690 non-null    object 
 2   CC3     690 non-null    float64
 3   CC4     690 non-null    object 
 4   CC5     690 non-null    object 
 5   CC6     690 non-null    object 
 6   CC7     690 non-null    object 
 7   CC8     690 non-null    float64
 8   CC9     690 non-null    object 
 9   CC10    690 non-null    object 
 10  CC11    690 non-null    int64  
 11  CC12    690 non-null    object 
 12  CC13    690 non-null    object 
 13  CC14    690 non-null    object 
 14  CC15    690 non-null    int64  
 15  CC16    690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB
None 

              CC3         CC8       CC11           CC15
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    10

In [6]:
#INSPECTING NULL VALUES
for col in df.columns:
    if df[col].dtype == 'object':
        print((df[df[col].str.isalpha()==False][col]).value_counts())
        print("\n---------------------")

?    12
Name: CC1, dtype: int64

---------------------
?        12
22.67     9
20.42     7
18.83     6
24.50     6
         ..
48.25     1
28.33     1
18.75     1
18.50     1
36.42     1
Name: CC2, Length: 350, dtype: int64

---------------------
?    6
Name: CC4, dtype: int64

---------------------
?    6
Name: CC5, dtype: int64

---------------------
?    9
Name: CC6, dtype: int64

---------------------
?    9
Name: CC7, dtype: int64

---------------------
Series([], Name: CC9, dtype: int64)

---------------------
Series([], Name: CC10, dtype: int64)

---------------------
Series([], Name: CC12, dtype: int64)

---------------------
Series([], Name: CC13, dtype: int64)

---------------------
00000    132
00200     35
00120     35
00160     34
00100     30
        ... 
00021      1
00393      1
00395      1
00093      1
00256      1
Name: CC14, Length: 171, dtype: int64

---------------------
-    383
+    307
Name: CC16, dtype: int64

---------------------


In [7]:
#REPLACING '?' TO NaN
df = df.replace('?', np.NaN)

print("FIND MISSNG VALUES")
print(df.isna().sum())

for col in df.columns:
    if df[col].dtype == 'object':
        df = df.fillna(df[col].value_counts().index[0])
     
print("\n")
print("REPLACED MISSING VALUES")
print(df.isna().sum())

FIND MISSNG VALUES
CC1     12
CC2     12
CC3      0
CC4      6
CC5      6
CC6      9
CC7      9
CC8      0
CC9      0
CC10     0
CC11     0
CC12     0
CC13     0
CC14    13
CC15     0
CC16     0
dtype: int64


REPLACED MISSING VALUES
CC1     0
CC2     0
CC3     0
CC4     0
CC5     0
CC6     0
CC7     0
CC8     0
CC9     0
CC10    0
CC11    0
CC12    0
CC13    0
CC14    0
CC15    0
CC16    0
dtype: int64


In [8]:
#SPLIT FEATURES AND TARGET
X = df.drop(["CC16"], axis=1)
y = df["CC16"] 

In [9]:
#REPLACING OBJECT VALUES TO NUMERICAL VALUES
le = LabelEncoder()

for col in X.columns:
    if X[col].dtype == 'object':
        X[col]=le.fit_transform(X[col])
print(X)

     CC1  CC2     CC3  CC4  CC5  CC6  CC7   CC8  CC9  CC10  CC11  CC12  CC13  \
0      1  156   0.000    2    1   13    8  1.25    1     1     1     0     0   
1      0  328   4.460    2    1   11    4  3.04    1     1     6     0     0   
2      0   89   0.500    2    1   11    4  1.50    1     0     0     0     0   
3      1  125   1.540    2    1   13    8  3.75    1     1     5     1     0   
4      1   43   5.625    2    1   13    8  1.71    1     0     0     0     2   
..   ...  ...     ...  ...  ...  ...  ...   ...  ...   ...   ...   ...   ...   
685    1   52  10.085    3    3    5    4  1.25    0     0     0     0     0   
686    0   71   0.750    2    1    2    8  2.00    0     1     2     1     0   
687    0   97  13.500    3    3    6    3  2.00    0     1     1     1     0   
688    1   20   0.205    2    1    0    8  0.04    0     0     0     0     0   
689    1  197   3.375    2    1    2    4  8.29    0     0     0     1     0   

     CC14  CC15  
0      68     0  
1  

In [10]:
y.replace({"+": 1, "-": 0}, inplace=True)

print(y)

0      1
1      1
2      1
3      1
4      1
      ..
685    0
686    0
687    0
688    0
689    0
Name: CC16, Length: 690, dtype: int64


In [11]:
#SPLIT THE DATASET TO TRAINING AND TESTING
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 123)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(483, 15)
(207, 15)
(483,)
(207,)


In [12]:
#BUILD LOGISTIC REGRESSION CLASSIFIER
scaler = MinMaxScaler()
logreg = LogisticRegression(random_state=123)

model = make_pipeline(scaler, logreg)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# LOGISTIC REGRESSION CLASSIFIER METRIC SCORES
print("Counfusion Matrix: \n",confusion_matrix (y_test, y_pred),"\n")
print("Classification Report: \n", classification_report (y_test, y_pred), "\n")
print("Accuracy Score:  %0.3f \n" %accuracy_score(y_test,y_pred), "\n")

Counfusion Matrix: 
 [[96 13]
 [13 85]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.88      0.88       109
           1       0.87      0.87      0.87        98

    accuracy                           0.87       207
   macro avg       0.87      0.87      0.87       207
weighted avg       0.87      0.87      0.87       207
 

Accuracy Score:  0.874 
 



In [13]:
#PERFORM HYPER PARAMETER OPTIMIZATION ON LOGISTIC REGRESSION CLASSIFIER
param_grid = {"solver" : ['newton-cg', 'lbfgs', 'liblinear'],
              "penalty" : ['l2'],
              "C" : [100, 10, 1.0, 0.1, 0.01],
              "tol" : [0.1, 0.01, 0.001, 0.0001, 0.00001],
              "max_iter" : [100,150,200]}

scaler = MinMaxScaler()
logreg = LogisticRegression(random_state = 123)
grid_search = GridSearchCV (logreg, param_grid = param_grid, cv=5, scoring='accuracy',error_score=0)

model = make_pipeline(scaler, grid_search)

grid_result = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#LOGISTIC REGRESSION CLASSIFIER HYPER PARAMETER SCORES
print ("Accuracy Score: %0.3f \n" %grid_result.score(X_test, y_test))
gs = model.steps[1][1]
print ("Best Parameters: \n", gs.best_params_, "\n")
print ("Best Score: %0.3f \n"%gs.best_score_)

Accuracy Score: 0.879 

Best Parameters: 
 {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg', 'tol': 0.1} 

Best Score: 0.863 



In [14]:
#BUILD DECISION TREE CLASSIFIER
scaler = MinMaxScaler()
dtc = DecisionTreeClassifier(random_state=123)

model = make_pipeline(scaler, dtc)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)


#DECISION TREE CLASSIFIER METRIC SCORES
print("Counfusion Matrix: \n",confusion_matrix (y_test, y_pred),"\n")
print("Classification Report: \n", classification_report (y_test, y_pred), "\n")
print("Accuracy Score:  %0.3f \n" %accuracy_score(y_test,y_pred), "\n")

Counfusion Matrix: 
 [[96 13]
 [21 77]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.88      0.85       109
           1       0.86      0.79      0.82        98

    accuracy                           0.84       207
   macro avg       0.84      0.83      0.83       207
weighted avg       0.84      0.84      0.84       207
 

Accuracy Score:  0.836 
 



In [15]:
#PERFORM HYPER PARAMETER OPTIMIZATION ON DECISION TREE CLASSIFIER
param_grid = {"criterion" : ['gini','entropy'],
              "max_features": ['log2', 'sqrt','auto'],
              "splitter" : ['best','random'],
              "max_depth" : [1, 2, 3, 4, 5, 10, 20,50],
              "min_samples_leaf" : [10, 20, 30, 40, 50]}

scaler = MinMaxScaler()
dtc = DecisionTreeClassifier(random_state=123)
grid_search = GridSearchCV (dtc, param_grid = param_grid, cv=5, scoring='accuracy',error_score=0)

model = make_pipeline(scaler, grid_search)

grid_result = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#DECISION TREE CLASSIFIER HYPER PARAMETER SCORES
print ("Accuracy Score: %0.3f \n" %grid_result.score(X_test, y_test))
gs = model.steps[1][1]
print ("Best Parameters: \n", gs.best_params_, "\n")
print ("Best Score: %0.3f \n"%gs.best_score_)

Accuracy Score: 0.884 

Best Parameters: 
 {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 20, 'splitter': 'best'} 

Best Score: 0.859 



In [16]:
#BUILD GAUSSIAN NB CLASSIFIER
scaler = MinMaxScaler()
gnb = GaussianNB()

model = make_pipeline(scaler, gnb)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)


#GAUSSIAN CLASSIFIER NB CLASSIFIER METRIC SCORES
print("Counfusion Matrix: \n",confusion_matrix (y_test, y_pred),"\n")
print("Classification Report: \n", classification_report (y_test, y_pred), "\n")
print("Accuracy Score:  %0.3f \n" %accuracy_score(y_test,y_pred), "\n")

Counfusion Matrix: 
 [[104   5]
 [ 30  68]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.78      0.95      0.86       109
           1       0.93      0.69      0.80        98

    accuracy                           0.83       207
   macro avg       0.85      0.82      0.83       207
weighted avg       0.85      0.83      0.83       207
 

Accuracy Score:  0.831 
 



In [17]:
#BUILD SUPPORT VECTOR MACHINE CLASSIFIER
scaler = MinMaxScaler()
svm = SVC()

model = make_pipeline(scaler, svm)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)


#SUPPORT VECTOR MACHINE CLASSIFIER METRIC SCORES
print("Counfusion Matrix: \n",confusion_matrix (y_test, y_pred),"\n")
print("Classification Report: \n", classification_report (y_test, y_pred), "\n")
print("Accuracy Score:  %0.3f \n" %accuracy_score(y_test,y_pred), "\n")

Counfusion Matrix: 
 [[96 13]
 [11 87]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.88      0.89       109
           1       0.87      0.89      0.88        98

    accuracy                           0.88       207
   macro avg       0.88      0.88      0.88       207
weighted avg       0.88      0.88      0.88       207
 

Accuracy Score:  0.884 
 



In [18]:
#PERFORM HYPER PARAMETER OPTIMIZATION ON SUPPORT VECTOR MACHINE CLASSIFIER
param_grid = {"kernel" : ['linear', 'poly', 'rbf', 'sigmoid'],
              "C" : [100,50, 10, 1.0, 0.1, 0.001]}

scaler = MinMaxScaler()
svm = SVC()
grid_search = GridSearchCV (svm, param_grid = param_grid, cv=5, scoring='accuracy',error_score=0)

model = make_pipeline(scaler, grid_search)

grid_result = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#SUPPORT VECTOR MACHINE CLASSIFIER HYPER PARAMETER SCORES
print ("Accuracy Score: %0.3f \n" %grid_result.score(X_test, y_test))
gs = model.steps[1][1]
print ("Best Parameters: \n", gs.best_params_, "\n")
print ("Best Score: %0.3f \n"%gs.best_score_)

Accuracy Score: 0.884 

Best Parameters: 
 {'C': 1.0, 'kernel': 'rbf'} 

Best Score: 0.865 



In [19]:
#BUILD RANDOM FOREST CLASSIFIER
scaler = MinMaxScaler()
rand = RandomForestClassifier(random_state=123)

model = make_pipeline(scaler, rand)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#RANDOM FOREST CLASSIFIER METRIC SCORES
print("Counfusion Matrix: \n",confusion_matrix (y_test, y_pred),"\n")
print("Classification Report: \n", classification_report (y_test, y_pred), "\n")
print("Accuracy Score:  %0.3f \n" %accuracy_score(y_test,y_pred), "\n")

Counfusion Matrix: 
 [[101   8]
 [ 14  84]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.93      0.90       109
           1       0.91      0.86      0.88        98

    accuracy                           0.89       207
   macro avg       0.90      0.89      0.89       207
weighted avg       0.89      0.89      0.89       207
 

Accuracy Score:  0.894 
 



In [20]:
#PERFORM HYPER PARAMETER OPTIMIZATION ON RANDOM FOREST CLASSIFIER
param_grid = {"max_depth" : [3, 10 ,20],
              "max_features": ['sqrt', 'log2'],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

scaler = MinMaxScaler()
rand = RandomForestClassifier(random_state=123)
grid_search = GridSearchCV (rand, param_grid = param_grid, cv=5,scoring='accuracy',error_score=0)

model = make_pipeline(scaler, grid_search)

grid_result = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#RANDOM FOREST CLASSIFIER HYPER PARAMETER SCORES
print ("Accuracy Score: %0.3f \n" %grid_result.score(X_test, y_test))
gs = model.steps[1][1]
print ("Best Parameters: \n", gs.best_params_, "\n")
print ("Best Score: %0.3f \n"%gs.best_score_)

Accuracy Score: 0.865 

Best Parameters: 
 {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 'sqrt'} 

Best Score: 0.859 



In [21]:
#BUILD ADC CLASSIFIER
scaler = MinMaxScaler()
adc = AdaBoostClassifier(random_state=123)

model = make_pipeline(scaler, adc)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#ADC CLASSIFIER METRIC SCORES
print("Counfusion Matrix: \n",confusion_matrix (y_test, y_pred),"\n")
print("Classification Report: \n", classification_report (y_test, y_pred), "\n")
print("Accuracy Score:  %0.3f \n" %accuracy_score(y_test,y_pred), "\n")

Counfusion Matrix: 
 [[95 14]
 [16 82]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.87      0.86       109
           1       0.85      0.84      0.85        98

    accuracy                           0.86       207
   macro avg       0.86      0.85      0.85       207
weighted avg       0.86      0.86      0.85       207
 

Accuracy Score:  0.855 
 



In [22]:
#BUILD GBM CLASSIFIER
scaler = MinMaxScaler()
gbc = GradientBoostingClassifier(random_state=123)

model = make_pipeline(scaler, gbc)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#GBM CLASSIFIER METRIC SCORES
print("Counfusion Matrix: \n",confusion_matrix (y_test, y_pred),"\n")
print("Classification Report: \n", classification_report (y_test, y_pred), "\n")
print("Accuracy Score:  %0.3f \n" %accuracy_score(y_test,y_pred), "\n")

Counfusion Matrix: 
 [[93 16]
 [15 83]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.85      0.86       109
           1       0.84      0.85      0.84        98

    accuracy                           0.85       207
   macro avg       0.85      0.85      0.85       207
weighted avg       0.85      0.85      0.85       207
 

Accuracy Score:  0.850 
 



In [23]:
#BUILD VOTING CLASSIFIER
scaler = MinMaxScaler()
logreg = LogisticRegression(random_state=123)
dtc = DecisionTreeClassifier(random_state=123)
gnb = GaussianNB()
svm = SVC()
rand = RandomForestClassifier(random_state=123)
adc = AdaBoostClassifier(random_state=123)
gbc = GradientBoostingClassifier(random_state=123)

vc =VotingClassifier(estimators=[('logreg',logreg),('dtc',dtc),('gnb',gnb),('svm',svm),('rand',rand),('adc',adc),('gbc',gbc)],voting='hard')

model = make_pipeline(scaler, vc)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#VOTING CLASSIFIER METRIC SCORES
print("Counfusion Matrix: \n",confusion_matrix (y_test, y_pred),"\n")
print("Classification Report: \n", classification_report (y_test, y_pred), "\n")
print("Accuracy Score:  %0.3f \n" %accuracy_score(y_test,y_pred), "\n")

Counfusion Matrix: 
 [[97 12]
 [14 84]] 

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.89      0.88       109
           1       0.88      0.86      0.87        98

    accuracy                           0.87       207
   macro avg       0.87      0.87      0.87       207
weighted avg       0.87      0.87      0.87       207
 

Accuracy Score:  0.874 
 

