<a href="https://colab.research.google.com/github/svitax/credit_card_applications/blob/master/cc_apps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [90]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, f1_score

## Loading and viewing the applications

In [98]:
# Load dataset
cc_apps = pd.read_csv("cc_approvals.data", header=None)

# Inspect data
print(cc_apps.head())

  0      1      2  3  4  5  6     7  8  9   10 11 12     13   14 15
0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  00202    0  +
1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g  00043  560  +
2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  00280  824  +
3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  00100    3  +
4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  00120    0  +


## Likely features

0.   Male          : num  1 1 0 0 0 0 1 0 0 0 ...
1.   Age           : chr  "58.67" "24.50" "27.83" "20.17" ...
2.   Debt          : num  4.46 0.5 1.54 5.62 4 ...
3.   Married       : chr  "u" "u" "u" "u" ...
4.   BankCustomer  : chr  "g" "g" "g" "g" ...
5.   EducationLevel: chr  "q" "q" "w" "w" ...
6.   Ethnicity     : chr  "h" "h" "v" "v" ...
7.   YearsEmployed : num  3.04 1.5 3.75 1.71 2.5 ...
8.   PriorDefault  : num  1 1 1 1 1 1 1 1 1 0 ...
9.   Employed      : num  1 0 1 0 0 0 0 0 0 0 ...
10.  CreditScore   : num  6 0 5 0 0 0 0 0 0 0 ...    
11.  DriversLicense: chr  "f" "f" "t" "f" ...
12.  Citizen       : chr  "g" "g" "g" "s" ... 
13.  ZipCode       : chr  "00043" "00280" "00100" "00120" ... 
14.  Income        : num  560 824 3 0 0 ... 
15.  Approved      : chr  "+" "+" "+" "+" ... 

## Inspecting the applications

In [99]:
# Print summary statistics
cc_apps_description = cc_apps.describe()
print(cc_apps_description)
print("\n")

# Print DataFrame information
cc_apps_info = cc_apps.info()
print(cc_apps_info)

# Inspect missing values in the dataset
print(cc_apps.tail(17))

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 no

## Replace missing values (question marks in this dataset) with NaN.


In [100]:
# Replace the '?'s with NaN
cc_apps = cc_apps.replace("?", np.nan)

# Inspect the missing values again
print(cc_apps.tail(17))

      0      1       2  3  4   5   6      7  8  9   10 11 12     13   14 15
673  NaN  29.50   2.000  y  p   e   h  2.000  f  f   0  f  g  00256   17  -
674    a  37.33   2.500  u  g   i   h  0.210  f  f   0  f  g  00260  246  -
675    a  41.58   1.040  u  g  aa   v  0.665  f  f   0  f  g  00240  237  -
676    a  30.58  10.665  u  g   q   h  0.085  f  t  12  t  g  00129    3  -
677    b  19.42   7.250  u  g   m   v  0.040  f  t   1  f  g  00100    1  -
678    a  17.92  10.210  u  g  ff  ff  0.000  f  f   0  f  g  00000   50  -
679    a  20.08   1.250  u  g   c   v  0.000  f  f   0  f  g  00000    0  -
680    b  19.50   0.290  u  g   k   v  0.290  f  f   0  f  g  00280  364  -
681    b  27.83   1.000  y  p   d   h  3.000  f  f   0  f  g  00176  537  -
682    b  17.08   3.290  u  g   i   v  0.335  f  f   0  t  g  00140    2  -
683    b  36.42   0.750  y  p   d   v  0.585  f  f   0  f  g  00240    3  -
684    b  40.58   3.290  u  g   m   v  3.500  f  f   0  t  s  00400    0  -
685    b  21

## Impute the missing numeric values with the mean value of their respective columns


In [101]:
# Impute the missing values with mean imputation
cc_apps.fillna(cc_apps.mean(), inplace=True)

# Count the number of NaNs in the dataset to verify
print(cc_apps.isnull().sum().sum())

print(cc_apps.tail(17))

67
      0      1       2  3  4   5   6      7  8  9   10 11 12     13   14 15
673  NaN  29.50   2.000  y  p   e   h  2.000  f  f   0  f  g  00256   17  -
674    a  37.33   2.500  u  g   i   h  0.210  f  f   0  f  g  00260  246  -
675    a  41.58   1.040  u  g  aa   v  0.665  f  f   0  f  g  00240  237  -
676    a  30.58  10.665  u  g   q   h  0.085  f  t  12  t  g  00129    3  -
677    b  19.42   7.250  u  g   m   v  0.040  f  t   1  f  g  00100    1  -
678    a  17.92  10.210  u  g  ff  ff  0.000  f  f   0  f  g  00000   50  -
679    a  20.08   1.250  u  g   c   v  0.000  f  f   0  f  g  00000    0  -
680    b  19.50   0.290  u  g   k   v  0.290  f  f   0  f  g  00280  364  -
681    b  27.83   1.000  y  p   d   h  3.000  f  f   0  f  g  00176  537  -
682    b  17.08   3.290  u  g   i   v  0.335  f  f   0  t  g  00140    2  -
683    b  36.42   0.750  y  p   d   v  0.585  f  f   0  f  g  00240    3  -
684    b  40.58   3.290  u  g   m   v  3.500  f  f   0  t  s  00400    0  -
685    b 

## Impute missing non-numeric values with the most frequent value present of their respective columns


In [102]:
# We still have some non-numeric columns that have missing values
# Iterate over each column of cc_apps
for col in cc_apps.columns.to_numpy():
    # Check if the column is of object type
    if cc_apps[col].dtypes == 'object':
        # Impute with the most frequent value
        cc_apps = cc_apps.fillna(cc_apps[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
print(cc_apps.isnull().sum().sum())

print(cc_apps.tail(17))

0
    0      1       2  3  4   5   6      7  8  9   10 11 12     13   14 15
673  b  29.50   2.000  y  p   e   h  2.000  f  f   0  f  g  00256   17  -
674  a  37.33   2.500  u  g   i   h  0.210  f  f   0  f  g  00260  246  -
675  a  41.58   1.040  u  g  aa   v  0.665  f  f   0  f  g  00240  237  -
676  a  30.58  10.665  u  g   q   h  0.085  f  t  12  t  g  00129    3  -
677  b  19.42   7.250  u  g   m   v  0.040  f  t   1  f  g  00100    1  -
678  a  17.92  10.210  u  g  ff  ff  0.000  f  f   0  f  g  00000   50  -
679  a  20.08   1.250  u  g   c   v  0.000  f  f   0  f  g  00000    0  -
680  b  19.50   0.290  u  g   k   v  0.290  f  f   0  f  g  00280  364  -
681  b  27.83   1.000  y  p   d   h  3.000  f  f   0  f  g  00176  537  -
682  b  17.08   3.290  u  g   i   v  0.335  f  f   0  t  g  00140    2  -
683  b  36.42   0.750  y  p   d   v  0.585  f  f   0  f  g  00240    3  -
684  b  40.58   3.290  u  g   m   v  3.500  f  f   0  t  s  00400    0  -
685  b  21.08  10.085  y  p   e   h 

## Convert non-numeric data into numeric with label encoding

In [103]:
# Instantiate LabelEncoder
le = LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for col in cc_apps.columns.to_numpy():
    # Compare if the dtype is object
    if cc_apps[col].dtypes == 'object':
        # Use LabelEncoder to do the numeric transformation
        cc_apps[col] = le.fit_transform(cc_apps[col])

# cc_apps = cc_apps.apply(lambda col: le.fit_transform(col), axis=0, result_type='expand')

print(cc_apps.info())

print(cc_apps.tail(17))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    int64  
 1   1       690 non-null    int64  
 2   2       690 non-null    float64
 3   3       690 non-null    int64  
 4   4       690 non-null    int64  
 5   5       690 non-null    int64  
 6   6       690 non-null    int64  
 7   7       690 non-null    float64
 8   8       690 non-null    int64  
 9   9       690 non-null    int64  
 10  10      690 non-null    int64  
 11  11      690 non-null    int64  
 12  12      690 non-null    int64  
 13  13      690 non-null    int64  
 14  14      690 non-null    int64  
 15  15      690 non-null    int64  
dtypes: float64(2), int64(14)
memory usage: 86.4 KB
None
     0    1       2   3   4   5   6      7   8   9   10  11  12   13   14  15
673   1  140   2.000   3   3   5   4  2.000   0   0   0   0   0   89   17   1
674   0  214   2

## Exclude unimportant features


In [104]:
# Drop the features 11 and 13
cc_apps = cc_apps.drop([11, 13], axis=1)
print(cc_apps.info())
print(cc_apps.tail(17))

# cc_apps = cc_apps.to_numpy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    int64  
 1   1       690 non-null    int64  
 2   2       690 non-null    float64
 3   3       690 non-null    int64  
 4   4       690 non-null    int64  
 5   5       690 non-null    int64  
 6   6       690 non-null    int64  
 7   7       690 non-null    float64
 8   8       690 non-null    int64  
 9   9       690 non-null    int64  
 10  10      690 non-null    int64  
 11  12      690 non-null    int64  
 12  14      690 non-null    int64  
 13  15      690 non-null    int64  
dtypes: float64(2), int64(12)
memory usage: 75.6 KB
None
     0    1       2   3   4   5   6      7   8   9   10  12   14  15
673   1  140   2.000   3   3   5   4  2.000   0   0   0   0   17   1
674   0  214   2.500   2   1   7   4  0.210   0   0   0   0  246   1
675   0  248   1.040   2   1   0   8  0

In [105]:
# Segregate features and labels into separate variables
X, y = cc_apps.iloc[:, 0:13], cc_apps.iloc[:, 13]

print("Variances before scaling:")
print(X.var())
print("\n")

Variances before scaling:
0     2.120275e-01
1     9.252313e+03
2     2.478211e+01
3     2.276120e-01
4     7.398162e-01
5     1.946671e+01
6     6.731461e+00
7     1.119915e+01
8     2.498244e-01
9     2.451042e-01
10    2.364819e+01
12    3.112177e-01
14    2.714517e+07
dtype: float64




## Rescale all the values

In [106]:
# Instantiate StandardScaler and use it to rescale cc_apps
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X),
                       columns=X.columns)

print("Variances after scaling:")
print(X.var())
print("\n")

Variances after scaling:
0     1.001451
1     1.001451
2     1.001451
3     1.001451
4     1.001451
5     1.001451
6     1.001451
7     1.001451
8     1.001451
9     1.001451
10    1.001451
12    1.001451
14    1.001451
dtype: float64




## Splitting the dataset into train and test sets


In [107]:
# Transform into numpy array
cc_apps = cc_apps.to_numpy()

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                y,
                                test_size=0.33,
                                random_state=42)

## Fitting a logistic regression model to the train set


In [108]:
# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Making predictions and evaluating performance

In [109]:
# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(X_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ",
      logreg.score(X_test, y_test))

# Print the confusion matrix of the logreg model
print(confusion_matrix(y_test, y_pred))



Accuracy of logistic regression classifier:  0.8508771929824561
[[ 93  10]
 [ 24 101]]


## Grid searching and making the model perform better

In [110]:
# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)

## Find the best performing model

In [111]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Fit data to grid_model
grid_model_result = grid_model.fit(X_train, y_train)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.854979 using {'max_iter': 100, 'tol': 0.01}


## Evaluating best performing logistic regression model

In [112]:
# Use grid_model to predict instances from the test set and store it
y_pred = grid_model.predict(X_test)

# Get the accuracy score of grid_model and print it
print("Accuracy of the best logistic regression classifier: ",
      grid_model.score(X_test, y_test))

# Get the f1 score of grid_model and print it
print("F1 Score of the best logistic regression classifier",
      f1_score(y_test, y_pred))

# Print the confusion matrix of the logreg model
print("Confusion matrix of the best logistic regression classifier:\n", 
      confusion_matrix(y_test, y_pred))

Accuracy of the best logistic regression classifier:  0.8508771929824561
F1 Score of the best logistic regression classifier 0.8559322033898304
Confusion matrix of the best logistic regression classifier:
 [[ 93  10]
 [ 24 101]]


## Training and evaluating a k-NN model

In [113]:
knn_param_grid = {'n_neighbors': np.arange(1, 50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, knn_param_grid, cv=5)
knn_grid_model_result = knn_cv.fit(X_train, y_train)
best_knn_score, best_knn_params = knn_grid_model_result.best_score_, knn_grid_model_result.best_params_
print("k-NN Model - Best score: %f using %s" % (best_knn_score, best_knn_params))

# Predict instances from the test set and store it
y_pred = knn_grid_model_result.predict(X_test)

# Get the accuracy score and print it
print("Accuracy of the best knn classifier: ",
      knn_grid_model_result.score(X_test, y_test))

# Get the f1 score and print it
print("F1 Score of the best knn classifier",
      f1_score(y_test, y_pred))

# Print the confusion matrix of the logreg model
print("Confusion matrix of the best knn classifier:\n", 
      confusion_matrix(y_test, y_pred))

k-NN Model - Best score: 0.863675 using {'n_neighbors': 14}
Accuracy of the best knn classifier:  0.8464912280701754
F1 Score of the best knn classifier 0.859437751004016
Confusion matrix of the best knn classifier:
 [[ 86  17]
 [ 18 107]]


## Training and evaluating a linear SVM model

In [114]:
svc_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
}
svc = SVC(kernel='linear')
svc_cv = GridSearchCV(svc, svc_param_grid, cv=5)
svc_grid_model_result = svc_cv.fit(X_train, y_train)
best_svc_score, best_svc_params = svc_grid_model_result.best_score_, svc_grid_model_result.best_params_
print("SVM with linear kernel - Best score: %f using %s" % (best_svc_score, best_svc_params))

# Predict instances from the test set and store it
y_pred = svc_grid_model_result.predict(X_test)

# Get the accuracy score and print it
print("Accuracy of the best linear SVM classifier: ",
      svc_grid_model_result.score(X_test, y_test))

# Get the f1 score and print it
print("F1 Score of the best linear SVM classifier",
      f1_score(y_test, y_pred))

# Print the confusion matrix of the logreg model
print("Confusion matrix of the best linear SVM classifier:\n", 
      confusion_matrix(y_test, y_pred))

SVM with linear kernel - Best score: 0.863628 using {'C': 0.01, 'gamma': 1e-05}
Accuracy of the best linear SVM classifier:  0.8421052631578947
F1 Score of the best linear SVM classifier 0.8434782608695652
Confusion matrix of the best linear SVM classifier:
 [[95  8]
 [28 97]]


## Training and evaluating a RBF SVM model

In [115]:
rbf = SVC()
pipeline = Pipeline(steps=[('rbf', rbf)])
rbf_param_grid = {
    'rbf__C': [0.01, 0.1, 1, 10, 100],
    'rbf__gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
}

rbf_cv = GridSearchCV(pipeline, rbf_param_grid, cv=5)
rbf_grid_model_result = rbf_cv.fit(X_train, y_train)
best_rbf_score, best_rbf_params = rbf_grid_model_result.best_score_, rbf_grid_model_result.best_params_
print("SVM with RBF kernel - Best: %f using %s" % (best_rbf_score, best_rbf_params))

# Predict instances from the test set and store it
y_pred = rbf_grid_model_result.predict(X_test)

# Get the accuracy score and print it
print("Accuracy of the best rbf SVM classifier: ",
      rbf_grid_model_result.score(X_test, y_test))

# Get the f1 score and print it
print("F1 Score of the best rbf SVM classifier",
      f1_score(y_test, y_pred))

# Print the confusion matrix of the logreg model
print("Confusion matrix of the best rbf SVM classifier:\n", 
      confusion_matrix(y_test, y_pred))

SVM with RBF kernel - Best: 0.861454 using {'rbf__C': 1, 'rbf__gamma': 0.01}
Accuracy of the best rbf SVM classifier:  0.8421052631578947
F1 Score of the best rbf SVM classifier 0.8434782608695652
Confusion matrix of the best rbf SVM classifier:
 [[95  8]
 [28 97]]


## Training and evaluating a decision tree classifier

In [117]:
dt = DecisionTreeClassifier()

pipeline = Pipeline(steps=[('dt', dt)])

criterion = ['gini', 'entropy']

max_depth = [2,4,6,8,10,12]

param_grid = dict(dt__criterion=criterion,
                  dt__max_depth=max_depth)

grid_model = GridSearchCV(pipeline, param_grid, cv=5)
grid_model.fit(X_train, y_train)

best_dt_score, best_dt_params = grid_model.best_score_, grid_model.best_params_
print("Decision Tree Classifier - Best: %f using %s" % (best_dt_score, best_dt_params))

# Predict instances from the test set and store it
y_pred = grid_model.predict(X_test)

# Get the accuracy score and print it
print("Accuracy of the best decision tree classifier: ",
      grid_model.score(X_test, y_test))

# Get the f1 score and print it
print("F1 Score of the best decision tree classifier",
      f1_score(y_test, y_pred))

# Print the confusion matrix of the logreg model
print("Confusion matrix of the best decision tree classifier:\n", 
      confusion_matrix(y_test, y_pred))

Decision Tree Classifier - Best: 0.861431 using {'dt__criterion': 'gini', 'dt__max_depth': 2}
Accuracy of the best decision tree classifier:  0.8464912280701754
F1 Score of the best decision tree classifier 0.8471615720524018
Confusion matrix of the best decision tree classifier:
 [[96  7]
 [28 97]]


## Train and evaluate a bagging model

In [118]:
bc = BaggingClassifier()
pipeline = Pipeline(steps=[('bc', bc)])

n_estimators = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]

param_grid = dict(bc__n_estimators=n_estimators)

grid_model = GridSearchCV(pipeline, param_grid, cv=5)
grid_model.fit(X_train, y_train)

best_bc_score, best_bc_params = grid_model.best_score_, grid_model.best_params_
print("Bagging Classifiier - Best: %f using %s" % (best_bc_score, best_bc_params))

# Predict instances from the test set and store it
y_pred = grid_model.predict(X_test)

# Get the accuracy score and print it
print("Accuracy of the best decision tree classifier: ",
      grid_model.score(X_test, y_test))

# Get the f1 score and print it
print("F1 Score of the best decision tree classifier",
      f1_score(y_test, y_pred))

# Print the confusion matrix of the logreg model
print("Confusion matrix of the best decision tree classifier:\n", 
      confusion_matrix(y_test, y_pred))

Bagging Classifiier - Best: 0.876554 using {'bc__n_estimators': 200}
Accuracy of the best decision tree classifier:  0.868421052631579
F1 Score of the best decision tree classifier 0.8770491803278688
Confusion matrix of the best decision tree classifier:
 [[ 91  12]
 [ 18 107]]


## Training and evaluating a Random Forests model

In [120]:
rf = RandomForestClassifier()
pipeline = Pipeline(steps=[('rf', rf)])

n_estimators = [300, 350, 400, 450, 500, 550, 600]

param_grid = dict(rf__n_estimators=n_estimators)

grid_model = GridSearchCV(pipeline, param_grid, cv=5)
grid_model.fit(X_train, y_train)

best_rf_score, best_rf_params = grid_model.best_score_, grid_model.best_params_
print("Random Forests Classifier - Best: %f using %s" % (best_rf_score, best_rf_params))
# print('Best number of estimators:', grid_model.best_estimator_.get_params()['rf__n_estimators'])

y_pred = grid_model.predict(X_test)
print("Accuracy: ", grid_model.score(X_test, y_test))
print("F1 Score: ", f1_score(y_test, y_pred))
print("Confusion matrix\n", confusion_matrix(y_test, y_pred))

Random Forests Classifier - Best: 0.874404 using {'rf__n_estimators': 550}
Accuracy:  0.8640350877192983
F1 Score:  0.8755020080321285
Confusion matrix
 [[ 88  15]
 [ 16 109]]


## Training and evaluating an Adaboost model

In [121]:
ada = AdaBoostClassifier()
pipeline = Pipeline(steps=[('ada', ada)])

n_estimators = [1, 5, 10, 15, 20, 25]

param_grid = dict(ada__n_estimators=n_estimators)

grid_model = GridSearchCV(pipeline, param_grid, cv=5)
grid_model.fit(X_train, y_train)

best_ada_score, best_ada_params = grid_model.best_score_, grid_model.best_params_
print("AdaBoost Classifier - Best: %f using %s" % (best_ada_score, best_ada_params))
# print('Best number of estimators:', grid_model.best_estimator_.get_params()['ada__n_estimators'])

y_pred = grid_model.predict(X_test)
print("Accuracy: ", grid_model.score(X_test, y_test))
print("F1 Score: ", f1_score(y_test, y_pred))
print("Confusion matrix\n", confusion_matrix(y_test, y_pred))

AdaBoost Classifier - Best: 0.870126 using {'ada__n_estimators': 15}
Accuracy:  0.8289473684210527
F1 Score:  0.8354430379746836
Confusion matrix
 [[90 13]
 [26 99]]


## Training and evaluating a Gradient Boost model

In [122]:
gb = GradientBoostingClassifier()

pipeline = Pipeline(steps=[('gb', gb)])

n_estimators = [1, 10, 50, 100, 150, 200]

param_grid = dict(gb__n_estimators=n_estimators)

grid_model = GridSearchCV(pipeline, param_grid, cv=5)
grid_model.fit(X_train, y_train)

best_gb_score, best_gb_params = grid_model.best_score_, grid_model.best_params_
print("AdaBoost Classifier - Best: %f using %s" % (best_gb_score, best_gb_params))
# print('Best number of estimators:', grid_model.best_estimator_.get_params()['gb__n_estimators'])

y_pred = grid_model.predict(X_test)
print("Accuracy: ", grid_model.score(X_test, y_test))
print("F1 Score: ", f1_score(y_test, y_pred))
print("Confusion matrix\n", confusion_matrix(y_test, y_pred))

AdaBoost Classifier - Best: 0.855026 using {'gb__n_estimators': 50}
Accuracy:  0.8508771929824561
F1 Score:  0.8571428571428571
Confusion matrix
 [[ 92  11]
 [ 23 102]]
