## Name
### <u>Yi Huang</u>

## Research Question/ Hypothesis
### <u>Credit Card Fraud Detection with Unknown Feature Data</u>

## Load Data

In [None]:
!pip install category_encoders
!pip install -U imbalanced-learn

In [80]:
from   category_encoders          import *
import numpy as np
import pandas as pd
from   sklearn.compose            import *
from   sklearn.ensemble           import RandomForestClassifier, GradientBoostingClassifier
from   sklearn.experimental       import enable_iterative_imputer
from   sklearn.impute             import *
from   sklearn.metrics            import roc_auc_score # We have not covered it yet in class. The basics - AUC is from 0 to 1 and higher is better.
from   imblearn.pipeline          import Pipeline
from   sklearn.preprocessing      import *
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition   import PCA
from   sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from   imblearn.pipeline          import make_pipeline
from   sklearn.metrics            import balanced_accuracy_score
import imblearn
from sklearn.metrics import confusion_matrix

In [35]:
url = 'https://raw.githubusercontent.com/yihuang1995/Fraud_Detection_with_Unknown_Features/main/creditcard_down.csv'
data = pd.read_csv(url)

In [36]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [37]:
# Split data with balanced class
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state = 42,test_size = 0.2,stratify=y)

# Model1: PCA + Random Forest

Due to 28 columns' definitions are unknown, for model1's feature engineering I applied PCA to demcompose the dataset.

In [38]:
# We need standardize the data before PCA, the data has no missing values
pipe = make_pipeline(StandardScaler(),
                     PCA(),
                     imblearn.over_sampling.SMOTE(),
                     RandomForestClassifier()
                    )

In [39]:
hyperparameters = dict(randomforestclassifier__n_estimators     = [50,100,150,200,250,300], #control the number of trees
                       randomforestclassifier__max_depth        = range(3, 10), # prevent over fitting
                       randomforestclassifier__min_samples_leaf = range(1, 15), # prevent over fitting
                       randomforestclassifier__bootstrap=[True,False], #whether bootstrap samples are used
                       randomforestclassifier__criterion= ["gini", "entropy"],# criteria for splitting
                       pca__n_components = range(2,9)) # control the output group of feature columns 
clf_rand_cv = RandomizedSearchCV(estimator=pipe, 
                              param_distributions=hyperparameters, 
                              n_iter=25, #25
                              cv=5, 
                              n_jobs=-1,
                              verbose=False)
clf_rand_cv.fit(X_train, y_train)
clf_rand_cv.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=8)), ('smote', SMOTE()),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=False, criterion='entropy',
                                        max_depth=5, min_samples_leaf=9,
                                        n_estimators=250))])

In [40]:
model1 = clf_rand_cv.best_estimator_
model1.fit(X_train,y_train.values.ravel())

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=8)), ('smote', SMOTE()),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=False, criterion='entropy',
                                        max_depth=5, min_samples_leaf=9,
                                        n_estimators=250))])

In [66]:
# Metric 1 balanced_accuracy_score
y_pred1 = model1.predict(X_test)
balanced_accuracy_score(y_test.values.ravel(), y_pred1)

0.9316430125349633

# Model2: Feature Selection + GradientBoosting

EDA shows that after performing log transformation, certain features' correlation with y get higher, I will perform column transformation on those columns.

In [63]:
log_col = ['V8', 'V21', 'V22', 'V23', 'V27', 'V28', 'Amount']
def log_transform(X):
    return np.where(X>0, np.log(X), 0)
preprocessing = ColumnTransformer(transformers=[('log', FunctionTransformer(log_transform, validate=False), log_col)], 
                                  remainder='passthrough')
pipe2 = Pipeline([('log_transform',preprocessing),
                 ('Sampling',imblearn.over_sampling.SMOTE()),
                 ('GB',GradientBoostingClassifier())])

In [65]:
np.seterr(divide = 'ignore',invalid = 'ignore')
hyperparameters = dict(GB__n_estimators     = [50,100,150,200,250,300], #control the number of trees
                       GB__max_depth        = range(3, 10), # prevent over fitting
                       GB__min_samples_leaf = range(1, 10), # prevent over fitting
                       GB__subsample = [0.6,0.8,1]) # sample number for fitting 
gb_rand_cv = RandomizedSearchCV(estimator=pipe2, 
                              param_distributions=hyperparameters, 
                              n_iter=25, #25
                              cv=3, 
                              n_jobs=-1,
                              verbose=False)
gb_rand_cv.fit(X_train, y_train)
gb_rand_cv.best_estimator_

  return np.where(X>0, np.log(X), 0)
  return np.where(X>0, np.log(X), 0)


Pipeline(steps=[('log_transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('log',
                                                  FunctionTransformer(func=<function log_transform at 0x12181a8b0>),
                                                  ['V8', 'V21', 'V22', 'V23',
                                                   'V27', 'V28', 'Amount'])])),
                ('Sampling', SMOTE()),
                ('GB',
                 GradientBoostingClassifier(max_depth=9, min_samples_leaf=4,
                                            n_estimators=300, subsample=1))])

In [71]:
model2 = gb_rand_cv.best_estimator_
model2.fit(X_train,y_train.values.ravel())

Pipeline(steps=[('log_transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('log',
                                                  FunctionTransformer(func=<function log_transform at 0x12181a8b0>),
                                                  ['V8', 'V21', 'V22', 'V23',
                                                   'V27', 'V28', 'Amount'])])),
                ('Sampling', SMOTE()),
                ('GB',
                 GradientBoostingClassifier(max_depth=9, min_samples_leaf=4,
                                            n_estimators=300, subsample=1))])

In [72]:
# Metric 1 balanced_accuracy_score
y_pred2 = model2.predict(X_test)
balanced_accuracy_score(y_test.values.ravel(), y_pred2)

0.9418470941676163

# Evaluation Metric 

In [73]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


In [79]:
balanced_accuracy_1 = balanced_accuracy_score(y_test.values.ravel(), y_pred1)
f1_1 = f1_score(y_test.values.ravel(), y_pred1, average='binary')
precision_1 = precision_score(y_test.values.ravel(), y_pred1, average='binary')
recall_1 = recall_score(y_test.values.ravel(), y_pred1, average='binary')
balanced_accuracy_2 = balanced_accuracy_score(y_test.values.ravel(), y_pred2)
f1_2 = f1_score(y_test.values.ravel(), y_pred2, average='binary')
precision_2 = precision_score(y_test.values.ravel(), y_pred2, average='binary')
recall_2 = recall_score(y_test.values.ravel(), y_pred2, average='binary')

print("Model1 Test Accuracy: {}".format(balanced_accuracy_1))  # subset accuracy
print("Model1 Test F1 score: {}".format(f1_1))
print("Model1 Test Precision: {}".format(precision_1))
print("Model1 Test Recall: {}".format(recall_1))
print('---------------------------')
print("Model2 Test Accuracy: {}".format(balanced_accuracy_2))  # subset accuracy
print("Model2 Test F1 score: {}".format(f1_2))
print("Model2 Test Precision: {}".format(precision_2))
print("Model2 Test Recall: {}".format(recall_2))

Model1 Test Accuracy: 0.9316430125349633
Model1 Test F1 score: 0.9090909090909092
Model1 Test Precision: 0.9550561797752809
Model1 Test Recall: 0.8673469387755102
---------------------------
Model2 Test Accuracy: 0.9418470941676163
Model2 Test F1 score: 0.9206349206349207
Model2 Test Precision: 0.9560439560439561
Model2 Test Recall: 0.8877551020408163


In [77]:
confusion_matrix(y_test.values.ravel(), y_pred2)

array([[981,   4],
       [ 11,  87]])

# Results
### Best model: Feature Selection + GradientBoosting model

### Final Model Parameters

In [75]:
gb_rand_cv.best_estimator_

Pipeline(steps=[('log_transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('log',
                                                  FunctionTransformer(func=<function log_transform at 0x12181a8b0>),
                                                  ['V8', 'V21', 'V22', 'V23',
                                                   'V27', 'V28', 'Amount'])])),
                ('Sampling', SMOTE()),
                ('GB',
                 GradientBoostingClassifier(max_depth=9, min_samples_leaf=4,
                                            n_estimators=300, subsample=1))])

### Summary
* Model: Feature Selection + GradientBoosting model is slightly better than the PCA + RandomFrorest model. One reason might be that PCA reduces the information model can get from the data. Another reason is GradientBoosting algorithm might have better prediction power than the RandomForest.
* Business: This is a fraud detection problem, we care the recall of the model, which means that how many fraud cases can be found from the model. 0.89 recall score means a pretty good model. If we are working as a vendor for our client, some features might be encrypted and unknown the true meaning of it, it is a good practice to know how to do feature engineering on those features.

### Next Steps
* Deeper feature engineering: more transformation, combining features, add new features from original features
* More complex model: Ensamble model, Stacked model