In [31]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, cv, Pool
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [32]:
all_data = pd.read_csv('creditcard_2023.csv')
all_data.drop(columns = ['id'], inplace = True)

In [33]:
all_data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,0.637735,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,0.529808,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,0.690708,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,0.575231,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,0.968046,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


### [Dataset Source](https://www.kaggle.com/datasets/nelgiriyewithana/credit-card-fraud-detection-dataset-2023)


### Attribute Information:
<ul>
<li><b>V1-V28</b>: Anonymized features representing various transaction attributes (e.g., time, location, etc.)</li>
<li><b>Amount</b>: The transaction amount</li>
<li><b>Class</b>: : Binary label indicating whether the transaction is fraudulent (1) or not (0)</li>
</ul>

In [34]:
X, y = all_data.drop(columns = 'Class'), all_data['Class']

In [35]:
skf = StratifiedKFold(shuffle = True, random_state=42)

## Training the models

In [36]:
results = pd.DataFrame(columns = ['f1', 'accuracy'])

### Decision tree

In [37]:
dt = tree.DecisionTreeClassifier(max_depth = 5)

In [38]:
dt_f1_mean = cross_val_score(dt, X, y, cv=skf, scoring='f1', n_jobs=-1).mean()
dt_acc_mean = cross_val_score(dt, X, y, cv=skf, scoring='accuracy', n_jobs=-1).mean()

results.loc['Decision Tree'] = [dt_f1_mean, dt_acc_mean]

### Random forest

In [39]:
rf = RandomForestClassifier(n_estimators = 10, max_depth = 5, n_jobs=-1)

In [40]:
rf_f1_mean = cross_val_score(rf, X, y, cv=skf, scoring='f1').mean()
rf_acc_mean = cross_val_score(rf, X, y, cv=skf, scoring='accuracy').mean()

results.loc['Random Forest'] = [rf_f1_mean, rf_acc_mean]

### xGboost

In [41]:
xgb = XGBClassifier(learning_rate=0.02,
                    n_estimators=100,
                    objective='binary:logistic',
                    nthread = -1)

In [42]:
xgb_f1_mean = cross_val_score(xgb, X, y, cv=skf, scoring='f1').mean()
xgb_acc_mean = cross_val_score(xgb, X, y, cv=skf, scoring='accuracy').mean()

results.loc['xGboost'] = [xgb_f1_mean, xgb_acc_mean]

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_catego

### CatBoost

In [43]:
cat_features = X.select_dtypes(include=['object']).columns.tolist()
cat = CatBoostClassifier(cat_features=cat_features)

In [44]:
def cv_scores(cv_data):
    cv_data.head(10)

    best_acc_value = cv_data['test-Accuracy-mean'].max()
    best_acc_iter = cv_data['test-Accuracy-mean'].values.argmax()

    best_f1_value = cv_data['test-F1-mean'].max()
    best_f1_iter = cv_data['test-F1-mean'].values.argmax()
    return best_f1_value, best_acc_value

In [45]:
cat_params = {'loss_function': 'Logloss',
              'iterations': 200,
              'custom_loss': ['F1', 'Accuracy'],
              'learning_rate': 0.5,}

In [46]:
train_pool = Pool(data=X, label=y, cat_features=cat_features, has_header=True)
cv_data = cv(params = cat_params,
             pool = train_pool,
             verbose = False,
             folds = skf,
             plot=True)

cat_f1_mean, cat_acc_mean = cv_scores(cv_data)

results.loc['CatBoost'] = [cat_f1_mean, cat_acc_mean]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.002671956932
bestIteration = 199

Training on fold [1/5]

bestTest = 0.003085776312
bestIteration = 187

Training on fold [2/5]

bestTest = 0.002738301156
bestIteration = 199

Training on fold [3/5]

bestTest = 0.003215283305
bestIteration = 198

Training on fold [4/5]

bestTest = 0.002910429474
bestIteration = 199



### kNN

In [47]:
decomp = TruncatedSVD(n_components=10)
knn = KNeighborsClassifier(n_jobs = -1)

knn_pipe = Pipeline(steps=[('svd', decomp),
                           ('knn', knn)])


In [48]:
knn_f1_mean = cross_val_score(knn_pipe, X, y, cv=skf, scoring='f1').mean()
knn_acc_mean = cross_val_score(knn_pipe, X, y, cv=skf, scoring='accuracy').mean()

results.loc['KNN'] = [knn_f1_mean, knn_acc_mean]

### Naive Bayes

In [49]:
gnb = GaussianNB()
gnb_pipe = Pipeline(steps=[('svd', decomp),
                           ('gnb', gnb)])

In [50]:
gnb_f1_mean = cross_val_score(gnb_pipe, X, y, cv=skf, scoring='f1', n_jobs=-1).mean()
gnb_acc_mean = cross_val_score(gnb_pipe, X, y, cv=skf, scoring='accuracy', n_jobs=-1).mean()

results.loc['GaussianNB'] = [gnb_f1_mean, gnb_acc_mean]

### Logistic Regression

In [51]:
lr = LogisticRegression()
lr_pipe = Pipeline(steps=[('svd', decomp),
                           ('lr', lr)])

In [52]:
lr_f1_mean = cross_val_score(lr, X, y, cv=skf, scoring='f1', n_jobs=-1).mean()
lr_acc_mean = cross_val_score(lr, X, y, cv=skf, scoring='accuracy', n_jobs=-1).mean()

results.loc['LogisticRegression'] = [lr_f1_mean, lr_acc_mean]

## Results

In [53]:
results

Unnamed: 0,f1,accuracy
Decision Tree,0.960225,0.960549
Random Forest,0.952145,0.953311
xGboost,0.978255,0.978513
CatBoost,0.999445,0.999444
KNN,0.925539,0.929937
GaussianNB,0.928499,0.93159
LogisticRegression,0.95188,0.953214


In [54]:
results.to_csv('fraud_detection_resuts.csv')