In [2]:
#Importing Libraries:
!pip install catboost
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

from catboost import Pool
from catboost import CatBoostClassifier

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
#Loading DataFrame:
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/695 project/cleaned_fraud_dataset.csv'
df = pd.read_csv(file_path)

Mounted at /content/drive


In [4]:
df.columns

Index(['timestamp', 'amount', 'transaction_type', 'merchant_category',
       'location', 'device_used', 'is_fraud', 'spending_deviation_score',
       'velocity_score', 'geo_anomaly_score', 'payment_channel'],
      dtype='object')

In [5]:
df.describe()

Unnamed: 0,amount,spending_deviation_score,velocity_score,geo_anomaly_score
count,538659.0,538659.0,538659.0,538659.0
mean,358.857463,0.000609,10.506365,0.500213
std,469.499824,1.001672,5.765816,0.288379
min,0.01,-4.59,1.0,0.0
25%,26.51,-0.67,6.0,0.25
50%,138.92,0.0,11.0,0.5
75%,503.38,0.67,16.0,0.75
max,3141.29,4.9,20.0,1.0


In [6]:
df['timestamp']

Unnamed: 0,timestamp
0,2023-12-18
1,2023-02-06
2,2023-07-26
3,2023-04-27
4,2023-03-14
...,...
538654,2023-11-23
538655,2023-05-01
538656,2023-04-18
538657,2023-09-06


In [7]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['day'] = df['timestamp'].dt.day

In [8]:
#X and y data split:

X = df.drop('is_fraud', axis = 1)
y = df['is_fraud']

In [9]:
#Categorical Data:

categorical_features = ['transaction_type', 'merchant_category', 'location', 'device_used', 'payment_channel', 'year', 'month', 'day']

In [10]:
#Training split:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 36)

In [11]:
#CatBoost Pool:

train_pool = Pool(data = X_train, label = y_train, cat_features = categorical_features)
test_pool = Pool(data = X_test, label = y_test, cat_features = categorical_features)

In [12]:
#Training Model:

model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.05,
    depth=6,
    verbose=1,
    random_seed = 19
)

model.fit(train_pool)

0:	learn: 0.6877258	total: 97.2ms	remaining: 9.62s
1:	learn: 0.6828319	total: 162ms	remaining: 7.94s
2:	learn: 0.6784105	total: 226ms	remaining: 7.29s
3:	learn: 0.6744163	total: 270ms	remaining: 6.47s
4:	learn: 0.6708034	total: 313ms	remaining: 5.96s
5:	learn: 0.6675380	total: 357ms	remaining: 5.59s
6:	learn: 0.6645832	total: 390ms	remaining: 5.18s
7:	learn: 0.6619093	total: 429ms	remaining: 4.93s
8:	learn: 0.6594899	total: 472ms	remaining: 4.78s
9:	learn: 0.6573002	total: 494ms	remaining: 4.45s
10:	learn: 0.6553181	total: 527ms	remaining: 4.26s
11:	learn: 0.6535244	total: 571ms	remaining: 4.19s
12:	learn: 0.6518999	total: 616ms	remaining: 4.12s
13:	learn: 0.6504294	total: 658ms	remaining: 4.04s
14:	learn: 0.6490973	total: 702ms	remaining: 3.98s
15:	learn: 0.6478909	total: 748ms	remaining: 3.92s
16:	learn: 0.6467985	total: 792ms	remaining: 3.87s
17:	learn: 0.6458107	total: 826ms	remaining: 3.76s
18:	learn: 0.6449125	total: 870ms	remaining: 3.71s
19:	learn: 0.6441015	total: 914ms	remain

<catboost.core.CatBoostClassifier at 0x7b20bbc91ad0>

In [13]:
y_pred = model.predict(test_pool)
y_proba = model.predict_proba(test_pool)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print('f1 score', f1_score(y_test, y_pred))
print('Precision', precision_score(y_test, y_pred))
print('Recall score', recall_score(y_test, y_pred))
print('ROC AUC score', roc_auc_score(y_test, y_proba))

Accuracy: 0.6663386923105484
f1 score 0.0
Precision 0.0
Recall score 0.0
ROC AUC score 0.4987652130086399


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Class Imbalance present in the dataset. Thus, F1 score, Precision, and Recall score are 0. No positive scores are being predicted.

In [14]:
print(y_train.value_counts())

is_fraud
False    287320
True     143607
Name: count, dtype: int64


In [15]:
#Training Model:

model_1 = CatBoostClassifier(
    auto_class_weights = 'Balanced',
    iterations=3000,
    learning_rate=0.01,
    depth=6,
    verbose=1,
    random_seed = 36
)

model_1.fit(train_pool)

0:	learn: 0.6931417	total: 185ms	remaining: 9m 16s
1:	learn: 0.6931417	total: 251ms	remaining: 6m 16s
2:	learn: 0.6931383	total: 402ms	remaining: 6m 42s
3:	learn: 0.6931383	total: 493ms	remaining: 6m 9s
4:	learn: 0.6931369	total: 655ms	remaining: 6m 32s
5:	learn: 0.6931369	total: 754ms	remaining: 6m 16s
6:	learn: 0.6931314	total: 897ms	remaining: 6m 23s
7:	learn: 0.6931304	total: 1.06s	remaining: 6m 38s
8:	learn: 0.6931300	total: 1.25s	remaining: 6m 55s
9:	learn: 0.6931298	total: 1.39s	remaining: 6m 55s
10:	learn: 0.6931295	total: 1.56s	remaining: 7m 4s
11:	learn: 0.6931288	total: 1.72s	remaining: 7m 8s
12:	learn: 0.6931277	total: 1.88s	remaining: 7m 11s
13:	learn: 0.6931267	total: 2.02s	remaining: 7m 10s
14:	learn: 0.6931267	total: 2.08s	remaining: 6m 54s
15:	learn: 0.6931261	total: 2.23s	remaining: 6m 55s
16:	learn: 0.6931261	total: 2.28s	remaining: 6m 40s
17:	learn: 0.6931251	total: 2.44s	remaining: 6m 43s
18:	learn: 0.6931242	total: 2.55s	remaining: 6m 39s
19:	learn: 0.6931235	tota

<catboost.core.CatBoostClassifier at 0x7b20bbb06f90>

In [16]:
y_pred = model_1.predict(test_pool)
y_proba = model_1.predict_proba(test_pool)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print('f1 score', f1_score(y_test, y_pred))
print('Precision', precision_score(y_test, y_pred))
print('Recall score', recall_score(y_test, y_pred))
print('ROC AUC score', roc_auc_score(y_test, y_proba))

Accuracy: 0.5136078416812089
f1 score 0.4111697943589167
Precision 0.34490234522283386
Recall score 0.508957881266344
ROC AUC score 0.5178554343586753


In [17]:
model_2 = CatBoostClassifier(
    auto_class_weights = 'Balanced',
    iterations=5000,
    learning_rate=0.2,
    depth=6,
    verbose=1,
    random_seed = 36
)

model_2.fit(train_pool)

0:	learn: 0.6930472	total: 171ms	remaining: 14m 13s
1:	learn: 0.6930472	total: 217ms	remaining: 9m 1s
2:	learn: 0.6930362	total: 460ms	remaining: 12m 46s
3:	learn: 0.6930171	total: 701ms	remaining: 14m 35s
4:	learn: 0.6930043	total: 951ms	remaining: 15m 49s
5:	learn: 0.6929749	total: 1.19s	remaining: 16m 31s
6:	learn: 0.6929727	total: 1.32s	remaining: 15m 41s
7:	learn: 0.6929695	total: 1.48s	remaining: 15m 25s
8:	learn: 0.6929561	total: 1.72s	remaining: 15m 54s
9:	learn: 0.6929436	total: 1.91s	remaining: 15m 52s
10:	learn: 0.6929381	total: 2.06s	remaining: 15m 35s
11:	learn: 0.6929380	total: 2.21s	remaining: 15m 18s
12:	learn: 0.6929261	total: 2.45s	remaining: 15m 39s
13:	learn: 0.6928670	total: 2.65s	remaining: 15m 45s
14:	learn: 0.6928670	total: 2.73s	remaining: 15m 8s
15:	learn: 0.6928582	total: 2.92s	remaining: 15m 8s
16:	learn: 0.6928554	total: 3.04s	remaining: 14m 52s
17:	learn: 0.6928430	total: 3.23s	remaining: 14m 53s
18:	learn: 0.6928367	total: 3.41s	remaining: 14m 53s
19:	lea

<catboost.core.CatBoostClassifier at 0x7b20bbe35c10>

In [18]:
y_pred = model_2.predict(test_pool)
y_proba = model_2.predict_proba(test_pool)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print('f1 score', f1_score(y_test, y_pred))
print('Precision', precision_score(y_test, y_pred))
print('Recall score', recall_score(y_test, y_pred))
print('ROC AUC score', roc_auc_score(y_test, y_proba))

Accuracy: 0.5309750120669811
f1 score 0.3752828159192908
Precision 0.33773950196942387
Recall score 0.42221665832081456
ROC AUC score 0.5029642264887563


## Hyperparameter Tunning

In [19]:
learning_rates = [0.01, 0.05, 0.1, 0.2]
depths = [4, 6, 8]
iterations_list = [1000, 2000, 5000]

best_f1 = 0
best_params = {}
best_model = None

for lr in learning_rates:
    for depth in depths:
        for iters in iterations_list:
            model = CatBoostClassifier(
                auto_class_weights='Balanced',
                iterations=iters,
                learning_rate=lr,
                depth=depth,
                verbose=1,
                random_seed=36
            )
            model.fit(train_pool)

            y_pred = model.predict(test_pool)
            y_proba = model.predict_proba(test_pool)[:, 1]

            f1 = f1_score(y_test, y_pred)

            if f1 > best_f1:
                best_f1 = f1
                best_params = {
                    'learning_rate': lr,
                    'depth': depth,
                    'iterations': iters
                }
                best_model = model

print("\nBest Parameters:")
print(best_params)
print(f"Best F1 Score: {best_f1:.4f}")

y_pred = best_model.predict(test_pool)
y_proba = best_model.predict_proba(test_pool)[:, 1]

print("\nEvaluation of Best Model:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('ROC AUC Score:', roc_auc_score(y_test, y_proba))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
11:	learn: 0.6926753	total: 1.78s	remaining: 12m 20s
12:	learn: 0.6926753	total: 1.82s	remaining: 11m 40s
13:	learn: 0.6926382	total: 2.02s	remaining: 12m
14:	learn: 0.6925852	total: 2.26s	remaining: 12m 30s
15:	learn: 0.6925486	total: 2.46s	remaining: 12m 45s
16:	learn: 0.6925486	total: 2.5s	remaining: 12m 13s
17:	learn: 0.6925467	total: 2.63s	remaining: 12m 7s
18:	learn: 0.6924693	total: 2.83s	remaining: 12m 21s
19:	learn: 0.6924666	total: 2.96s	remaining: 12m 18s
20:	learn: 0.6924666	total: 3.03s	remaining: 11m 57s
21:	learn: 0.6924658	total: 3.11s	remaining: 11m 44s
22:	learn: 0.6924650	total: 3.19s	remaining: 11m 31s
23:	learn: 0.6924323	total: 3.41s	remaining: 11m 46s
24:	learn: 0.6924323	total: 3.46s	remaining: 11m 28s
25:	learn: 0.6924322	total: 3.55s	remaining: 11m 19s
26:	learn: 0.6923667	total: 3.86s	remaining: 11m 50s
27:	learn: 0.6923519	total: 4.13s	remaining: 12m 14s
28:	learn: 0.6923516	total: 4.21s	remain

## European Dataset Model

In [20]:
df = pd.read_csv('creditcard_2023.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'creditcard_2023.csv'

In [None]:
X = df.drop('Class', axis=1)
y = df['Class']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

In [None]:
model = CatBoostClassifier(
    auto_class_weights='Balanced',
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    early_stopping_rounds=50,
    eval_metric='AUC',
    verbose=100
)

In [None]:
model.fit(train_pool)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))