In [1]:
%tensorflow_version 2.x

import tensorflow as tf
print(tf.version)

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
<module 'tensorflow._api.v2.version' from '/usr/local/lib/python3.10/dist-packages/tensorflow/_api/v2/version/__init__.py'>


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow.keras import layers
from IPython.display import clear_output


In [3]:
def print_scores(y_true, y_pred, model_name):
    print(f"{model_name} Precision: {precision_score(y_true, y_pred)*100:0.1f}%")
    print(f"{model_name} Recall: {recall_score(y_true, y_pred)*100:0.1f}%")
    print(f"{model_name} F1-Score: {f1_score(y_true, y_pred)*100:0.1f}%")
    print(f"{model_name} Confusion matrix: \n {confusion_matrix(y_true, y_pred)}")




In [4]:
from google.colab import files
files.upload()
clear_output(wait=True)


Saving kaggle.json to kaggle.json


In [5]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [6]:
!kaggle datasets download -d rupakroy/online-payments-fraud-detection-dataset


Dataset URL: https://www.kaggle.com/datasets/rupakroy/online-payments-fraud-detection-dataset
License(s): CC-BY-NC-SA-4.0
Downloading online-payments-fraud-detection-dataset.zip to /content
 99% 176M/178M [00:09<00:00, 22.8MB/s]
100% 178M/178M [00:09<00:00, 19.7MB/s]


In [7]:
!unzip -o online-payments-fraud-detection-dataset.zip


Archive:  online-payments-fraud-detection-dataset.zip
  inflating: PS_20174392719_1491204439457_log.csv  


In [8]:
df = pd.read_csv('PS_20174392719_1491204439457_log.csv')

In [9]:
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [10]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [60]:
df.isFraud.value_counts()


Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,6354407
1,8213


In [12]:
df[df['isFlaggedFraud'] == 1].count()

Unnamed: 0,0
step,16
type,16
amount,16
nameOrig,16
oldbalanceOrg,16
newbalanceOrig,16
nameDest,16
oldbalanceDest,16
newbalanceDest,16
isFraud,16


In [13]:
df[~((df['oldbalanceOrg'] - df['newbalanceOrig']) == df['amount'])]


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.0,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.0,0.00,0,0
5,1,PAYMENT,7817.71,C90045638,53860.00,46042.29,M573487274,0.0,0.00,0,0
6,1,PAYMENT,7107.77,C154988899,183195.00,176087.23,M408069119,0.0,0.00,0,0
7,1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362323,718,DEBIT,1864.24,C49652609,20426.00,18561.76,C1799009964,188746.0,190610.24,0,0
6362456,730,TRANSFER,10000000.00,C1277761503,37316255.05,27316255.05,C500987951,0.0,0.00,1,0
6362460,730,TRANSFER,10000000.00,C2140038573,17316255.05,17316255.05,C1395467927,0.0,0.00,1,1
6362462,730,TRANSFER,7316255.05,C1869569059,17316255.05,17316255.05,C1861208726,0.0,0.00,1,1


There seems to be a problem concerning the value of the amount column so I will drop the column and recalculate it

In [14]:
df.pop('amount')

Unnamed: 0,amount
0,9839.64
1,1864.28
2,181.00
3,181.00
4,11668.14
...,...
6362615,339682.13
6362616,6311409.28
6362617,6311409.28
6362618,850002.52


In [15]:
df['amount'] = df['oldbalanceOrg'] - df['newbalanceOrig']

In [16]:
# df.drop(inplace=True, columns = ['nameDest','oldbalanceDest', 'newbalanceDest']) dropping these columns lead to a much worse results

In [62]:
df.duplicated().sum()

0

In [17]:
len(df.nameOrig.unique())

6353307

In [63]:
df.isnull().sum()

Unnamed: 0,0
step,0
type,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0
isFlaggedFraud,0


I was considering one hot encoding but that will be introduce alot of columns, So, I will go with label encoding

In [18]:
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
categorical_data = df[categorical_cols]

In [19]:
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

print("\nDataFrame after Label Encoding:")
df



DataFrame after Label Encoding:


Unnamed: 0,step,type,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,amount
0,1,3,757869,170136.00,160296.36,1662094,0.00,0.00,0,0,9839.64
1,1,3,2188998,21249.00,19384.72,1733924,0.00,0.00,0,0,1864.28
2,1,4,1002156,181.00,0.00,439685,0.00,0.00,1,0,181.00
3,1,1,5828262,181.00,0.00,391696,21182.00,0.00,1,0,181.00
4,1,3,3445981,41554.00,29885.86,828919,0.00,0.00,0,0,11668.14
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,1,5651847,339682.13,0.00,505863,0.00,339682.13,1,0,339682.13
6362616,743,4,1737278,6311409.28,0.00,260949,0.00,0.00,1,0,6311409.28
6362617,743,1,533958,6311409.28,0.00,108224,68488.84,6379898.11,1,0,6311409.28
6362618,743,4,2252932,850002.52,0.00,319713,0.00,0.00,1,0,850002.52


In [20]:
y = df['isFraud']
X = df.drop(columns = ['isFraud'])
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=1, shuffle=True)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [21]:
# TPU configuration
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    print("Running on TPU")
except ValueError:
    print("TPU not found, using CPU/GPU instead")
    strategy = tf.distribute.get_strategy()

TPU not found, using CPU/GPU instead


In [22]:
# TPU strategy scope if available
with strategy.scope():
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(128, activation='relu'),
        layers.Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')  # For binary classification
    ])

    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics= [tf.keras.metrics.Recall(),  # Prioritize recall
             tf.keras.metrics.Precision(),
                'accuracy']
    )


In [23]:
y_train = y_train.to_numpy().astype(int)


In [24]:
early_stopping = EarlyStopping(monitor='val_recall', patience=10, mode='max', restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)

# Class weights to priotrize class 1
class_weight = {0: 1., 1: 8}

In [25]:
class_weight

{0: 1.0, 1: 8}

In [26]:
batch_size = 32
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=batch_size,
                    validation_split=0.2,
                    verbose=1,
                    callbacks=[early_stopping, reduce_lr],
                    class_weight=class_weight)

Epoch 1/10
[1m143159/143159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 2ms/step - accuracy: 0.9987 - loss: 0.0242 - precision: 0.4895 - recall: 0.5590 - val_accuracy: 0.9994 - val_loss: 0.0037 - val_precision: 0.9441 - val_recall: 0.5671 - learning_rate: 0.0010
Epoch 2/10
[1m143159/143159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m354s[0m 2ms/step - accuracy: 0.9989 - loss: 0.0193 - precision: 0.5722 - recall: 0.6339 - val_accuracy: 0.9993 - val_loss: 0.0055 - val_precision: 0.8313 - val_recall: 0.6006 - learning_rate: 0.0010
Epoch 3/10
[1m143159/143159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m411s[0m 3ms/step - accuracy: 0.9989 - loss: 0.0259 - precision: 0.5609 - recall: 0.6360 - val_accuracy: 0.9992 - val_loss: 0.0041 - val_precision: 0.7498 - val_recall: 0.6535 - learning_rate: 0.0010
Epoch 4/10
[1m143159/143159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m407s[0m 2ms/step - accuracy: 0.9991 - loss: 0.0221 - precision: 0.6257 - recall: 0.6585 - val_a

In [27]:
y_test = y_test.to_numpy().reshape(-1, 1)

In [28]:
y_test = np.squeeze(y_test)
print(X_test.shape)  # Should be (num_samples, num_features)
print(y_test.shape)

(636262, 10)
(636262,)


In [29]:
results = model.evaluate(X_test, y_test, verbose=1)
print(results)

[1m19884/19884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2ms/step - accuracy: 0.9991 - loss: 0.0047 - precision: 0.6382 - recall: 0.7423
[0.004719443619251251, 0.721611738204956, 0.6234177350997925, 0.9990805387496948]


In [34]:
y_train_pred_ = model.predict(X_train)

[1m178949/178949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 1ms/step


In [57]:
# thresholds = [0.4, 0.45, 0.5, 0.55, 0.6]
threshold = 0.61
y_train_pred = (y_train_pred_ > threshold).astype(int).flatten()
print_scores(y_train, y_train_pred, 'Neural Network (Train)')

Neural Network (Train) Precision: 71.3%
Neural Network (Train) Recall: 71.1%
Neural Network (Train) F1-Score: 71.2%
Neural Network (Train) Confusion matrix: 
 [[5716847    2117]
 [   2140    5254]]


Unfortunately I didn't have validation data to tune threshold to but I opted for using training data and monitering tradeoff between Precision and recall

In [30]:
y_pred_ = (model.predict(X_test))

[1m19884/19884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 1ms/step


In [58]:
y_pred = (y_pred_ > threshold).astype("int32")
print_scores(y_test, y_pred, 'Neural Network (Test)')

Neural Network (Test) Precision: 71.7%
Neural Network (Test) Recall: 70.3%
Neural Network (Test) F1-Score: 71.0%
Neural Network (Test) Confusion matrix: 
 [[635216    227]
 [   243    576]]


This threshold worked out fine as the model isn't overfitting or underfitting

In [64]:
model.save('neural_network_model_6million_ds.keras')