In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers, optimizers

In [2]:
data = pd.read_csv("creditcard.csv")
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [3]:
print("Dataset shape:", data.shape)
print("\nFirst few rows:")
print(data.head())
print("\nClass distribution:")
print(data['Class'].value_counts())
print("\nDataset info:")
print(data.info())

Dataset shape: (284807, 31)

First few rows:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -

In [4]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data.drop('Class', axis=1))

In [5]:
normal_data = data_scaled[data['Class'] == 0]
fraud_data = data_scaled[data['Class'] == 1]


In [6]:
print(normal_data.shape)
print(fraud_data.shape)

(284315, 30)
(492, 30)


In [7]:
X_train, X_test = train_test_split(normal_data, test_size=0.2, random_state=42)

In [8]:
print(X_train.shape)
print(X_test.shape)

(227452, 30)
(56863, 30)


In [9]:
# 4. Build Autoencoder
input_dim = X_train.shape[1]

encoder = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(4, activation='relu') # bottleneck (latent space)
])

decoder = models.Sequential([
    layers.Input(shape=(4,)),
    layers.Dense(8, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(input_dim, activation='sigmoid') # reconstruct input
])

autoencoder = models.Sequential([encoder, decoder])

In [10]:
# 5. Compile the model
autoencoder.compile(optimizer=optimizers.Adam(learning_rate=0.001),
                    loss='mse',
                    metrics=['mae'])
# Display the model architecture
autoencoder.summary()

In [11]:
# 6. Train the model (only on normal data)
history = autoencoder.fit(
    X_train, X_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, X_test),
    verbose=1
)

Epoch 1/5
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.9557 - mae: 0.6553 - val_loss: 0.8673 - val_mae: 0.6086
Epoch 2/5
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.8634 - mae: 0.6069 - val_loss: 0.8530 - val_mae: 0.6004
Epoch 3/5
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.8353 - mae: 0.5982 - val_loss: 0.8489 - val_mae: 0.5978
Epoch 4/5
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.8554 - mae: 0.5982 - val_loss: 0.8458 - val_mae: 0.5959
Epoch 5/5
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.8402 - mae: 0.5949 - val_loss: 0.8405 - val_mae: 0.5933


In [12]:
# 7. Detect anomalies
# Reconstruction error
reconstructions = autoencoder.predict(data_scaled)
mse = np.mean(np.power(data_scaled - reconstructions, 2), axis=1)

threshold = np.percentile(mse, 95)
print("Reconstruction error threshold:", threshold)

# Predict anomalies
predictions = (mse > threshold).astype(int)


[1m8901/8901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 649us/step
Reconstruction error threshold: 1.7488554572077133


In [13]:
# 8. Evaluate
from sklearn.metrics import confusion_matrix, classification_report

print("Confusion Matrix:")
print(confusion_matrix(data['Class'], predictions))

print("\nClassification Report:")
print(classification_report(data['Class'], predictions))

Confusion Matrix:
[[270505  13810]
 [    61    431]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.98    284315
           1       0.03      0.88      0.06       492

    accuracy                           0.95    284807
   macro avg       0.52      0.91      0.52    284807
weighted avg       1.00      0.95      0.97    284807



In [14]:
import numpy as np

# Pick a random index
random_idx = np.random.randint(0, len(data_scaled))
sample = data_scaled[random_idx].reshape(1, -1)  # reshape for the autoencoder

# Get reconstruction from the autoencoder
reconstructed = autoencoder.predict(sample)

# Compute reconstruction error (MSE)
mse_sample = np.mean(np.power(sample - reconstructed, 2))

# Check against threshold
if mse_sample > threshold:
    print(f"Sample index {random_idx} is predicted as FRAUD (MSE={mse_sample:.6f})")
else:
    print(f"Sample index {random_idx} is predicted as NORMAL (MSE={mse_sample:.6f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Sample index 31651 is predicted as NORMAL (MSE=0.364866)


In [18]:
import pandas as pd

# Get indices where class == 1
indices = data.index[data['Class'] == 1].tolist()
print(indices)


[541, 623, 4920, 6108, 6329, 6331, 6334, 6336, 6338, 6427, 6446, 6472, 6529, 6609, 6641, 6717, 6719, 6734, 6774, 6820, 6870, 6882, 6899, 6903, 6971, 8296, 8312, 8335, 8615, 8617, 8842, 8845, 8972, 9035, 9179, 9252, 9487, 9509, 10204, 10484, 10497, 10498, 10568, 10630, 10690, 10801, 10891, 10897, 11343, 11710, 11841, 11880, 12070, 12108, 12261, 12369, 14104, 14170, 14197, 14211, 14338, 15166, 15204, 15225, 15451, 15476, 15506, 15539, 15566, 15736, 15751, 15781, 15810, 16415, 16780, 16863, 17317, 17366, 17407, 17453, 17480, 18466, 18472, 18773, 18809, 20198, 23308, 23422, 26802, 27362, 27627, 27738, 27749, 29687, 30100, 30314, 30384, 30398, 30442, 30473, 30496, 31002, 33276, 39183, 40085, 40525, 41395, 41569, 41943, 42007, 42009, 42473, 42528, 42549, 42590, 42609, 42635, 42674, 42696, 42700, 42741, 42756, 42769, 42784, 42856, 42887, 42936, 42945, 42958, 43061, 43160, 43204, 43428, 43624, 43681, 43773, 44001, 44091, 44223, 44270, 44556, 45203, 45732, 46909, 46918, 46998, 47802, 48094, 502

In [19]:
import numpy as np


fraud_index = 541 


fraud_sample = data_scaled[fraud_index].reshape(1, -1)


fraud_recon = autoencoder.predict(fraud_sample)


fraud_mse = np.mean(np.power(fraud_sample - fraud_recon, 2))


if fraud_mse > threshold:
    print(f" Sample {fraud_index} detected as FRAUD (MSE={fraud_mse:.6f}, threshold={threshold:.6f})")
else:
    print(f" Sample {fraud_index} detected as NORMAL (MSE={fraud_mse:.6f}, threshold={threshold:.6f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
 Sample 541 detected as FRAUD (MSE=2.955570, threshold=1.748855)
