In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:

df = pd.read_csv('credit_card_fraud_dataset.csvdd')


In [10]:

df = df.drop('TransactionID', axis=1) 
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])
df['Hour'] = df['TransactionDate'].dt.hour
df['DayOfWeek'] = df['TransactionDate'].dt.dayofweek
df = df.drop('TransactionDate', axis=1)

In [11]:

cat_features = ['MerchantID', 'TransactionType', 'Location']
num_features = ['Amount', 'Hour', 'DayOfWeek']

In [12]:

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

In [13]:

X = df.drop('IsFraud', axis=1)
y = df['IsFraud'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [14]:

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [15]:

baseline_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
baseline_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
baseline_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<keras.src.callbacks.history.History at 0x28728b6f980>

In [16]:

y_pred_baseline = (baseline_model.predict(X_test) > 0.5).astype(int)
baseline_report = classification_report(y_test, y_pred_baseline, output_dict=True)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step


In [17]:

rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [18]:

rus_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
rus_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rus_model.fit(X_train_rus, y_train_rus, epochs=10, batch_size=32, verbose=0)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<keras.src.callbacks.history.History at 0x28734f3fc50>

In [19]:

y_pred_rus = (rus_model.predict(X_test) > 0.5).astype(int)
rus_report = classification_report(y_test, y_pred_rus, output_dict=True)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step


In [20]:

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [21]:

smote_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
smote_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
smote_model.fit(X_train_smote, y_train_smote, epochs=10, batch_size=32, verbose=0)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<keras.src.callbacks.history.History at 0x28735d45af0>

In [22]:

y_pred_smote = (smote_model.predict(X_test) > 0.5).astype(int)
smote_report = classification_report(y_test, y_pred_smote, output_dict=True)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [23]:

results = pd.DataFrame({
    'Model': ['Baseline (Imbalanced)', 'Undersampling (RUS)', 'SMOTE (Oversampling)'],
    'Precision (Fraud)': [
        baseline_report['1']['precision'],
        rus_report['1']['precision'],
        smote_report['1']['precision']
    ],
    'Recall (Fraud)': [
        baseline_report['1']['recall'],
        rus_report['1']['recall'],
        smote_report['1']['recall']
    ],
    'F1-Score (Fraud)': [
        baseline_report['1']['f1-score'],
        rus_report['1']['f1-score'],
        smote_report['1']['f1-score']
    ]
})

print(results.round(3))

print("\n🔍 **Analysis:**")
print("- **Baseline (Imbalanced):** Typically has high precision but very low recall (misses fraud cases).")
print("- **Undersampling (RUS):** Improves recall but may lose important majority-class patterns.")
print("- **SMOTE (Oversampling):** Best balance—improves recall while maintaining good precision.")
print("For fraud detection, **SMOTE is usually preferred** since catching fraud (high recall) is critical.")

                   Model  Precision (Fraud)  Recall (Fraud)  F1-Score (Fraud)
0  Baseline (Imbalanced)              0.018           0.005             0.008
1    Undersampling (RUS)              0.010           0.500             0.020
2   SMOTE (Oversampling)              0.000           0.000             0.000

🔍 **Analysis:**
- **Baseline (Imbalanced):** Typically has high precision but very low recall (misses fraud cases).
- **Undersampling (RUS):** Improves recall but may lose important majority-class patterns.
- **SMOTE (Oversampling):** Best balance—improves recall while maintaining good precision.
For fraud detection, **SMOTE is usually preferred** since catching fraud (high recall) is critical.
