In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


In [2]:
# Load the raw transactions table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="creditcard.csv")

In [3]:
# Per-column count of null entries, used as a quick completeness check.
missing = df.isna().sum()
print("Missing values per column:\n", missing)

Missing values per column:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [5]:
# Standardise the two raw-valued columns (zero mean, unit variance).
# A dedicated scaler is fitted per column: re-fitting one shared instance
# would overwrite the Amount statistics with the Time statistics, leaving no
# fitted transformer that can scale Amount for new data.
# NOTE(review): both scalers are fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling -- consider fitting on the training rows only.
amount_scaler = StandardScaler()
time_scaler = StandardScaler()
df['scaled_amount'] = amount_scaler.fit_transform(df[['Amount']])
df['scaled_time'] = time_scaler.fit_transform(df[['Time']])
# Backward-compatible alias: `scaler` previously ended up fitted on Time.
scaler = time_scaler

In [7]:
# Drop the raw columns now that scaled copies exist. Rebinding `df` (instead
# of inplace=True) together with errors='ignore' keeps this cell idempotent:
# re-running it after the columns are gone is a no-op rather than a KeyError.
df = df.drop(columns=['Amount', 'Time'], errors='ignore')

In [8]:
# Reorder columns: scaled features first, the remaining features in their
# existing order, and the Class label last.
leading_columns = ['scaled_time', 'scaled_amount']
reserved_columns = {'scaled_time', 'scaled_amount', 'Class'}
remaining_columns = [name for name in df.columns if name not in reserved_columns]
scaled_columns = leading_columns + remaining_columns + ['Class']
df = df[scaled_columns]

In [9]:
# Separate the label column (y) from the feature matrix (X).
y = df['Class']
X = df.drop(columns='Class')


In [10]:

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# distribution the same in both partitions; the fixed seed makes the split
# reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [11]:

# Report the positive-class ratio and the shape of each partition.
split_summary = (
    ("Training set fraud ratio:", y_train.mean()),
    ("Test set fraud ratio:", y_test.mean()),
    ("X_train shape:", X_train.shape),
    ("X_test shape:", X_test.shape),
)
for label, value in split_summary:
    print(label, value)

Training set fraud ratio: 0.001729245759178389
Test set fraud ratio: 0.0017204452090867595
X_train shape: (227845, 30)
X_test shape: (56962, 30)


In [14]:
# Oversample the training partition only; the test partition is not passed
# to SMOTE and keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Show the class counts before and after resampling.
for label, labels in (
    ("✅ Original class distribution:", y_train),
    ("✅ Resampled class distribution:", y_train_resampled),
):
    print(label, labels.value_counts().to_dict())


✅ Original class distribution: {0: 227451, 1: 394}
✅ Resampled class distribution: {0: 227451, 1: 227451}


In [15]:
# Fit a random forest classifier on the resampled training data.
# The fixed random_state keeps the fitted forest reproducible across runs.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**rf_params)
model.fit(X_train_resampled, y_train_resampled)


In [16]:
# Score the held-out test set: keep the second predict_proba column as a
# continuous score (used for ROC AUC) and the hard class labels.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

In [17]:
# Per-class precision / recall / F1 on the test set, to four decimals.
report_text = classification_report(y_test, y_pred, digits=4)
print("\n📊 Classification Report:", report_text, sep="\n")


📊 Classification Report:
              precision    recall  f1-score   support

           0     0.9997    0.9998    0.9997     56864
           1     0.8617    0.8265    0.8438        98

    accuracy                         0.9995     56962
   macro avg     0.9307    0.9132    0.9217     56962
weighted avg     0.9995    0.9995    0.9995     56962



In [18]:
# Confusion matrix on the test set (rows: true class, columns: predicted).
test_confusion = confusion_matrix(y_test, y_pred)
print("📉 Confusion Matrix:", test_confusion, sep="\n")

📉 Confusion Matrix:
[[56851    13]
 [   17    81]]


In [19]:

# Area under the ROC curve, computed from the predicted scores rather than
# the hard labels.
test_auc = roc_auc_score(y_test, y_prob)
print("🔥 ROC AUC Score:", test_auc)

🔥 ROC AUC Score: 0.9634900457087731


In [20]:
import joblib

# Persist the fitted classifier to disk.
# NOTE(review): only the classifier is written; the scaling applied to
# Amount/Time earlier in the notebook is not saved, so anything that loads
# this file must reproduce that preprocessing itself.
model_path = "credit_card_fraud_model.joblib"
joblib.dump(model, model_path)
print("✅ Model saved.")


✅ Model saved.
