In [1]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# 2. Load the Dataset
df = pd.read_csv("C://Users//yamba//OneDrive//Desktop//project//data//creditcard.csv")

In [3]:
# 3. Feature Selection
# Dropping the 'Time' column as it's not relevant for the model (unless you want to try using it)
df = df.drop(['Time'], axis=1)

In [4]:
# 4. Split into Features and Target Variable
X = df.drop(['Class'], axis=1)  # All features except 'Class'
y = df['Class']  # Target variable

In [5]:
# 5. Handle Class Imbalance using SMOTE or Random Undersampling
# Option 1: SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [6]:
print("\nClass distribution after resampling:")
print(pd.Series(y_resampled).value_counts())


Class distribution after resampling:
Class
0    284315
1    284315
Name: count, dtype: int64


In [7]:
# 6. Feature Scaling (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

In [8]:
# 7. Train-Test Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)

In [9]:
print("\nTrain and Test Data Shapes:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


Train and Test Data Shapes:
X_train shape: (454904, 29)
X_test shape: (113726, 29)
y_train shape: (454904,)
y_test shape: (113726,)


In [10]:
# Save processed data (Optional)
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)