In [1]:
# Ignore warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')# Import necessary libraries
import numpy as np # For numerical operations
import pandas as pd # For data manipulation and analysis
import matplotlib.pyplot as plt # For data visualization
import seaborn as sns # For enhanced data visualization
from scipy.stats import zscore # For z-score calculation
from sklearn.model_selection import train_test_split # For splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler # For standardizing numerical features
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA # For dimensionality reduction using LDA
from sklearn.svm import SVC # For Support Vector Machine classification
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score # For model evaluation
from imblearn.over_sampling import SMOTE # For handling class imbalance

In [2]:
df = pd.read_csv("creditcard.csv")

# data preprosssecing 

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(pd.DataFrame(df['Amount']))

In [5]:
df = df.drop(['Time'], axis=1)

In [6]:
if df.duplicated().any():    
    # Remove duplicates
    df = df.drop_duplicates()
else:
    print("No Duplicates Found.")

In [7]:
df['Class'].value_counts()

Class
0    275190
1       473
Name: count, dtype: int64

this type of data could make overfiting

In [8]:
df.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,275663.0,275663.0,275663.0,275663.0,275663.0,275663.0,275663.0,275663.0,275663.0,275663.0,...,275663.0,275663.0,275663.0,275663.0,275663.0,275663.0,275663.0,275663.0,275663.0,275663.0
mean,-0.03746,-0.00243,0.02552,-0.004359,-0.01066,-0.014206,0.008586,-0.005698,-0.012363,0.003114,...,0.002579,0.005827,-0.001941,-0.006868,-0.004812,-0.00024,0.001921,0.000904,0.008911,0.001716
std,1.952522,1.66726,1.507538,1.424323,1.378117,1.313213,1.240348,1.191596,1.100108,1.087025,...,0.733089,0.726378,0.631451,0.60555,0.524175,0.484139,0.401271,0.332649,1.012371,0.041388
min,-56.40751,-72.715728,-48.325589,-5.683171,-113.743307,-26.160506,-43.557242,-73.216718,-13.434066,-24.588262,...,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-15.430084,-0.353229,0.0
25%,-0.941105,-0.61404,-0.843168,-0.862847,-0.700192,-0.765861,-0.552047,-0.209618,-0.659904,-0.538968,...,-0.225021,-0.532173,-0.16544,-0.361062,-0.323597,-0.32829,-0.071729,-0.052654,-0.328041,0.0
50%,-0.059659,0.070249,0.200736,-0.035098,-0.060556,-0.270931,0.044848,0.02298,-0.064724,-0.091752,...,-0.025637,0.013397,-0.013655,0.037569,0.009909,-0.056667,0.002615,0.011788,-0.258315,0.0
75%,1.294471,0.819067,1.048461,0.753943,0.604521,0.387704,0.583885,0.322319,0.593098,0.470702,...,0.189118,0.534272,0.145482,0.432931,0.347151,0.244196,0.09473,0.081355,-0.033742,0.0
max,2.45493,22.057729,9.382558,16.875344,34.801666,73.301626,120.589494,20.007208,15.594995,23.745136,...,27.202839,10.50309,22.528412,4.584549,7.519589,3.517346,31.612198,33.847808,102.362243,1.0


# Dimensionality Reduction

## Step 1: Separate features and target variable¶


In [9]:
X = df.drop('Class', axis=1)  # Features
y = df['Class']  # Target variable

## Step 2: Split Data into Training and Testing Sets

In [10]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Step 3: Standardize featuresensures that all features have the same scale.

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Step 4:Address Class Imbalance with SMOTE
agenerating  samples using SMOTE applyed SMOTE for the lake and umbalanced data 

In [12]:
# Apply SMOTE to address class imbalance
print("\n Imbalance with SMOTE:")
smote = SMOTE(random_state=0)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

# Display information before and after SMOTE
print("Before SMOTE: ", X_train.shape, y_train.shape)
print("After SMOTE: ", X_train_over.shape, y_train_over.shape)
print("After SMOTE (0,1): ", pd.Series(y_train_over).value_counts())


 Imbalance with SMOTE:
Before SMOTE:  (192964, 29) (192964,)
After SMOTE:  (385254, 29) (385254,)
After SMOTE (0,1):  Class
0    192627
1    192627
Name: count, dtype: int64


In [None]:

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.countplot(y_train, palette='Set1')
plt.title(' Distribution Before SMOTE')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()


In [None]:

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 2)
sns.countplot(y_train_over, palette='Set2')
plt.title(' Distribution After SMOTE')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()


# apply LDA

In [13]:
# Perform LDA for dimensionality reduction
lda = LDA(n_components=1)
X_train_lda = lda.fit_transform(X_train_over, y_train_over)
X_test_lda = lda.transform(X_test)

In [15]:
# Train SVM Model with Hyperparameter Tuning
svm_model = SVC(kernel='rbf', C=1, gamma='scale')
svm_model.fit(X_train_over, y_train_over)

In [16]:
y_pred = svm_model.predict(X_test)


In [17]:
# Evaluate Model Performance
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[81320  1243]
 [   18   118]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     82563
           1       0.09      0.87      0.16       136

    accuracy                           0.98     82699
   macro avg       0.54      0.93      0.57     82699
weighted avg       1.00      0.98      0.99     82699



In [18]:
# Calculate ROC AUC Score
roc_auc = roc_auc_score(y_test, y_pred)
print("\nROC AUC Score:", roc_auc)


ROC AUC Score: 0.9262959444160644
