# Library Installation

In [16]:
pip install imbalanced-learn


Note: you may need to restart the kernel to use updated packages.


# Data Loading and Exploration

In [17]:
import pandas as pd

# Path of the dataset CSV, resolved relative to the current working directory.
data_path = 'creditcard.csv'

# Read the entire file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows; the notebook renders the cell's last expression.
df.head(n=5)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Data Preprocessing

In [18]:
# Count missing values in every column.
print(df.isnull().sum())

# Summary statistics for the columns. A bare expression is only rendered when
# it is the last line of a cell, so it is printed explicitly here; otherwise
# the result would be computed and silently discarded.
print(df.describe())

# Number of rows per target label, to show how the classes are balanced.
print(df['Class'].value_counts())


Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Class
0    284315
1       492
Name: count, dtype: int64


# Data Splitting

In [19]:
from sklearn.model_selection import train_test_split

# Target is the 'Class' column; every remaining column is used as a feature.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# Feature Scaling

In [20]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, so that no
# information from the test split influences the transformation.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Training the Models

In [21]:
# Import the estimators in this cell so it runs on a fresh kernel; without
# these imports the constructors below raise NameError.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Logistic regression with max_iter raised to 2000 (scikit-learn's default is
# 100) to give the solver more iterations to converge.
lr = LogisticRegression(max_iter=2000)
# Random forest of 100 trees; the fixed seed makes the fit repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train both models on the scaled training data.
lr.fit(X_train_scaled, y_train)
rf.fit(X_train_scaled, y_train)


# Evaluating the Models

In [22]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out (scaled) test set with each fitted model.
y_pred_lr = lr.predict(X_test_scaled)
y_pred_rf = rf.predict(X_test_scaled)

# Per-class precision / recall / F1 summaries.
lr_report = classification_report(y_test, y_pred_lr)
rf_report = classification_report(y_test, y_pred_rf)
print("Logistic Regression Classification Report:\n", lr_report)
print("Random Forest Classification Report:\n", rf_report)

# Confusion matrices (rows are true labels, columns are predicted labels).
lr_matrix = confusion_matrix(y_test, y_pred_lr)
rf_matrix = confusion_matrix(y_test, y_pred_rf)
print("Logistic Regression Confusion Matrix:\n", lr_matrix)
print("Random Forest Confusion Matrix:\n", rf_matrix)


Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.83      0.63      0.72        98

    accuracy                           1.00     56962
   macro avg       0.91      0.82      0.86     56962
weighted avg       1.00      1.00      1.00     56962

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.94      0.82      0.87        98

    accuracy                           1.00     56962
   macro avg       0.97      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962

Logistic Regression Confusion Matrix:
 [[56851    13]
 [   36    62]]
Random Forest Confusion Matrix:
 [[56859     5]
 [   18    80]]


# Handling Class Imbalance

In [23]:
from imblearn.over_sampling import SMOTE

# Resample the (scaled) training split only; the test split is never resampled,
# so the evaluation below still runs on the original test data.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# NOTE: this refits the existing `lr` and `rf` objects in place. After this cell
# runs they hold the models trained on the resampled data, not the models
# fitted on the un-resampled training split.
lr.fit(X_train_smote, y_train_smote)
rf.fit(X_train_smote, y_train_smote)

# Predictions on the original scaled test set
y_pred_lr_smote = lr.predict(X_test_scaled)
y_pred_rf_smote = rf.predict(X_test_scaled)

# Evaluation: per-class reports first, then confusion matrices.
print("Logistic Regression with SMOTE Classification Report:\n", classification_report(y_test, y_pred_lr_smote))
print("Random Forest with SMOTE Classification Report:\n", classification_report(y_test, y_pred_rf_smote))

print("Logistic Regression with SMOTE Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr_smote))
# Label spelling fixed ("SMote" -> "SMOTE") to match the other three headings.
print("Random Forest with SMOTE Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_smote))


Logistic Regression with SMOTE Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962

Random Forest with SMOTE Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.87      0.83      0.85        98

    accuracy                           1.00     56962
   macro avg       0.94      0.91      0.92     56962
weighted avg       1.00      1.00      1.00     56962

Logistic Regression with SMOTE Confusion Matrix:
 [[55397  1467]
 [    8    90]]
Random Forest with SMote Confusion Matrix:
 [[56852    12]
 [   17    81]]
