In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix


In [2]:
df = pd.read_csv('creditcard.csv')

# View first rows
df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [3]:
# Check basic information about the dataset
# This helps us understand number of rows, columns, and data types
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128821 entries, 0 to 128820
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    128821 non-null  int64  
 1   V1      128821 non-null  float64
 2   V2      128821 non-null  float64
 3   V3      128821 non-null  float64
 4   V4      128821 non-null  float64
 5   V5      128821 non-null  float64
 6   V6      128821 non-null  float64
 7   V7      128821 non-null  float64
 8   V8      128821 non-null  float64
 9   V9      128821 non-null  float64
 10  V10     128821 non-null  float64
 11  V11     128821 non-null  float64
 12  V12     128821 non-null  float64
 13  V13     128821 non-null  float64
 14  V14     128821 non-null  float64
 15  V15     128821 non-null  float64
 16  V16     128820 non-null  float64
 17  V17     128820 non-null  float64
 18  V18     128820 non-null  float64
 19  V19     128820 non-null  float64
 20  V20     128820 non-null  float64
 21  V21     12

In [4]:
# Check how many normal and fraud transactions are present
# Class = 0 → Normal transaction
# Class = 1 → Fraud transaction
df['Class'].value_counts()


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,128559
1.0,261


In [5]:
# X contains all the input features (transaction details)
X = df.drop('Class', axis=1)

# y contains the target column (0 = normal, 1 = fraud)
y = df['Class']


In [7]:
# Check if target column has any missing values
y.isnull().sum()


np.int64(1)

In [8]:
# Combine X and y temporarily to remove rows with missing target
df_clean = pd.concat([X, y], axis=1)

# Drop rows where Class value is missing
df_clean = df_clean.dropna(subset=['Class'])

# Separate features and target again
X = df_clean.drop('Class', axis=1)
y = df_clean['Class']


In [9]:
# Confirm there are no missing values now
y.isnull().sum()


np.int64(0)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Sometimes datasets may contain missing labels, which must be removed before model training



In [12]:
# Logistic Regression is used as a baseline model
# It helps us understand how a simple model performs on this data

from sklearn.linear_model import LogisticRegression

# Create the model
# max_iter is increased so the model can converge properly
lr_model = LogisticRegression(max_iter=1000)

# Train the model using training data
lr_model.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Predict labels for test data
y_pred_lr = lr_model.predict(X_test)


In [14]:
# Import evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix shows correct and incorrect predictions
print("Confusion Matrix - Logistic Regression")
print(confusion_matrix(y_test, y_pred_lr))


Confusion Matrix - Logistic Regression
[[25705     7]
 [   28    24]]


In [15]:
# Classification report shows precision, recall and F1-score
# Recall for fraud class (1) is more important than accuracy
print("Classification Report - Logistic Regression")
print(classification_report(y_test, y_pred_lr))


Classification Report - Logistic Regression
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     25712
         1.0       0.77      0.46      0.58        52

    accuracy                           1.00     25764
   macro avg       0.89      0.73      0.79     25764
weighted avg       1.00      1.00      1.00     25764



In [16]:
# Random Forest works better for complex and imbalanced datasets

from sklearn.ensemble import RandomForestClassifier

# Create the model
# class_weight='balanced' helps handle class imbalance
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'
)

# Train the model
rf_model.fit(X_train, y_train)


In [17]:
# Predict labels using Random Forest
y_pred_rf = rf_model.predict(X_test)


In [18]:
# Confusion matrix for Random Forest
print("Confusion Matrix - Random Forest")
print(confusion_matrix(y_test, y_pred_rf))


Confusion Matrix - Random Forest
[[25710     2]
 [   10    42]]


In [19]:
# Classification report for Random Forest
print("Classification Report - Random Forest")
print(classification_report(y_test, y_pred_rf))


Classification Report - Random Forest
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     25712
         1.0       0.95      0.81      0.88        52

    accuracy                           1.00     25764
   macro avg       0.98      0.90      0.94     25764
weighted avg       1.00      1.00      1.00     25764



In [20]:
print("Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr))

print("\nRandom Forest Performance:")
print(classification_report(y_test, y_pred_rf))


Logistic Regression Performance:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     25712
         1.0       0.77      0.46      0.58        52

    accuracy                           1.00     25764
   macro avg       0.89      0.73      0.79     25764
weighted avg       1.00      1.00      1.00     25764


Random Forest Performance:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     25712
         1.0       0.95      0.81      0.88        52

    accuracy                           1.00     25764
   macro avg       0.98      0.90      0.94     25764
weighted avg       1.00      1.00      1.00     25764



In [21]:
# Select one transaction from test data
sample_transaction = X_test.iloc[0]


In [22]:
# Predict fraud or normal for this transaction
prediction = rf_model.predict([sample_transaction])




In [23]:
# Convert prediction to readable output
if prediction[0] == 1:
    print("⚠️ Fraudulent Transaction Detected")
else:
    print("✅ Normal Transaction")


✅ Normal Transaction


In [24]:
# Test model on first 10 transactions
test_predictions = rf_model.predict(X_test[:10])

# Show predictions
for i, pred in enumerate(test_predictions):
    result = "Fraud" if pred == 1 else "Normal"
    print(f"Transaction {i+1}: {result}")


Transaction 1: Normal
Transaction 2: Normal
Transaction 3: Normal
Transaction 4: Normal
Transaction 5: Normal
Transaction 6: Normal
Transaction 7: Normal
Transaction 8: Normal
Transaction 9: Normal
Transaction 10: Normal


In [25]:
# Get probability of fraud
fraud_probability = rf_model.predict_proba([sample_transaction])

print("Probability of Normal Transaction:", fraud_probability[0][0])
print("Probability of Fraud Transaction:", fraud_probability[0][1])


Probability of Normal Transaction: 1.0
Probability of Fraud Transaction: 0.0


