# 1. IMPORTING LIBRARIES & SETUP

In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 2. DATA LOADING & EXPLORATORY ANALYSIS (EDA)

In [34]:
# Load the dataset
data = pd.read_csv('/content/creditcard.csv')

In [35]:
# Basic Info
print("--- Dataset Information ---")
print(f"Dataset Shape: {data.shape}")
print(data.info())

--- Dataset Information ---
Dataset Shape: (13954, 31)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13954 entries, 0 to 13953
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    13954 non-null  int64  
 1   V1      13954 non-null  float64
 2   V2      13954 non-null  float64
 3   V3      13954 non-null  float64
 4   V4      13954 non-null  float64
 5   V5      13954 non-null  float64
 6   V6      13954 non-null  float64
 7   V7      13954 non-null  float64
 8   V8      13954 non-null  float64
 9   V9      13954 non-null  float64
 10  V10     13954 non-null  float64
 11  V11     13954 non-null  float64
 12  V12     13954 non-null  float64
 13  V13     13954 non-null  float64
 14  V14     13954 non-null  float64
 15  V15     13954 non-null  float64
 16  V16     13954 non-null  float64
 17  V17     13954 non-null  float64
 18  V18     13954 non-null  float64
 19  V19     13954 non-null  float64
 20  V20     13954 non

In [36]:
# Checking Imbalance
print("\n--- Class Distribution (Original) ---")
print(data['Class'].value_counts())


--- Class Distribution (Original) ---
Class
0.0    13897
1.0       56
Name: count, dtype: int64


In [37]:
#Statistical Insight: Compare average values for Legit vs Fraud
# This proves to us that fraud transactions behave differently!
print("\n--- Statistical Comparison (Mean Values) ---")
print(data.groupby('Class').mean()[['Amount', 'V1', 'V2', 'V3']])


--- Statistical Comparison (Mean Values) ---
          Amount        V1        V2        V3
Class                                         
0.0    63.773909 -0.217093  0.266252  0.887227
1.0    90.815893 -4.727948  4.660436 -9.328536


# 3. PREPROCESSING (UNDERSAMPLING)

In [38]:
# Separate data for sampling
legit = data[data.Class == 0]
fraud = data[data.Class == 1]

print(f"\nLegit Transactions count: {legit.shape[0]}")
print(f"Fraud Transactions count: {fraud.shape[0]}")

# Create a sample of legit transactions matching the fraud count (56)
legit_sample = legit.sample(n=56, random_state=2)

# Concatenate to create the Balanced Dataset
new_dataset = pd.concat([legit_sample, fraud], axis=0)

print("\n--- New Balanced Dataset Distribution ---")
print(new_dataset['Class'].value_counts())


Legit Transactions count: 13897
Fraud Transactions count: 56

--- New Balanced Dataset Distribution ---
Class
0.0    56
1.0    56
Name: count, dtype: int64


# 4. SPLITTING DATA (TRAIN / TEST)

In [39]:
# X = Features (all columns except Class)
# Y = Target (Class column)
X = new_dataset.drop(columns="Class", axis=1)
Y = new_dataset["Class"]

# Split data: 80% Training, 20% Testing
# stratify=Y ensures both Train and Test have equal ratio of Fraud/Legit
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# 5. MODEL TRAINING

In [43]:
# max_iter=3000 ensures the solver has enough time to converge
model = LogisticRegression(max_iter=3000)
model.fit(X_train, Y_train)

# 6. EVALUATION

In [41]:
# Evaluate on Training Data
train_prediction = model.predict(X_train)
train_acc = accuracy_score(train_prediction, Y_train)

# Evaluate on Test Data
test_prediction = model.predict(X_test)
test_acc = accuracy_score(test_prediction, Y_test)

print("\n--- Model Performance ---")
print(f"Training Accuracy: {train_acc * 100:.2f}%")
print(f"Testing Accuracy : {test_acc * 100:.2f}%")


--- Model Performance ---
Training Accuracy: 98.88%
Testing Accuracy : 100.00%


# 7. PREDICTION SYSTEM (DEPLOYMENT READY)

In [42]:
def predict_fraud(input_data):
    """
    Takes raw transaction data and returns prediction.
    """
    # Convert to numpy array
    input_data_as_numpy_array = np.asarray(input_data)

    # Reshape because model expects a batch (1 instance, many features)
    input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

    prediction = model.predict(input_data_reshaped)

    if prediction[0] == 0:
        return "âœ… Legit Transaction"
    else:
        return "ðŸš¨ FRAUD DETECTED!"

# --- Test the system with a real example from our data ---
print("\n--- Testing Prediction System ---")

# Let's pick a random fraud case from the original data to test
sample_fraud = data[data['Class'] == 1].sample(1).drop(columns='Class').values[0]

print(f"Testing with known Fraud case...")
print(f"Result: {predict_fraud(sample_fraud)}")


--- Testing Prediction System ---
Testing with known Fraud case...
Result: ðŸš¨ FRAUD DETECTED!


