In [2]:
import os
import zipfile

import pandas as pd
from google.colab import files

# Upload the zip archive through Colab's file picker.
# The returned mapping is keyed by the name each file was saved under.
uploaded = files.upload()

# Fail early with a clear message instead of a NameError further down.
if not uploaded:
    raise ValueError("No file was uploaded - please select the dataset .zip file.")

# Report every uploaded file; the last one is the archive that gets unpacked.
for fn in uploaded.keys():
    filename = fn
    print("Uploaded file:", filename)

# Unzip, taking the CSV name from the archive's own member list.
# Scanning the working directory instead could silently pick up a stale CSV
# left over from an earlier upload, and would miss a CSV stored in a sub-folder.
with zipfile.ZipFile(filename, 'r') as zip_ref:
    csv_members = [
        member.filename
        for member in zip_ref.infolist()
        if not member.is_dir()
        and member.filename.lower().endswith(".csv")
        # Skip the metadata entries macOS adds to archives it creates.
        and not member.filename.startswith("__MACOSX/")
    ]
    if not csv_members:
        raise FileNotFoundError(f"No .csv file found inside {filename!r}")
    zip_ref.extractall(".")

# The archive was extracted into ".", so a member name is also a valid relative path.
csv_file = csv_members[0]
print("Extracted CSV:", csv_file)

# Load dataset
data = pd.read_csv(csv_file)

print("✅ Dataset loaded successfully")
print("Shape of dataset:", data.shape)
print(data.head())


Saving creditcard.csv.zip to creditcard.csv (1).zip
Uploaded file: creditcard.csv (1).zip
Extracted CSV: creditcard.csv
✅ Dataset loaded successfully
Shape of dataset: (284807, 31)
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 

In [5]:
# Step 2: Data Exploration

# 1. Dataset basic info
print("----- INFO -----")
# info() writes its summary (row/column counts, each column's dtype and
# non-null count) straight to stdout and returns None, so it is called bare:
# wrapping it in print() would append a stray "None" line to the output.
data.info()

# 2. Numeric columns summary
print("\n----- DESCRIBE -----")
# Statistical summary of the numeric columns: count, mean, std, min, 25%, 50%, 75%, max.
print(data.describe())

# 3. First 5 rows (sample data)
print("\n----- HEAD -----")
# Shows the first 5 rows so the column names and a few sample transactions
# can be checked by eye (a quick sanity check that the load worked).
print(data.head())

# 4. Fraud vs Non-Fraud count
print("\n----- CLASS DISTRIBUTION -----")
# Counts of 0 and 1 in the target column 'Class', i.e. how many normal and
# how many fraud transactions the dataset contains.
print(data["Class"].value_counts())

----- INFO -----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float6

In [7]:
# ---------------- Step 3: Data Preprocessing ----------------

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Make a copy of the dataset so the original one stays safe
df = data.copy()

# Rows are selected by index label below, so the labels must be unique
# (true for the default RangeIndex that read_csv produces).
assert df.index.is_unique, "row selection by label requires a unique index"

# Target: only the 'Class' column (0 = normal, 1 = fraud)
y = df['Class']

# Decide which rows are training (80%) and which are testing (20%) BEFORE
# computing any scaling statistics, so nothing about the test rows leaks
# into the features the model is trained on.
# stratify=y keeps the fraud vs non-fraud ratio the same in both sets.
y_train, y_test = train_test_split(
    y, test_size=0.20, random_state=42, stratify=y
)

# 'Amount' and 'Time' are not scaled like other features, so normalize them
# with StandardScaler - fitted on the training rows only.
scaler_amount = StandardScaler().fit(df.loc[y_train.index, ['Amount']])
scaler_time = StandardScaler().fit(df.loc[y_train.index, ['Time']])

# Apply the train-fitted scalers to every row and store the results as new columns
df['scaled_amount'] = scaler_amount.transform(df[['Amount']])
df['scaled_time'] = scaler_time.transform(df[['Time']])

# Drop original 'Amount' and 'Time' because we now have the scaled versions
# (reassignment rather than inplace=True keeps the step explicit)
df = df.drop(columns=['Amount', 'Time'])

# Features: all columns except 'Class'
X = df.drop(columns='Class')

# Materialize the feature split using the row membership chosen above,
# so X_train/X_test line up row-for-row with y_train/y_test.
X_train = X.loc[y_train.index]
X_test = X.loc[y_test.index]

# Quick check: print shapes of training and test sets
print("Shapes -> X_train:", X_train.shape, ", X_test:", X_test.shape)
print("          y_train:", y_train.shape, ", y_test:", y_test.shape)

# Check class distribution in training set
print("\nTrain class counts:\n", y_train.value_counts())

# Check class distribution in test set
print("\nTest class counts:\n", y_test.value_counts())

# Also check percentages to confirm class ratio is same
print("\nTrain class percentages:\n", (y_train.value_counts(normalize=True)*100).round(4))
print("\nTest class percentages:\n", (y_test.value_counts(normalize=True)*100).round(4))


Shapes -> X_train: (227845, 30) , X_test: (56962, 30)
          y_train: (227845,) , y_test: (56962,)

Train class counts:
 Class
0    227451
1       394
Name: count, dtype: int64

Test class counts:
 Class
0    56864
1       98
Name: count, dtype: int64

Train class percentages:
 Class
0    99.8271
1     0.1729
Name: proportion, dtype: float64

Test class percentages:
 Class
0    99.828
1     0.172
Name: proportion, dtype: float64


In [8]:
# Step 4: Fit a baseline classifier (Logistic Regression) and score it on the held-out test set

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Build and train the model in one expression (fit() returns the estimator itself).
# - class_weight="balanced" compensates for how rare the fraud class (1) is
# - max_iter=1000 gives the solver enough iterations to converge
# - random_state=42 keeps the run reproducible
log_reg = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the test features
y_pred = log_reg.predict(X_test)

# Evaluation: each heading is followed by its result on the next line
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, digits=4),
    sep="\n",
)


Confusion Matrix:
[[55475  1389]
 [    8    90]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9999    0.9756    0.9876     56864
           1     0.0609    0.9184    0.1141        98

    accuracy                         0.9755     56962
   macro avg     0.5304    0.9470    0.5509     56962
weighted avg     0.9982    0.9755    0.9861     56962

