In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_score
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

In [11]:
# 1. Load the dataset from creditcard.csv (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="creditcard.csv")

In [12]:
# 2. Show the available columns and fail fast when the target is missing
print("Columns in dataset:", list(df.columns))

# Later cells read the "Class" column, so stop here if it is absent.
if "Class" not in df.columns:
    raise ValueError("The dataset does not have a 'Class' column. Check your CSV.")


Columns in dataset: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']


In [13]:
# 3. Discard rows whose target value is missing
nan_target_count = df["Class"].isna().sum()
if nan_target_count:
    print(f"Found {nan_target_count} NaN values in target column. Dropping them.")
    # Keep only the rows that carry a label.
    df = df[df["Class"].notna()]


Found 1 NaN values in target column. Dropping them.


In [14]:
# 4. Impute missing feature values with per-column medians.
# Only feature columns are considered: the target ("Class") is neither used to
# compute medians nor filled here.
feature_cols = df.columns.drop("Class")
if df[feature_cols].isnull().values.any():
    print("Found NaNs in features. Filling them with median values.")
    # numeric_only=True keeps this working if a non-numeric column is present
    # (a plain df.median() raises TypeError on such frames in pandas >= 2.0).
    feature_medians = df[feature_cols].median(numeric_only=True)
    # Filling with a Series only touches the columns named in its index,
    # so "Class" and any non-numeric column are left as they are.
    # NOTE(review): the medians are computed on the full dataset before the
    # train/test split, so test rows influence the imputed values. For a strict
    # evaluation, compute the medians on the training split only.
    df = df.fillna(feature_medians)

In [15]:

# Separate the target label from the predictor columns
y = df["Class"]
X = df.drop(columns=["Class"])


In [16]:

# Hold out 20% of the rows as a test set; the split is stratified on y and seeded
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [17]:
# Oversample with SMOTE on the training split only, so the test split is never resampled
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Report the per-class row counts after resampling.
print("\nAfter SMOTE:", y_train_res.value_counts(), sep="\n")


After SMOTE:
Class
0.0    134500
1.0    134500
Name: count, dtype: int64


In [18]:
# 5. LightGBM model
# Gradient-boosted decision trees ("gbdt") with a binary objective and
# log loss as the metric.
lgbm = lgb.LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    metric="binary_logloss",
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=200,
    # A weight of 1 applies no extra weighting to the positive class; the
    # training data is rebalanced with SMOTE before fitting instead.
    scale_pos_weight=1,
    # Same seed as the train/test split and SMOTE, so any randomized
    # component of training is pinned as well.
    random_state=42,
)

In [19]:
# 6. Fit the classifier on the SMOTE-resampled training set
lgbm.fit(X=X_train_res, y=y_train_res)

[LightGBM] [Info] Number of positive: 134500, number of negative: 134500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094679 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 269000, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [20]:
# 7. Predict class labels for the held-out test split
y_pred = lgbm.predict(X=X_test)

In [21]:
# 8. Evaluation metrics
# Each heading and its result are printed on separate lines.
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, digits=4),
    sep="\n",
)

# Precision is also reported on its own line.
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.4f}")



Confusion Matrix:
[[33615    10]
 [   14    58]]

Classification Report:
              precision    recall  f1-score   support

         0.0     0.9996    0.9997    0.9996     33625
         1.0     0.8529    0.8056    0.8286        72

    accuracy                         0.9993     33697
   macro avg     0.9263    0.9026    0.9141     33697
weighted avg     0.9993    0.9993    0.9993     33697

Precision: 0.8529


In [22]:
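# Note: Precision = TP / (TP + FP), so high precision means fewer false positives, which suits cases where false alarms are costly.
# Missed positives (false negatives) are reflected in recall instead, so read both values in the classification report above.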