In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append("..")

 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
 
# Import project helpers
from src.data_loading import load_transactions, load_products
from src.preprocessing_transactions import get_transactions_dataset
from src.preprocessing_products import get_products_dataset
from src.evaluation import evaluate_classifier

In [10]:
# Load the transactions data via the project helper.
df_transactions = load_transactions()
# Print the column schema (column names, non-null counts, dtypes) to stdout.
df_transactions.info()
# Last expression of the cell, so the summary-statistics table of the numeric
# columns is rendered as the cell's output.
df_transactions.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   transaction_id           3000 non-null   object 
 1   customer_id              3000 non-null   object 
 2   transaction_date         3000 non-null   object 
 3   customer_age             3000 non-null   int64  
 4   customer_location        3000 non-null   object 
 5   quantity                 3000 non-null   int64  
 6   unit_price               3000 non-null   float64
 7   total_amount             3000 non-null   float64
 8   payment_method           3000 non-null   object 
 9   shipping_speed           3000 non-null   object 
 10  customer_history_orders  3000 non-null   int64  
 11  discount_applied         3000 non-null   bool   
 12  discount_percentage      3000 non-null   float64
 13  shipping_cost            3000 non-null   float64
 14  delivery_time_days      

Unnamed: 0,customer_age,quantity,unit_price,total_amount,customer_history_orders,discount_percentage,shipping_cost,delivery_time_days
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,48.665667,4.372333,133.781473,431.415573,19.355667,8.5731,14.87478,15.046333
std,17.899889,4.391901,85.403283,331.140587,15.652247,14.689838,5.658677,8.320813
min,18.0,1.0,5.2,6.28,0.0,0.0,5.0,1.0
25%,33.0,2.0,60.145,171.685,4.0,0.0,9.95,8.0
50%,48.0,3.0,112.465,322.585,18.0,0.0,14.78,15.0
75%,64.0,4.0,210.3725,623.16,33.0,13.7,19.765,22.0
max,79.0,19.0,299.59,1865.66,49.0,50.0,24.99,29.0


In [None]:
# TODO: Plotting - add exploratory plots of the transactions data (this cell is still an empty placeholder).

In [None]:
# Get the train/test split of the transactions data together with the
# preprocessing transformer that is plugged into the pipeline below.
X_train, X_test, y_train, y_test, preprocessor = get_transactions_dataset()

# Define the Logistic Regression classifier (linear model for binary classification).
# n_jobs is intentionally not passed: scikit-learn documents it as parallelising
# only over classes in one-vs-rest fits, so for a single binary fit it does not
# speed anything up.
logreg = LogisticRegression(
    max_iter=1000,            # allow more iterations so the optimizer can converge
    random_state=42,          # make results reproducible
    class_weight="balanced",  # weight classes inversely to their frequency in y_train
)

# Observation from an earlier run with class_weight="balanced" (LogReg and SVM):
#   - Recall = 1.0 -> all counterfeit transactions in the test set are caught
#   - Precision dipped a bit (~0.967) -> a few more false positives are flagged
# That is exactly what "balanced" does: it makes the model care more about the
# minority class, pushing for higher recall.

# Build a Pipeline:
# 1) "preprocess" step: applies scaling + one-hot encoding to features
# 2) "model" step: fits Logistic Regression on the transformed features
# Keeping the preprocessor inside the pipeline means it is fitted on the
# training data only when clf.fit() is called.
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", logreg),
])

# Train the pipeline on the training data
clf.fit(X_train, y_train)

# Evaluate the trained model on the held-out test data; the helper reports
# accuracy, precision, recall, F1-score and ROC-AUC under the given labels.
evaluate_classifier(
    clf,
    X_test,
    y_test,
    model_name="logreg",
    dataset_name="transactions",
)


=== logreg on transactions ===
Accuracy : 0.9917
Precision: 0.9671
Recall   : 1.0000
F1-score : 0.9833
ROC-AUC  : 0.9999
