In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append("..")

 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
 
# Import project helpers
from src.data_loading import load_transactions, load_products
from src.preprocessing_transactions import get_transactions_dataset
from src.preprocessing_products import get_products_dataset
from src.evaluation import evaluate_classifier

In [85]:
# Load the products data (not transactions) and inspect it:
# .info() prints the column schema, and .describe() is left as the last
# expression so its summary table is rendered as the cell output.
df_products = load_products()
df_products.info()
df_products.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   product_id                5000 non-null   object 
 1   seller_id                 5000 non-null   object 
 2   category                  5000 non-null   object 
 3   brand                     5000 non-null   object 
 4   price                     5000 non-null   float64
 5   seller_rating             5000 non-null   float64
 6   seller_reviews            5000 non-null   int64  
 7   product_images            5000 non-null   int64  
 8   description_length        5000 non-null   int64  
 9   shipping_time_days        5000 non-null   int64  
 10  spelling_errors           5000 non-null   int64  
 11  domain_age_days           5000 non-null   int64  
 12  contact_info_complete     5000 non-null   bool   
 13  return_policy_clear       5000 non-null   bool   
 14  payment_

Unnamed: 0,price,seller_rating,seller_reviews,product_images,description_length,shipping_time_days,spelling_errors,domain_age_days,payment_methods_count,views,purchases,wishlist_adds,certification_badges,warranty_months
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,224.869204,3.73298,3529.8092,4.6862,463.564,13.4622,1.0934,1456.1054,3.254,2469.2866,123.0508,39.1696,2.0264,11.562
std,136.741376,0.995313,3281.418979,2.651629,294.29469,11.5496,1.160752,1143.264222,1.357296,1424.525316,107.889725,28.945971,1.405453,6.882037
min,10.12,1.0,0.0,1.0,50.0,1.0,0.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0
25%,109.7875,3.5,85.0,2.0,179.0,5.0,0.0,307.5,2.0,1245.75,34.0,13.0,1.0,6.0
50%,191.815,4.0,2922.5,5.0,433.5,10.0,1.0,1319.0,3.0,2433.0,93.0,34.0,2.0,12.0
75%,336.615,4.5,6401.5,7.0,725.25,18.0,1.0,2465.0,4.0,3701.5,185.0,62.0,3.0,18.0
max,499.84,5.0,9996.0,9.0,999.0,44.0,4.0,3649.0,5.0,4996.0,491.0,99.0,4.0,23.0


In [86]:
# Correlation of each numeric feature with the target label, sorted ascending.
# `feature_cols` is built here so the cell also runs on a fresh kernel; it was
# previously referenced without being defined anywhere in the notebook.
target_col = "is_counterfeit"
feature_cols = [
    col
    for col in df_products.select_dtypes(include="number").columns
    if col != target_col  # guard in case the label is stored as an integer
]
print(df_products[feature_cols + [target_col]].corr()[target_col].sort_values())

payment_methods_count   -0.841770
seller_rating           -0.809984
product_images          -0.772673
description_length      -0.741478
domain_age_days         -0.718089
seller_reviews          -0.684206
price                   -0.576817
wishlist_adds           -0.001506
warranty_months          0.001331
views                    0.004539
certification_badges     0.005683
purchases                0.005975
spelling_errors          0.773749
shipping_time_days       0.867171
is_counterfeit           1.000000
Name: is_counterfeit, dtype: float64


In [87]:
#plotting

In [88]:
# Train/test partitions of the products data, plus the preprocessing step
# that every model pipeline below reuses.
X_train, X_test, y_train, y_test, preprocessor = get_products_dataset()

# Linear baseline: logistic regression with balanced class weights,
# a fixed seed, and a raised iteration cap.
logreg = LogisticRegression(
    class_weight="balanced",
    random_state=42,
    max_iter=1000,
    n_jobs=-1,
)

# Chain preprocessing and the estimator, then fit on the training split.
clf = Pipeline([("preprocess", preprocessor), ("model", logreg)])
clf.fit(X_train, y_train)

# Report metrics on the held-out split.
evaluate_classifier(clf, X_test, y_test, model_name="logreg", dataset_name="products")


=== logreg on products ===
Accuracy : 0.9480
Precision: 0.8980
Recall   : 0.9286
F1-score : 0.9130
ROC-AUC  : 0.9879


In [89]:
#plotting

In [90]:

# Define the KNN classifier
knn = KNeighborsClassifier(
    n_neighbors=5,      # look at 5 nearest neighbors
    weights="distance", # closer neighbors have more influence
    n_jobs=-1,          # use all CPU cores (for distance computations)
)

# Pipeline: preprocessing -> KNN model
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", knn),
])

# Train the model
clf.fit(X_train, y_train)

# Evaluate on the test set.
# The splits come from get_products_dataset(), so the report must be labelled
# "products"; it was previously mislabelled as "transactions".
evaluate_classifier(
    clf,
    X_test,
    y_test,
    model_name="knn",
    dataset_name="products",
)



=== knn on transactions ===
Accuracy : 0.8750
Precision: 0.8141
Recall   : 0.7449
F1-score : 0.7780
ROC-AUC  : 0.9043


In [91]:
#plotting

In [92]:

# Gradient boosting with library-default hyperparameters; only the seed is pinned.
gb = GradientBoostingClassifier(random_state=42)

# Preprocessing followed by the boosted model, fitted on the training split.
clf = Pipeline([("preprocess", preprocessor), ("model", gb)])
clf.fit(X_train, y_train)

# Report metrics on the held-out split.
evaluate_classifier(clf, X_test, y_test, model_name="gb", dataset_name="products")



=== gb on products ===
Accuracy : 0.9630
Precision: 0.9541
Recall   : 0.9184
F1-score : 0.9359
ROC-AUC  : 0.9906


In [93]:
#plotting


In [94]:

# Random forest: 200 unrestricted-depth trees, balanced class weights,
# fixed seed, built in parallel on all available cores.
rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
    max_depth=None,
    n_estimators=200,
)

# Preprocessing followed by the forest, fitted on the training split.
clf = Pipeline([("preprocess", preprocessor), ("model", rf)])
clf.fit(X_train, y_train)

# Report metrics on the held-out split.
evaluate_classifier(clf, X_test, y_test, model_name="rf", dataset_name="products")


=== rf on products ===
Accuracy : 0.9630
Precision: 0.9477
Recall   : 0.9252
F1-score : 0.9363
ROC-AUC  : 0.9896


In [95]:
#plotting

In [96]:

# RBF-kernel support vector classifier with balanced class weights.
# probability=True turns on predict_proba, which the ROC-AUC metric relies on.
svm = SVC(
    class_weight="balanced",
    random_state=42,
    probability=True,
    kernel="rbf",
)

# Preprocessing followed by the SVM, fitted on the training split.
clf = Pipeline([("preprocess", preprocessor), ("model", svm)])
clf.fit(X_train, y_train)

# Report metrics on the held-out split.
evaluate_classifier(clf, X_test, y_test, model_name="svm", dataset_name="products")



=== svm on products ===
Accuracy : 0.9340
Precision: 0.8701
Recall   : 0.9116
F1-score : 0.8904
ROC-AUC  : 0.9784


In [97]:
#plotting


In [98]:
# Random Forest model
# NOTE(review): these hyperparameters match the earlier `rf` cell and the reported
# metrics are identical; consider keeping only one of the two cells.
rf = RandomForestClassifier(
    n_estimators=200,         # number of trees
    max_depth=None,           # allow trees to grow fully
    class_weight="balanced",
    random_state=42,          # reproducible results
    n_jobs=-1,                # parallelize across CPU cores, as in the earlier RF cell
)

# Pipeline: preprocessing -> RandomForest
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", rf),
])

# Train
clf.fit(X_train, y_train)

# Evaluate
evaluate_classifier(
    clf,
    X_test,
    y_test,
    model_name="random_forest",
    dataset_name="products",
)


=== random_forest on products ===
Accuracy : 0.9630
Precision: 0.9477
Recall   : 0.9252
F1-score : 0.9363
ROC-AUC  : 0.9896


In [None]:
#plotting

In [None]:
# TODO: also need a comparison of all model performances