In [1]:
# =========================================================
# 1. Imports
# =========================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif

from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt

In [2]:
# =========================================================
# 2. Load dataset
# =========================================================
df = pd.read_csv("data/online_shoppers_intention.csv")
print("Dataset loaded:", df.shape)

Dataset loaded: (12330, 18)


In [3]:
# =========================================================
# 3. Convert boolean variables to numeric
# =========================================================
df['Revenue'] = df['Revenue'].astype(int)
df['Weekend'] = df['Weekend'].astype(int)


# =========================================================
# 4. Group rare categories in numeric-categorical features
# =========================================================
numeric_categorical = ['OperatingSystems', 'Browser', 'Region', 'TrafficType']

def group_rare(series, threshold=50):
    freq = series.value_counts()
    return series.apply(lambda x: x if freq[x] > threshold else "Other")

for col in numeric_categorical:
    df[col] = df[col].astype(str)
    df[col] = group_rare(df[col])


# =========================================================
# 5. Define categorical & numerical sets
# =========================================================
categorical_nominal = ['Month', 'VisitorType'] + numeric_categorical
numerical_features_original = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues',
    'SpecialDay'
]


# =========================================================
# 6. Train-test split (BEFORE encoding!)
# =========================================================
X = df.drop('Revenue', axis=1)
y = df['Revenue']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape, " Test:", X_test.shape)


# =========================================================
# 7. Log-transform duration columns
# =========================================================
for col in ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration']:
    X_train[col] = np.log1p(X_train[col])
    X_test[col] = np.log1p(X_test[col])


# =========================================================
# 8. Fit OneHotEncoder on TRAIN split
# =========================================================
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train[categorical_nominal])

# Transform
X_train_cat = encoder.transform(X_train[categorical_nominal])
X_test_cat  = encoder.transform(X_test[categorical_nominal])

X_train = X_train.drop(columns=categorical_nominal)
X_test = X_test.drop(columns=categorical_nominal)

# Get feature names (optional)
encoded_cols = encoder.get_feature_names_out(categorical_nominal)


# =========================================================
# Define numerical columns AUTOMATICALLY
# =========================================================
numerical_features = X_train.select_dtypes(include=[np.number]).columns

X_train_num = X_train[numerical_features].reset_index(drop=True)
X_test_num  = X_test[numerical_features].reset_index(drop=True)


X_train_full = np.hstack([X_train_num.values, X_train_cat])
X_test_full  = np.hstack([X_test_num.values,  X_test_cat])



# =========================================================
# 10. Scale numerical + encoded features
# =========================================================
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_full)
X_test_scaled  = scaler.transform(X_test_full)


# =========================================================
# 11. PCA (for visualization only)
# =========================================================
pca = PCA(n_components=2)
pca.fit(X_train_scaled)

X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# =========================================================
# 12. SMOTE (APPLY ONLY ON SCALED DATA, ONLY FOR TRAINING MODELS THAT NEED IT)
# =========================================================
# Example: only for tree models, not for LogisticRegression

smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print("After SMOTE:", X_train_smote.shape)


Train: (9864, 17)  Test: (2466, 17)
After SMOTE: (16676, 55)


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T


# Naive Bayes Classifier

In [27]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize and train the model
nb = GaussianNB()
nb.fit(X_train_smote, y_train_smote)

# Make predictions
y_pred = nb.predict(X_test_scaled)

print("LDA Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

LDA Accuracy: 0.5275750202757502
Confusion Matrix:
 [[ 983 1101]
 [  64  318]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.47      0.63      2084
           1       0.22      0.83      0.35       382

    accuracy                           0.53      2466
   macro avg       0.58      0.65      0.49      2466
weighted avg       0.83      0.53      0.59      2466



In [14]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(GaussianNB(), X_train_smote, y_train_smote, cv=10, scoring='f1_macro')
print("Mean F1 (CV):", cv_scores.mean())

Mean F1 (CV): 0.68149933733642


With smote we achieve a 68,15% of average F1-Score Macro, which is the average of F1 for class 0 and F1 for class 1

In [28]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
import numpy as np

kf = KFold(n_splits=25, random_state=1, shuffle=True)

cvScores = []
i = 1

for train_index, test_index in kf.split(X_train_smote):
    print(f"\nFold {i} =============================================================")
    
    X_train_cv, X_val_cv = X_train_smote[train_index], X_train_smote[test_index]
    y_train_cv, y_val_cv = y_train_smote.iloc[train_index], y_train_smote.iloc[test_index]
    
    nb = GaussianNB()
    nb.fit(X_train_cv, y_train_cv)
    
    y_pred = nb.predict(X_val_cv)
    
    f1 = f1_score(y_val_cv, y_pred, pos_label=1)
    print(f"F1-score (class 1): {f1:.4f}")
    
    cvScores.append(f1)
    i += 1

print("\nMean F1:", np.mean(cvScores))
print("Std F1:", np.std(cvScores))



F1-score (class 1): 0.7393

F1-score (class 1): 0.7485

F1-score (class 1): 0.7473

F1-score (class 1): 0.7762

F1-score (class 1): 0.7341

F1-score (class 1): 0.7428

F1-score (class 1): 0.7199

F1-score (class 1): 0.7551

F1-score (class 1): 0.7711

F1-score (class 1): 0.7316

F1-score (class 1): 0.7370

F1-score (class 1): 0.7696

F1-score (class 1): 0.7255

F1-score (class 1): 0.7395

F1-score (class 1): 0.7471

F1-score (class 1): 0.7527

F1-score (class 1): 0.7466

F1-score (class 1): 0.7363

F1-score (class 1): 0.7481

F1-score (class 1): 0.7471

F1-score (class 1): 0.7132

F1-score (class 1): 0.7301

F1-score (class 1): 0.7543

F1-score (class 1): 0.7659

F1-score (class 1): 0.7512

Mean F1: 0.7452026053148049
Std F1: 0.015196194293010074


Result: optimistic performance (74 %) because the validation sets are as balanced and synthetic as the training data.

⬇️ REALISTIC

In [29]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

pipeline = Pipeline([
    ('smote', SMOTE(random_state=0)),
    ('nb', GaussianNB())
])

cv_scores = cross_val_score(
    pipeline, X_train_scaled, y_train,  # original, imbalanced data
    cv=10, scoring='f1'
)

print("Mean F1 (CV realistic):", cv_scores.mean())

Mean F1 (CV realistic): 0.37070126721954005




## 1.1. Naive Bayes with Bernouilli Distribution

In [33]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB

pipeline = Pipeline([
    ('smote', SMOTE(random_state=0)),
    ('nb', BernoulliNB(alpha=10))
])

cv_scores = cross_val_score(
    pipeline, X_train_scaled, y_train,  # original, imbalanced data
    cv=20, scoring='f1'
)

print("Mean F1 (CV realistic):", cv_scores.mean())

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret 

Mean F1 (CV realistic): 0.5186513293203316


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

params = {'alpha': [0.001, 0.01, 0.1, 1, 5, 10]}
grid = GridSearchCV(BernoulliNB(), params, cv=5, scoring='f1')
grid.fit(X_train, y_train)
print("Best alpha:", grid.best_params_, "Best F1:", grid.best_score_)


Best alpha: {'alpha': 10} Best F1: 0.574989667732081


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret 