In [1]:
# =========================================================
# 1. Imports
# =========================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif

from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt

In [2]:
# =========================================================
# 2. Load dataset
# =========================================================
df = pd.read_csv("data/online_shoppers_intention.csv")
print("Dataset loaded:", df.shape)

Dataset loaded: (12330, 18)


In [3]:
# =========================================================
# 3. Convert boolean variables to numeric
# =========================================================
df['Revenue'] = df['Revenue'].astype(int)
df['Weekend'] = df['Weekend'].astype(int)


# =========================================================
# 4. Group rare categories in numeric-categorical features
# =========================================================
numeric_categorical = ['OperatingSystems', 'Browser', 'Region', 'TrafficType']

def group_rare(series, threshold=50):
    freq = series.value_counts()
    return series.apply(lambda x: x if freq[x] > threshold else "Other")

for col in numeric_categorical:
    df[col] = df[col].astype(str)
    df[col] = group_rare(df[col])


# =========================================================
# 5. Define categorical & numerical sets
# =========================================================
categorical_nominal = ['Month', 'VisitorType'] + numeric_categorical
numerical_features_original = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues',
    'SpecialDay'
]


# =========================================================
# 6. Train-test split (BEFORE encoding!)
# =========================================================
X = df.drop('Revenue', axis=1)
y = df['Revenue']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape, " Test:", X_test.shape)


# =========================================================
# 7. Log-transform duration columns
# =========================================================
for col in ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration']:
    X_train[col] = np.log1p(X_train[col])
    X_test[col] = np.log1p(X_test[col])


# =========================================================
# 8. Fit OneHotEncoder on TRAIN split
# =========================================================
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train[categorical_nominal])

# Transform
X_train_cat = encoder.transform(X_train[categorical_nominal])
X_test_cat  = encoder.transform(X_test[categorical_nominal])

X_train = X_train.drop(columns=categorical_nominal)
X_test = X_test.drop(columns=categorical_nominal)

# Get feature names (optional)
encoded_cols = encoder.get_feature_names_out(categorical_nominal)


# =========================================================
# Define numerical columns AUTOMATICALLY
# =========================================================
numerical_features = X_train.select_dtypes(include=[np.number]).columns

X_train_num = X_train[numerical_features].reset_index(drop=True)
X_test_num  = X_test[numerical_features].reset_index(drop=True)


X_train_full = np.hstack([X_train_num.values, X_train_cat])
X_test_full  = np.hstack([X_test_num.values,  X_test_cat])



# =========================================================
# 10. Scale numerical + encoded features
# =========================================================
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_full)
X_test_scaled  = scaler.transform(X_test_full)


# =========================================================
# 11. PCA (for visualization only)
# =========================================================
pca = PCA(n_components=2)
pca.fit(X_train_scaled)

X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# =========================================================
# 12. SMOTE (APPLY ONLY ON SCALED DATA, ONLY FOR TRAINING MODELS THAT NEED IT)
# =========================================================
# Example: only for tree models, not for LogisticRegression

smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print("After SMOTE:", X_train_smote.shape)


Train: (9864, 17)  Test: (2466, 17)
After SMOTE: (16676, 55)


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T


# Model Training

In [5]:
import numpy as np

class RegularizedDiscriminantAnalysis:
    """
    Implementation of Regularized Discriminant Analysis (RDA)
    as described in Friedman (1989).
    
    Parameters
    ----------
    lambda_param : float, default=0.0
        Controls blending between class-specific covariance (0)
        and pooled covariance (1).

    gamma_param : float, default=0.0
        Controls shrinkage of covariance toward spherical matrix.

    Attributes
    ----------
    classes_ : array-like
        Unique class labels.
    priors_ : array-like
        Class prior probabilities.
    cov_ : dict
        Regularized covariance matrix per class.
    means_ : dict
        Class means.
    """

    def __init__(self, lambda_param=0.0, gamma_param=0.0):
        self.lambda_param = lambda_param
        self.gamma_param = gamma_param

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        n_samples, n_features = X.shape

        # Compute class means, covariances
        self.means_ = {}
        covariances = {}
        priors = {}

        for c in self.classes_:
            X_c = X[y == c]
            priors[c] = X_c.shape[0] / n_samples
            self.means_[c] = X_c.mean(axis=0)
            
            # Sample covariance for that class
            covariances[c] = np.cov(X_c, rowvar=False)

        self.priors_ = priors

        # Compute pooled covariance
        pooled_cov = np.zeros((n_features, n_features))
        for c in self.classes_:
            n_c = (y == c).sum()
            pooled_cov += (n_c - 1) * covariances[c]
        pooled_cov /= (n_samples - n_classes)

        # Build regularized covariances (Friedman's RDA)
        self.cov_ = {}

        for c in self.classes_:
            cov_k = covariances[c]

            # Step 1: blend class-specific vs pooled
            cov_lambda = (
                (1 - self.lambda_param) * cov_k
                + self.lambda_param * pooled_cov
            )

            # Step 2: shrink toward spherical covariance
            trace_val = np.trace(cov_lambda) / n_features
            cov_gamma = (
                (1 - self.gamma_param) * cov_lambda
                + self.gamma_param * trace_val * np.eye(n_features)
            )

            self.cov_[c] = cov_gamma

        return self

    def _discriminant(self, X, mean, cov, prior):
        """Compute discriminant function for Gaussian model."""
        inv_cov = np.linalg.inv(cov)
        det_cov = np.linalg.det(cov)
        diff = X - mean

        # Quadratic discriminant function
        term1 = -0.5 * np.sum(diff @ inv_cov * diff, axis=1)
        term2 = -0.5 * np.log(det_cov)
        term3 = np.log(prior)

        return term1 + term2 + term3

    def predict(self, X):
        X = np.asarray(X)
        scores = []

        for c in self.classes_:
            s = self._discriminant(
                X,
                self.means_[c],
                self.cov_[c],
                self.priors_[c]
            )
            scores.append(s)

        scores = np.vstack(scores).T
        idx = np.argmax(scores, axis=1)
        return self.classes_[idx]

    def predict_proba(self, X):
        X = np.asarray(X)
        scores = []

        for c in self.classes_:
            s = self._discriminant(
                X,
                self.means_[c],
                self.cov_[c],
                self.priors_[c]
            )
            scores.append(s)

        scores = np.vstack(scores).T
        
        # Convert log-scores to probabilities (softmax)
        exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))
        return exp_scores / exp_scores.sum(axis=1, keepdims=True)


In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import BernoulliNB
from imblearn.pipeline import Pipeline
from sklearn.linear_model import Perceptron

models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced',max_iter=1000),
    "LDA": LinearDiscriminantAnalysis(priors=[0.5, 0.5]),
    "QDA": QuadraticDiscriminantAnalysis(priors=[0.5, 0.5]),
    "Naive Bayes": BernoulliNB(alpha=10),
    "Perceptron" : Perceptron(alpha=1e-06,eta0=0.01,max_iter=1000)
    #"RDA": RegularizedDiscriminantAnalysis(lambda_param=0.5, gamma_param=0.5)
}

for name, model in models.items():
    if name not in ['Naive Bayes','Perceptron']:
        scores = cross_val_score(model, X_train_scaled, y_train, cv=10, scoring='f1')
    if name == 'Naive Bayes':
        pipeline = Pipeline([
                            ('smote', SMOTE(random_state=0)),
                            ('nb', model)
                        ])
        scores = cross_val_score(pipeline, X_train_scaled, y_train, cv=10, scoring='f1')
    if name == 'Perceptron':
        pipeline = Pipeline([
                            ('smote', SMOTE(random_state=0)),
                            ('clf', model)
                        ])
        scores = cross_val_score(pipeline, X_train_scaled, y_train, cv=10, scoring='f1')
    print(f"{name}: mean F1 = {scores.mean():.3f} ± {scores.std():.3f}")


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad

Logistic Regression: mean F1 = 0.613 ± 0.017
LDA: mean F1 = 0.627 ± 0.036


  self.scalings_ = scalings @ Vt.T[:, :rank]
  self.scalings_ = scalings @ Vt.T[:, :rank]
  self.scalings_ = scalings @ Vt.T[:, :rank]
  ret = a @ b
  ret = a @ b
  ret = a @ b
  self.scalings_ = scalings @ Vt.T[:, :rank]
  self.scalings_ = scalings @ Vt.T[:, :rank]
  self.scalings_ = scalings @ Vt.T[:, :rank]
  ret = a @ b
  ret = a @ b
  ret = a @ b


QDA: mean F1 = 0.385 ± 0.018
Naive Bayes: mean F1 = 0.518 ± 0.018


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret 

Perceptron: mean F1 = 0.494 ± 0.029


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [13]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix
)

results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    acc = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    f1_pos1 = f1_score(y_test, y_pred, pos_label=1)
    
    results.append({
        "Model": name,
        "Accuracy": acc,
        "F1_macro": f1_macro,
        "F1_class1": f1_pos1
    })

# --- Create comparison table ---
results_df = pd.DataFrame(results).sort_values(by="F1_class1", ascending=False)
print("\nModel comparison on test set:")
print(results_df.to_string(index=False))


Model comparison on test set:
              Model  Accuracy  F1_macro  F1_class1
                LDA  0.882401  0.774939   0.619423
Logistic Regression  0.837388  0.742493   0.586171
        Naive Bayes  0.834144  0.715879   0.532571
         Perceptron  0.843471  0.686844   0.465374
                QDA  0.551095  0.507589   0.361223


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  self.scalings_ = scalings @ Vt.T[:, :rank]
  self.scalings_ = scalings @ Vt.T[:, :rank]
  self.scalings_ = scalings @ Vt.T[:, :rank]
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
