In [4]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


### Setup/config


In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Input CSV location; the load cell reads it only when no `df` already exists in the kernel.
csv_path = "../online_shoppers_intention_data.csv"
# Target column name; it is removed from the numeric feature lists. Set to None if there is no target.
target_col = "Revenue"
# Threshold handed to VarianceThreshold; 0.0 drops constant columns.
var_thresh = 0.0
# Absolute-correlation cutoff: one column of any pair with |r| >= this is dropped.
corr_thresh = 0.90

### Load Data

In [6]:
# Reuse a `df` that already lives in the kernel; read the CSV only when the
# name is undefined. NOTE: a stale `df` from earlier work is reused as-is, so
# restart the kernel to force a fresh read from `csv_path`.
try:
    df
except NameError:
    df = pd.read_csv(csv_path)

print(df.shape)
df.head()

(12330, 18)


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


### Prep (booleans -> int; pick numeric features excluding target)


In [7]:
# Operate on a copy so the loaded frame `df` is left untouched.
work = df.copy()

# Cast every boolean column to integers so it is picked up as numeric below.
bool_cols = [name for name in work.columns if work[name].dtype == bool]
work = work.astype({name: int for name in bool_cols})

# Candidate features: every numeric column, minus the target when one is set.
num_cols = work.select_dtypes(include=[np.number]).columns.tolist()
if target_col:
    try:
        num_cols.remove(target_col)
    except ValueError:
        # Target is not among the numeric columns; nothing to remove.
        pass
len(num_cols), num_cols[:10]

(15,
 ['Administrative',
  'Administrative_Duration',
  'Informational',
  'Informational_Duration',
  'ProductRelated',
  'ProductRelated_Duration',
  'BounceRates',
  'ExitRates',
  'PageValues',
  'SpecialDay'])

### Variance filter

In [8]:
# Fit the variance selector on the candidate features (fit returns the estimator itself).
vt = VarianceThreshold(threshold=var_thresh).fit(work[num_cols])

# Positions of the retained features, mapped back to their column names.
keep_by_var = [num_cols[pos] for pos in vt.get_support(indices=True)]
drop_by_var = sorted(set(num_cols).difference(keep_by_var))

# Remove the rejected columns from the working frame (mutates `work` in place).
work.drop(columns=drop_by_var, inplace=True, errors="ignore")

# Rebuild the numeric feature list after the drop, again leaving the target out.
num_cols_after_var = work.select_dtypes(include=[np.number]).columns.tolist()
if target_col and num_cols_after_var.count(target_col):
    num_cols_after_var.remove(target_col)

len(drop_by_var), drop_by_var

(0, [])

### Correlation filter

In [None]:
def cols_to_drop_by_corr(frame, cols, thr=0.90):
    """Pick columns to drop so that no retained pair has |r| >= ``thr``.

    Columns are visited in the order given by ``cols``. A column is marked for
    removal only when its absolute Pearson correlation with a column that has
    been *kept so far* reaches ``thr``. Comparing against kept columns only
    (rather than every earlier column) avoids discarding a column whose sole
    highly correlated partner has itself already been dropped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data holding the columns to compare.
    cols : list
        Column labels to consider; earlier labels win ties (they are kept).
    thr : float, default 0.90
        Absolute-correlation cutoff (inclusive).

    Returns
    -------
    list
        Sorted labels of the columns to drop.
    """
    corr = frame[cols].corr().abs()
    names = list(corr.columns)
    # Positional access keeps the lookups cheap and independent of label type.
    matrix = corr.to_numpy()

    kept_positions = []
    to_drop = set()
    for pos, name in enumerate(names):
        # NaN correlations (e.g. constant columns) compare as False and never trigger a drop.
        if kept_positions and np.any(matrix[kept_positions, pos] >= thr):
            to_drop.add(name)
        else:
            kept_positions.append(pos)
    return sorted(to_drop)

# Run the correlation filter over the features that survived the variance step.
drop_by_corr = cols_to_drop_by_corr(work, num_cols_after_var, thr=corr_thresh)
# Mutates `work` in place; errors="ignore" skips any listed name not present in the frame.
work.drop(columns=drop_by_corr, inplace=True, errors="ignore")

# `df_filtered` is bound to the same object as `work` (an alias, not a copy).
df_filtered = work  # <- use this downstream
len(drop_by_corr), drop_by_corr[:10]

(1, ['ExitRates'])

## Summary

In [10]:
# Assemble the report first, then emit it with a single print call.
summary_lines = [
    "=== Filter Summary ===",
    f"Initial numeric (excl. target): {len(num_cols)}",
    f"Dropped by variance (≤ {var_thresh}): {len(drop_by_var)} -> {drop_by_var}",
    f"Remaining after variance: {len(num_cols_after_var)}",
    f"Dropped by correlation (|r| ≥ {corr_thresh}): {len(drop_by_corr)} -> {drop_by_corr}",
    f"Final columns: {len(df_filtered.columns)} (includes non-numeric + target if present)",
]
# The target line is reported only when a target is configured and exists in the loaded frame.
if target_col and target_col in df.columns:
    summary_lines.append(f"Target preserved: {target_col}")
print("\n".join(summary_lines))

df_filtered.head()

=== Filter Summary ===
Initial numeric (excl. target): 15
Dropped by variance (≤ 0.0): 0 -> []
Remaining after variance: 15
Dropped by correlation (|r| ≥ 0.9): 1 -> ['ExitRates']
Final columns: 17 (includes non-numeric + target if present)
Target preserved: Revenue


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,0
