# DS3000: Feature Engineering + Encoding & Scaling (Combined Notebook)

This notebook combines your two existing feature-engineering notebooks and adds a unified preprocessing step for **one-hot encoding** categorical variables and **standardizing** numerical features.

**Sections:**
1. Creating Derived Features (from your original notebook)
2. Feature Selection (from your original notebook)
3. Encoding (One-Hot for `Browser`, `Month`, `VisitorType`) and Standardization (StandardScaler)


## 1) Creating Derived Features (from your original notebook)

In [2]:
import pandas as pd

df = pd.read_csv('../online_shoppers_intention_data.csv')

In [3]:
# Create 'Total_Time' by summing duration-related features
df['Total_Time'] = df['Administrative_Duration'] + df['Informational_Duration'] + df['ProductRelated_Duration']

In [4]:
# Create 'Is_SpecialDay' by converting 'SpecialDay' to a binary indicator (1 if SpecialDay > 0, else 0)
df['Is_SpecialDay'] = (df['SpecialDay'] > 0).astype(int)

In [5]:
# Check dataframe with the added features
print(f'New shape of the dataset: {df.shape}')

# Display basic info for new features
print('\nDatatypes and non-null counts for new features:')
display(df[['Total_Time', 'Is_SpecialDay']].info())

New shape of the dataset: (12330, 20)

Datatypes and non-null counts for new features:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Total_Time     12330 non-null  float64
 1   Is_SpecialDay  12330 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 192.8 KB


None

## 2) Feature Selection (from your original notebook)

In [6]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


### Setup/config


In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

csv_path = "../online_shoppers_intention_data.csv"  # only used if df not already defined
target_col = "Revenue"   # set None if no target
var_thresh = 0.0         # drop constant columns
corr_thresh = 0.90       # drop one of any pair with |r| >= this

### Load Data

In [8]:
if "df" not in globals():
    df = pd.read_csv(csv_path)
print(df.shape)
df.head()

(12330, 20)


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue,Total_Time,Is_SpecialDay
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False,0.0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False,64.0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False,0.0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False,2.666667,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False,627.5,0


### Prep (booleans -> int; pick numeric features excluding target)


In [9]:
work = df.copy()

for c in work.columns:
    if work[c].dtype == bool:
        work[c] = work[c].astype(int)

num_cols = work.select_dtypes(include=[np.number]).columns.tolist()
if target_col and target_col in num_cols:
    num_cols.remove(target_col)
len(num_cols), num_cols[:10]

(17,
 ['Administrative',
  'Administrative_Duration',
  'Informational',
  'Informational_Duration',
  'ProductRelated',
  'ProductRelated_Duration',
  'BounceRates',
  'ExitRates',
  'PageValues',
  'SpecialDay'])

### Variance filter

In [10]:
vt = VarianceThreshold(threshold=var_thresh)
vt.fit(work[num_cols])

keep_by_var = [c for c, keep in zip(num_cols, vt.get_support()) if keep]
drop_by_var = sorted(set(num_cols) - set(keep_by_var))

work.drop(columns=drop_by_var, inplace=True, errors="ignore")

num_cols_after_var = work.select_dtypes(include=[np.number]).columns.tolist()
if target_col and target_col in num_cols_after_var:
    num_cols_after_var.remove(target_col)

len(drop_by_var), drop_by_var

(0, [])

### Correlation filter

In [11]:
def cols_to_drop_by_corr(frame, cols, thr=0.90):
    corr = frame[cols].corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = set()
    for col in upper.columns:
        if any(upper[col] >= thr):
            to_drop.add(col)
    return sorted(to_drop)

drop_by_corr = cols_to_drop_by_corr(work, num_cols_after_var, thr=corr_thresh)
work.drop(columns=drop_by_corr, inplace=True, errors="ignore")

df_filtered = work  # <- use this downstream
len(drop_by_corr), drop_by_corr[:10]

(3, ['ExitRates', 'Is_SpecialDay', 'Total_Time'])

## Summary

In [12]:
print("=== Filter Summary ===")
print(f"Initial numeric (excl. target): {len(num_cols)}")
print(f"Dropped by variance (≤ {var_thresh}): {len(drop_by_var)} -> {drop_by_var}")
print(f"Remaining after variance: {len(num_cols_after_var)}")
print(f"Dropped by correlation (|r| ≥ {corr_thresh}): {len(drop_by_corr)} -> {drop_by_corr}")
print(f"Final columns: {len(df_filtered.columns)} (includes non-numeric + target if present)")
if target_col and target_col in df.columns:
    print(f"Target preserved: {target_col}")

df_filtered.head()

=== Filter Summary ===
Initial numeric (excl. target): 17
Dropped by variance (≤ 0.0): 0 -> []
Remaining after variance: 17
Dropped by correlation (|r| ≥ 0.9): 3 -> ['ExitRates', 'Is_SpecialDay', 'Total_Time']
Final columns: 17 (includes non-numeric + target if present)
Target preserved: Revenue


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,0


## 3) Encoding & Scaling (One-Hot + StandardScaler)

In [13]:
# If needed, install scikit-learn in your environment (uncomment when running in a fresh kernel)
# %pip install -q scikit-learn

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- CONFIG ---
csv_path = "/mnt/data/online_shoppers_intention_data.csv"   # fallback if df is not already defined
target_col = "Revenue"                                      # adjust if your target is named differently
categorical_cols = ["Browser", "Month", "VisitorType"]      # one-hot encode these

# --- Ensure df is present ---
# If previous sections already created a 'df' DataFrame, we'll reuse it.
# Otherwise, load from CSV.
if 'df' not in globals():
    df = pd.read_csv(csv_path)

# Make sure target column exists
if target_col not in df.columns:
    raise ValueError(f"Target column '{target_col}' not found in DataFrame columns: {df.columns.tolist()}")

# Split features/target
X = df.drop(columns=[target_col])
y = df[target_col].copy()

# Identify numeric columns AFTER your feature engineering + selection steps
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Intersect categorical columns with actual X columns (ignore any that might have been dropped earlier)
cat_cols = [c for c in categorical_cols if c in X.columns]

# Safety: remove any overlap between num_cols and cat_cols (in case types changed)
num_cols = [c for c in num_cols if c not in cat_cols]

# Build preprocessing pipeline
# Using a compatibility helper for OneHotEncoder's sparse/sparse_output arg
def make_ohe():
    try:
        return OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
    except TypeError:
        # For older scikit-learn versions where 'sparse_output' doesn't exist
        return OneHotEncoder(drop="first", sparse=False, handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", make_ohe(), cat_cols),
    ],
    remainder="drop",  # drop anything not in num_cols or cat_cols
)

# Fit/transform
X_processed = preprocessor.fit_transform(X)

# Build output DataFrame
# Get names for numeric + encoded categorical features
num_feature_names = num_cols

# Handle OHE feature names across sklearn versions
try:
    encoded_cols = preprocessor.named_transformers_["cat"].get_feature_names_out(cat_cols).tolist()
except AttributeError:
    # Fallback for very old versions
    enc = preprocessor.named_transformers_["cat"]
    encoded_cols = []
    for i, base in enumerate(cat_cols):
        cats = enc.categories_[i][1:]  # dropped='first' -> skip first
        encoded_cols += [f"{base}_{c}" for c in cats]

all_cols = num_feature_names + encoded_cols

import pandas as pd
X_processed_df = pd.DataFrame(X_processed, columns=all_cols).reset_index(drop=True)

# Re-attach target
final_df = pd.concat([X_processed_df, y.reset_index(drop=True)], axis=1)

print("Numeric columns scaled:", num_cols)
print("Categorical columns one-hot encoded (drop='first'):", cat_cols)
print("Final shape:", final_df.shape)

final_df.head()


Numeric columns scaled: ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'OperatingSystems', 'Region', 'TrafficType', 'Total_Time', 'Is_SpecialDay']
Categorical columns one-hot encoded (drop='first'): ['Browser', 'Month', 'VisitorType']
Final shape: (12330, 39)


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,...,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,VisitorType_Other,VisitorType_Returning_Visitor,Revenue
0,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
1,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.590903,-0.457683,1.171473,-0.317178,-0.308821,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
2,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
3,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.622954,0.573535,1.99461,-0.317178,-0.308821,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
4,-0.696993,-0.457191,-0.396478,-0.244931,-0.488636,-0.29643,-0.045196,0.142551,-0.317178,-0.308821,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False


### (Optional) Save processed dataset

In [14]:
# Save to CSV for downstream modeling
out_csv = "../processed_online_shoppers_data.csv"
final_df.to_csv(out_csv, index=False)
print(f"Processed dataset saved to: {out_csv}")


Processed dataset saved to: ../processed_online_shoppers_data.csv
