# 03. Feature Engineering

Notebook này tạo các đặc trưng mới để cải thiện hiệu suất mô hình:
- Tổng hợp features (total duration, total pages)
- Tỷ lệ features (duration ratios, page ratios)
- Duration per page features
- Seasonal features (quarter, Q4 indicator)
- Interaction features

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Import project modules
import sys
from pathlib import Path

# Locate the project root: the nearest directory, starting from the notebook's
# working directory and walking upwards, that contains both config/ and src/.
def find_project_root(start, max_levels=5):
    """Return the closest directory at or above ``start`` holding ``config`` and ``src``.

    Parameters
    ----------
    start : str or Path
        Directory where the search begins; it is resolved to an absolute path.
    max_levels : int, default 5
        Maximum number of parent directories above ``start`` to inspect.

    Returns
    -------
    Path
        The first directory (``start`` itself included) that contains both a
        ``config`` and a ``src`` entry. When no such directory is found within
        ``max_levels`` parents, the parent of ``start`` if the string form of
        ``start`` contains ``'notebooks'``, otherwise ``start`` itself.
    """
    start = Path(start).resolve()
    # Path.parents stops at the filesystem root, so the walk cannot overshoot it
    # and running out of parents is treated exactly like "not found".
    candidates = [start, *start.parents][: max(max_levels, 0) + 1]
    for candidate in candidates:
        if (candidate / 'config').exists() and (candidate / 'src').exists():
            return candidate

    # No marker directories found: fall back to the notebooks-folder heuristic
    # instead of silently returning whichever ancestor the walk ended on.
    if 'notebooks' in str(start):
        return start.parent
    return start


current_path = Path().resolve()
max_levels = 5
project_root = find_project_root(current_path, max_levels)

# Add src to Python path (only once, so re-running this cell does not keep
# prepending duplicate entries to sys.path)
src_path = project_root / 'src'
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

from features.engineering import engineer_all_features

## 1. Load Data

In [2]:
# Load the preprocessed dataset (df_encoded.csv) from data/processed
processed_dir = project_root.joinpath("data", "processed")
encoded_csv = processed_dir / "df_encoded.csv"
df = pd.read_csv(encoded_csv)

print(f"Data shape: {df.shape}")
df.head(n=5)

Data shape: (12330, 18)


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,2,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,2,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,2,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,2,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,2,1,0


## 2. Apply Feature Engineering

In [3]:
# Run the full feature-engineering step on the loaded frame
df_engineered = engineer_all_features(df)

# Columns of the engineered frame that the input frame does not have,
# listed in the order they appear in the engineered frame
original_columns = set(df.columns)
new_features = [col for col in df_engineered.columns if col not in original_columns]

# Report column counts before and after, then the names of the additions
print(f"Original features: {len(df.columns)}")
print(f"Engineered features: {len(df_engineered.columns)}")
print(f"New features added: {len(df_engineered.columns) - len(df.columns)}")
print(f"\nNew features: {new_features}")

Original features: 18
Engineered features: 35
New features added: 17

New features: ['total_duration', 'total_pages', 'admin_duration_ratio', 'informational_duration_ratio', 'product_duration_ratio', 'admin_pages_ratio', 'informational_pages_ratio', 'product_pages_ratio', 'admin_duration_per_page', 'informational_duration_per_page', 'product_duration_per_page', 'avg_duration_per_page', 'is_q4', 'quarter', 'PageValues_x_ProductRelated', 'SpecialDay_x_PageValues', 'BounceRates_x_ExitRates']


## 3. Save Engineered Features

In [4]:
# Write the engineered dataset to data/features, creating the directory if needed
features_dir = project_root.joinpath("data", "features")
features_dir.mkdir(parents=True, exist_ok=True)

engineered_csv = features_dir / "df_engineered.csv"
df_engineered.to_csv(engineered_csv, index=False)

print("Engineered features saved successfully!")

Engineered features saved successfully!
