# 02. Data Preprocessing

Notebook này thực hiện tiền xử lý dữ liệu:
- Kiểm tra missing values
- Phát hiện outliers (giữ nguyên, chưa loại bỏ ở bước này)
- Encoding categorical variables
- Chia tập train/validation/test
- Lưu dữ liệu đã xử lý

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Import project modules
import sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains both config/ and src/.
current_path = Path().resolve()
max_levels = 5  # number of parent directories to climb before giving up


def find_project_root(start, max_levels=5, markers=('config', 'src')):
    """Return the closest ancestor of ``start`` (inclusive) holding all ``markers``.

    Args:
        start: Directory (``pathlib.Path``) where the search begins.
        max_levels: Maximum number of parent directories to climb above ``start``.
        markers: Entry names that must all exist directly inside the root.

    Returns:
        The matching directory, or ``None`` when neither ``start`` nor any of
        its first ``max_levels`` parents contains every marker.
    """
    candidate = start
    # +1 so that the directory reached after the last climb is also checked.
    for _ in range(max_levels + 1):
        if all((candidate / marker).exists() for marker in markers):
            return candidate
        if candidate.parent == candidate:
            # Filesystem root reached without a match.
            break
        candidate = candidate.parent
    return None


project_root = find_project_root(current_path, max_levels)
if project_root is None:
    # Markers not found. Never leave project_root pointing at an arbitrary
    # ancestor (or the filesystem root): fall back to the parent directory when
    # the path mentions 'notebooks', otherwise to the working directory itself.
    if 'notebooks' in str(current_path):
        project_root = current_path.parent
    else:
        project_root = current_path

# Add src to Python path (only once, so re-running the cell stays idempotent)
src_path = project_root / 'src'
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

from data.load_data import load_raw_data
from data.preprocess import (
    check_missing_values, 
    check_outliers, 
    encode_categorical,
    split_data
)

## 1. Load Raw Data

In [2]:
# Read the raw dataset through the project loader, then report its
# dimensions and column names.
df = load_raw_data()
print(
    f"Initial shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    sep="\n",
)
# Preview the first five rows (last expression of the cell is displayed).
df.head(5)

Initial shape: (12330, 18)

Columns: ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Revenue']


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


## 2. Check Missing Values

In [3]:
# Summarise missing values per column with the project helper; the summary
# exposes a 'Missing Count' column that drives both reports below.
missing_df = check_missing_values(df)
columns_with_gaps = missing_df[missing_df['Missing Count'] > 0]

print("Missing Values Summary:")
print(columns_with_gaps)

# Report whether any follow-up handling is required.
if missing_df['Missing Count'].sum() != 0:
    print("\nNeed to handle missing values...")
else:
    print("\n✓ No missing values found!")

Missing Values Summary:
Empty DataFrame
Columns: [Missing Count, Missing Percentage]
Index: []

✓ No missing values found!


## 3. Outlier Detection

In [4]:
# Run the project's outlier check over every numeric column of the raw frame.
# NOTE(review): the numeric selection also picks up integer-coded columns
# (OperatingSystems, Browser, Region, TrafficType appear in the output); these
# look like nominal category codes, so their outlier counts may not be
# meaningful - confirm before acting on them.
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)
outlier_summary = check_outliers(df, columns=numerical_cols)

print("Outlier Summary:", outlier_summary, sep="\n")
print("\nNote: Outliers are kept for now as they may contain valuable information.")

Outlier Summary:
                     Column  Outlier Count  Percentage
0            Administrative            404    3.276561
1   Administrative_Duration           1172    9.505272
2             Informational           2631   21.338200
3    Informational_Duration           2405   19.505272
4            ProductRelated            987    8.004866
5   ProductRelated_Duration            961    7.793998
6               BounceRates           1551   12.579075
7                 ExitRates           1099    8.913220
8                PageValues           2730   22.141119
9                SpecialDay           1251   10.145985
10         OperatingSystems            111    0.900243
11                  Browser           4369   35.433901
12                   Region            511    4.144363
13              TrafficType           2101   17.039740

Note: Outliers are kept for now as they may contain valuable information.


## 4. Encode Categorical Variables

In [5]:
# Feature columns stored as text or booleans need numeric codes. The target
# column ('Revenue') is excluded here and encoded separately.
categorical_cols = [
    col
    for col in df.select_dtypes(include=['object', 'bool']).columns
    if col != 'Revenue'
]

print(f"Categorical columns to encode: {categorical_cols}")

# The helper returns the encoded frame plus the encoders it used
# (LabelEncoder-based according to the original note - verify in preprocess.py).
df_encoded, label_encoders = encode_categorical(df, columns=categorical_cols)

print("\nEncoded data shape:", df_encoded.shape)
print("\nEncoded data types:", df_encoded.dtypes, sep="\n")

Categorical columns to encode: ['Month', 'VisitorType', 'Weekend']

Encoded data shape: (12330, 18)

Encoded data types:
Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                        int64
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                  int64
Weekend                      int64
Revenue                       bool
dtype: object


## 5. Encode Target Variable

In [6]:
# Encode Revenue (target variable) as 0/1 integers.
# - bool dtype: direct cast.
# - object dtype: values are normalised (stringified, stripped, lower-cased)
#   before mapping, so Python bools stored as objects and variants such as
#   'TRUE' / ' true' are encoded correctly instead of silently becoming 0.
#   Anything that is not a true/false value raises, because a silently
#   mislabeled target would corrupt every downstream model.
# - any other dtype (e.g. already int after a re-run) is left untouched.
revenue = df_encoded['Revenue']
if revenue.dtype == 'bool':
    df_encoded['Revenue'] = revenue.astype(int)
elif revenue.dtype == 'object':
    normalized = revenue.astype(str).str.strip().str.lower()
    encoded = normalized.map({'true': 1, 'false': 0})
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(normalized[unmapped].unique())
        raise ValueError(
            f"Cannot encode 'Revenue': unexpected value(s) {unexpected}; "
            "expected only True/False"
        )
    df_encoded['Revenue'] = encoded.astype(int)

print("Target variable distribution:")
print(df_encoded['Revenue'].value_counts())

Target variable distribution:
Revenue
0    10422
1     1908
Name: count, dtype: int64


## 6. Train/Validation/Test Split

In [7]:
# Partition the encoded frame into train / validation / test sets with the
# project helper. Options are gathered in one place so they are easy to tune.
split_options = {
    'target_col': 'Revenue',
    'test_size': 0.2,
    'val_size': 0.1,
    'random_state': 42,
    'stratify': True,
}
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df_encoded, **split_options)

# Report each split's size and its share of the full dataset.
print(
    "Data Split Summary:",
    f"Training set:   {X_train.shape[0]} samples ({X_train.shape[0]/len(df_encoded)*100:.1f}%)",
    f"Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(df_encoded)*100:.1f}%)",
    f"Test set:       {X_test.shape[0]} samples ({X_test.shape[0]/len(df_encoded)*100:.1f}%)",
    sep="\n",
)

Data Split Summary:
Training set:   8631 samples (70.0%)
Validation set: 1233 samples (10.0%)
Test set:       2466 samples (20.0%)


## 7. Save Processed Data

In [8]:
# Directory that receives every processed artifact (created on demand).
processed_dir = project_root / "data" / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

# File name -> object to persist as CSV. Insertion order is the write order:
# feature splits, target splits, then the full encoded frame.
csv_outputs = {
    "X_train.csv": X_train,
    "X_val.csv": X_val,
    "X_test.csv": X_test,
    "y_train.csv": y_train,
    "y_val.csv": y_val,
    "y_test.csv": y_test,
    "df_encoded.csv": df_encoded,
}
for file_name, table in csv_outputs.items():
    # index=False: the row index is not written to the file.
    table.to_csv(processed_dir / file_name, index=False)

print("✓ Processed data saved successfully!")

✓ Processed data saved successfully!
