In [6]:
# Modeling Preprocessing Pipeline (no clipping outliers)

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import joblib

# Filesystem layout: the cleaned CSV is read from DATA_DIR and fitted
# artifacts are written to an 'artifacts' sub-directory next to it.
DATA_DIR = Path('/Users/doananh/Documents/đồ án DS/')
CLEAN_FILE = DATA_DIR / 'data_motobikes_clean.csv'
ARTIFACT_DIR = DATA_DIR / 'artifacts'
ARTIFACT_DIR.mkdir(exist_ok=True)

# Read the cleaned dataset; low_memory=False makes pandas parse the file in
# one pass instead of in chunks.
Df = pd.read_csv(CLEAN_FILE, low_memory=False)
print('Loaded cleaned shape:', Df.shape)

# Name of the target column; it must already be present in the cleaned file.
TARGET = 'gia_vnd_final'
assert TARGET in Df.columns, f'Missing target {TARGET}. Run preprocessing first.'

# Candidate feature names. Only those actually present in the loaded frame
# are kept, so a missing optional column does not break the cell.
_numeric_candidates = [
    'so_km',
    'nam_dang_ky',
    'dung_tich_cc',
    'trong_luong_kg',
    'len_title',
    'len_desc',
]
_categorical_candidates = [
    'thuong_hieu',
    'dong_xe',
    'tinh_trang',
    'loai_xe',
    'xuat_xu',
    'tinh_thanh',
    'quan',
]
NUMERIC_FEATURES = [name for name in _numeric_candidates if name in Df.columns]
CATEGORICAL_FEATURES = [name for name in _categorical_candidates if name in Df.columns]

print('Numeric:', NUMERIC_FEATURES)
print('Categorical:', CATEGORICAL_FEATURES)

# Feature matrix (numeric columns first, then categorical) and target vector,
# both copied so later edits do not write back into Df.
_feature_columns = NUMERIC_FEATURES + CATEGORICAL_FEATURES
X = Df[_feature_columns].copy()
y = Df[TARGET].copy()

# ColumnTransformer: numeric median; categorical most_frequent + OHE(ignore unknown)
numeric_tf = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# OneHotEncoder: scikit-learn 1.2 introduced `sparse_output` as the replacement
# for `sparse`, and the old keyword was removed in a later release.
# The supported keyword is detected from the constructor signature rather than
# by comparing version strings: string comparison is lexicographic, so
# '1.10.0' >= '1.2.0' evaluates to False and would pick the removed keyword.
import inspect
import sklearn
sklearn_version = sklearn.__version__  # kept for reference; not used for branching
_ohe_sparse_kw = (
    'sparse_output'
    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters
    else 'sparse'
)
ohe_params = {'handle_unknown': 'ignore', _ohe_sparse_kw: True}

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen at fit time are ignored at transform.
categorical_tf = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(**ohe_params))
])

# Route each feature group to its pipeline; any column not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_tf, NUMERIC_FEATURES),
        ('cat', categorical_tf, CATEGORICAL_FEATURES),
    ],
    remainder='drop'
)

# Fit the imputers and the one-hot encoder on the full feature frame.
# Outliers are deliberately left unclipped at this stage.
preprocessor.fit(X)

# Store the fitted transformer together with the feature lists and the
# target name in a single joblib file.
artifact_path = ARTIFACT_DIR / 'preprocessor.joblib'
_artifact_bundle = dict(
    preprocessor=preprocessor,
    numeric_features=NUMERIC_FEATURES,
    categorical_features=CATEGORICAL_FEATURES,
    target=TARGET,
)
joblib.dump(_artifact_bundle, artifact_path)
print('Saved preprocessor to:', artifact_path)

# Smoke test: transform the first 100 rows and report the resulting shape.
Xt = preprocessor.transform(X.head(100))
print('Sample transformed shape:', Xt.shape)


Loaded cleaned shape: (7208, 30)
Numeric: ['so_km', 'nam_dang_ky', 'dung_tich_cc', 'trong_luong_kg', 'len_title', 'len_desc']
Categorical: ['thuong_hieu', 'dong_xe', 'tinh_trang', 'loai_xe', 'xuat_xu', 'tinh_thanh', 'quan']
Saved preprocessor to: /Users/doananh/Documents/đồ án DS/artifacts/preprocessor.joblib
Sample transformed shape: (100, 278)
