# In [None]:
# Cell 1: Imports
import sys
import os
import json
import joblib
import pandas as pd
sys.path.append(os.path.abspath(os.path.join('..')))
from src.classification_library import load_data, clean_data
from src.regression_library import create_lag_features, split_train_test, train_model, evaluate_model

# Cell 2: Parameters
# tags=["parameters"]
# Plain NAME = literal assignments so a notebook parameter-injection tool can
# override them; values below are the defaults.

# --- Input / output locations (relative to the notebook's directory) ---
# Raw data archive handed to load_data().
RAW_ZIP_PATH = "../data/raw/PRSA2017_Data_20130301-20170228.zip"
# Destination for the regression dataset.
# NOTE(review): not referenced in the visible part of this script — confirm
# whether a later cell writes it.
OUTPUT_REG_DATASET_PATH = "../data/processed/dataset_for_regression.parquet"
# JSON file that receives the evaluation metrics.
METRICS_OUT = "../data/processed/regression_metrics.json"

# --- Modelling settings ---
# Column to forecast; passed to create_lag_features().
TARGET_COL = "PM2.5"
# Lag offsets passed to create_lag_features() (presumably hours, per the name).
LAG_HOURS = [1, 3, 24]
# Forecast horizon passed to create_lag_features().
HORIZON = 1
# Date boundary passed to split_train_test() to separate train from test.
CUTOFF = "2017-01-01"

# Cell 3: Main Execution
# Regression step: load and clean the raw data, build lag features for a
# single station, split into train/test at CUTOFF, train a model, evaluate it,
# and persist the resulting metrics as JSON at METRICS_OUT.
print("--- BẮT ĐẦU BƯỚC 2: REGRESSION ---")

# 1. Load & clean again from the raw archive (to stay consistent with other steps)
df = clean_data(load_data(RAW_ZIP_PATH))

# 2. Use the Aotizhongxin station for the demo (or run on the full data if desired)
station_name = 'Aotizhongxin'
print(f"Đang xử lý trạm: {station_name}")
df_station = df[df['station'] == station_name].copy()
if df_station.empty:
    # Fail early with a clear message instead of letting an empty frame flow
    # into feature creation and training, where the error would be obscure.
    raise ValueError(
        f"No rows found for station {station_name!r} in {RAW_ZIP_PATH}"
    )

# 3. Create lag features; also returns the feature column list and target name
df_reg, features, target_name = create_lag_features(df_station, TARGET_COL, LAG_HOURS, HORIZON)
print(f"Features sử dụng: {features}")

# 4. Train/test split at the CUTOFF date
train_df, test_df = split_train_test(df_reg, CUTOFF)
print(f"Train size: {train_df.shape}, Test size: {test_df.shape}")

# 5. Train the model
print("Đang huấn luyện mô hình...")
model = train_model(train_df, features, target_name)

# 6. Evaluate on the held-out set.
# `preds` is not used below but is kept in scope for any follow-up cells.
metrics, preds = evaluate_model(model, test_df, features, target_name)
print("Kết quả đánh giá:", metrics)

# 7. Save the metrics.
# Create the output directory first so a fresh checkout does not lose the
# whole training run to a FileNotFoundError at the very last step.
metrics_dir = os.path.dirname(METRICS_OUT)
if metrics_dir:
    os.makedirs(metrics_dir, exist_ok=True)
# Explicit encoding keeps the file identical across platforms; indentation
# makes the JSON human-readable.
with open(METRICS_OUT, 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)
print(f"Đã lưu metrics tại: {METRICS_OUT}")