In [15]:
import math
import copy
import random
import numpy as np
import pandas as pd
from dataclasses import dataclass

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt

from pathlib import Path
import pandas as pd
import os
import tempfile

In [16]:
CSV_PATH   = "../data/CLEAN_BIGGEST_CHUNK-weather-PM2.5-data/weather-PM2.5-05T.csv"  # <- แก้ path ได้
DATE_COL   = "Date"
TARGET_COL = "PM2.5"

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
# Feature engineering
LAGS = [1, 2, 3, 7, 14, 30]
ROLL_WINDOWS = [3, 7, 14]

In [17]:
def load_and_clean(csv_path: str) -> pd.DataFrame:
    df = pd.read_csv(csv_path)
    df.columns = [c.strip() for c in df.columns]

    # Parse date (ของคุณเป็น m/d/YYYY ในตัวอย่าง)
    df[DATE_COL] = pd.to_datetime(df[DATE_COL], format="%m/%d/%Y", errors="coerce")

    # Convert numeric cols
    for c in df.columns:
        if c != DATE_COL:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df.dropna(subset=[DATE_COL])
    df = df.sort_values(DATE_COL).drop_duplicates(subset=[DATE_COL]).reset_index(drop=True)
    return df

def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["year"] = df[DATE_COL].dt.year
    df["month"] = df[DATE_COL].dt.month
    df["dayofweek"] = df[DATE_COL].dt.dayofweek
    df["dayofyear"] = df[DATE_COL].dt.dayofyear
    return df


def add_lag_and_rolling(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    df = df.copy()

    # lag features (past only)
    for lag in LAGS:
        df[f"{target_col}_lag{lag}"] = df[target_col].shift(lag)

    # rolling features (past only) -> shift(1) before rolling
    for w in ROLL_WINDOWS:
        s = df[target_col].shift(1)
        df[f"{target_col}_rollmean{w}"] = s.rolling(window=w, min_periods=1).mean()
        df[f"{target_col}_rollstd{w}"]  = s.rolling(window=w, min_periods=2).std()

    return df

In [18]:
def add_wind_dir_sincos_deg(
    df: pd.DataFrame,
    wind_dir_col: str,
    drop_original: bool = True
) -> pd.DataFrame:
    df = df.copy()

    # แปลงเป็นตัวเลขก่อน (เผื่อมี string)
    wd = pd.to_numeric(df[wind_dir_col], errors="coerce")

    # ทำให้อยู่ในช่วง 0..360 (กันค่าหลุด เช่น -10, 370)
    wd = wd % 360

    rad = np.deg2rad(wd)  # degrees -> radians

    df[f"{wind_dir_col}_sin"] = np.sin(rad)
    df[f"{wind_dir_col}_cos"] = np.cos(rad)

    if drop_original and wind_dir_col in df.columns:
        df = df.drop(columns=[wind_dir_col])

    return df


def add_time_features(
    df: pd.DataFrame,
    drop_original: bool = True
) -> pd.DataFrame:
    df = df.copy()

    df["year"] = df[DATE_COL].dt.year

    # month: 1..12 -> 0..11
    m = df[DATE_COL].dt.month - 1
    df["month_sin"] = np.sin(2*np.pi*m/12)
    df["month_cos"] = np.cos(2*np.pi*m/12)

    # dayofweek: 0..6
    dow = df[DATE_COL].dt.dayofweek
    df["dayofweek_sin"] = np.sin(2*np.pi*dow/7)
    df["dayofweek_cos"] = np.cos(2*np.pi*dow/7)

    # dayofyear: 1..365/366 -> ใช้จำนวนวันจริงของปีนั้น
    doy0 = df[DATE_COL].dt.dayofyear - 1
    days_in_year = df[DATE_COL].dt.is_leap_year.map({True: 366, False: 365}).astype(int)
    df["dayofyear_sin"] = np.sin(2*np.pi*doy0/days_in_year)
    df["dayofyear_cos"] = np.cos(2*np.pi*doy0/days_in_year)

    # ถ้าคุณเคยสร้าง month/dayofweek/dayofyear ไว้แล้วในขั้นก่อนหน้า -> ลบทิ้ง
    if drop_original:
        cols_to_drop = [c for c in ["month", "dayofweek", "dayofyear"] if c in df.columns]
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)

    return df

import pandas as pd

def add_precip_lag_and_rolling(
    df: pd.DataFrame,
    precip_col: str = "Prec.",
    lags=(1, 2, 3, 7, 14),
    roll_windows=(3, 7, 14),
    add_roll_sum: bool = True,
    add_roll_mean: bool = True,
) -> pd.DataFrame:
    df = df.copy()

    # ---- lag features (past only) ----
    for lag in lags:
        df[f"{precip_col}_lag{lag}"] = df[precip_col].shift(lag)

    # ---- rolling features (past only) ----
    # ใช้ฝนในอดีตเท่านั้น: shift(1) ก่อน rolling
    s = df[precip_col].shift(1)

    for w in roll_windows:
        if add_roll_sum:
            df[f"{precip_col}_rollsum{w}"] = s.rolling(window=w, min_periods=1).sum()
        if add_roll_mean:
            df[f"{precip_col}_rollmean{w}"] = s.rolling(window=w, min_periods=1).mean()

    return df


df = load_and_clean(CSV_PATH)
df = add_time_features(df, drop_original=True)

df = add_lag_and_rolling(df, target_col=TARGET_COL)
df = add_wind_dir_sincos_deg(df, wind_dir_col="Wind Dir", drop_original=True)

# เพิ่ม lag/rolling ของฝน
df = add_precip_lag_and_rolling(df, precip_col="Prec.", lags=(1,2,3,7,14), roll_windows=(3,7,14))


df.tail


<bound method NDFrame.tail of            Date  Wind Speed  Temp.  Humi.  heatidx   Pres.  Prec.   Vis.  \
0    2018-04-10         3.2   30.4   69.7     36.0  1009.3    0.0  16207   
1    2018-04-11         4.5   30.2   79.1     38.0  1008.5    0.0  17389   
2    2018-04-12         4.8   30.3   78.9     38.1  1008.2    0.0  18160   
3    2018-04-13         4.8   30.5   79.7     39.2  1008.3    0.0  17780   
4    2018-04-14         5.0   30.6   79.5     39.4  1007.6    0.0  19379   
...         ...         ...    ...    ...      ...     ...    ...    ...   
2059 2024-09-23         2.5   29.3   72.3     33.8  1007.7    7.2  17307   
2060 2024-09-24         1.4   26.9   81.3     29.7  1009.1   40.2  17298   
2061 2024-09-25         1.2   27.5   80.0     30.8  1009.5   15.6  17275   
2062 2024-09-26         1.5   29.1   74.0     33.8  1009.0    6.4  17289   
2063 2024-09-27         1.8   29.0   74.5     33.6  1008.6    2.4  17295   

      PM2.5  year  ...  Prec._lag2  Prec._lag3  Prec._lag

In [14]:
def safe_to_csv(df: pd.DataFrame, out_path: str, index: bool = False, encoding: str = "utf-8-sig") -> None:
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # สร้างไฟล์ชั่วคราวในโฟลเดอร์เดียวกัน (กันปัญหา cross-device)
    fd, tmp_path = tempfile.mkstemp(prefix=out_path.stem + ".", suffix=".tmp", dir=str(out_path.parent))
    os.close(fd)  # ปิด handle ทันที (ให้ pandas เขียนเอง)

    try:
        df.to_csv(tmp_path, index=index, encoding=encoding)
        os.replace(tmp_path, out_path)  
    finally:
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

print("CWD =", os.getcwd())
print("Will save to =", Path(r"../data/cleaned_features.csv").resolve())            
safe_to_csv(df, r"../data/cleaned_features.csv")

CWD = d:\Data\CEPP\code\cleaning_code
Will save to = D:\Data\CEPP\code\data\cleaned_features.csv


target = C:\Users\ssupa\Code\Pm2.5_forcast\cleaning_code\Data\cleaned_features.csv
exists = False
