# Feature Engineering

In this notebook, we will create and refine features to improve model performance. 

In [16]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# I. Daily data

In [17]:
# Read the processed daily train/test splits. The feature and target matrices
# are rebuilt from these frames by create_features_and_split, so the pre-split
# X_*/y_* workbooks are intentionally not loaded here.
read_dir = '../data/processed/'
train_data = pd.read_excel(f'{read_dir}train_data.xlsx')
test_data = pd.read_excel(f'{read_dir}test_data.xlsx')

In [18]:
def create_features_and_split(data, windows=(7, 28, 56, 91), lags=(1, 3, 5, 7),
                              horizons=(1, 2, 3, 4, 5), target_col='temp'):
    """Engineer weather features and multi-horizon targets from one data split.

    Every shift/rolling operation works on row positions, not on calendar
    dates, so the frame is assumed to be sorted chronologically with one row
    per time step (TODO: confirm against the preprocessing step).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``tempmax``, ``tempmin``, ``temp``, ``dew``,
        ``feelslike``, ``solarenergy``, ``solarradiation``, ``sunrise``,
        ``sunset``, ``windspeed``, ``winddir``, ``windgust``,
        ``sealevelpressure`` and ``cloudcover``. ``sunrise``/``sunset`` (and
        ``datetime``, when present) must be datetime-typed. Any other column
        named in the rolling/lag lists below is used when present and skipped
        otherwise. The frame is copied, never modified.
    windows : iterable of int, default (7, 28, 56, 91)
        Rolling-window lengths (in rows) for the mean/variance features.
    lags : iterable of int, default (1, 3, 5, 7)
        Row offsets for the lag features.
    horizons : iterable of int, default (1, 2, 3, 4, 5)
        Forecast steps; one target column ``y_<target_col>_<h>`` is created
        per horizon ``h``.
    target_col : str, default 'temp'
        Column whose future values become the targets.

    Returns
    -------
    X : pandas.DataFrame
        Rolling, lag and same-row features.
    y : pandas.DataFrame
        Future values of ``target_col``, one column per horizon.
        Rows holding a NaN in any feature or target are dropped from both, so
        the first ``max(windows)`` rows and the last ``max(horizons)`` rows of
        the input never appear in the output.
    """
    df = data.copy()

    # ---- Same-row derived features ------------------------------------
    df['spread'] = df['tempmax'] - df['tempmin']
    df['temp_dew_diff'] = df['temp'] - df['dew']
    df['feelslike_diff'] = df['feelslike'] - df['temp']
    # 1 when tempmax stayed >= 35 over the current row and up to two previous
    # rows (min_periods=1, so the first rows use a shorter window).
    df['is_heatwave'] = (df['tempmax'].rolling(window=3, min_periods=1).min() >= 35).astype(int)
    # NOTE(review): has the shape of a Hargreaves-style evapotranspiration
    # estimate with solarenergy as the radiation term -- confirm units.
    # A negative spread would yield NaN here and the row would be dropped.
    df['PET'] = (0.0023 * df['solarenergy'] * 0.408 * np.sqrt(df['spread']) * (df['temp'] + 17.8))
    df['daylight_duration_hours'] = (df['sunset'] - df['sunrise']).dt.total_seconds() / 3600

    # Decompose wind into two components from speed and direction in degrees.
    wind_angle = 2 * np.pi * df['winddir'] / 360
    df['wind_U'] = df['windspeed'] * np.sin(wind_angle)
    df['wind_V'] = df['windspeed'] * np.cos(wind_angle)

    # Despite the name this is the change over 3 rows, not 1.
    df['pressure_daily_change'] = df['sealevelpressure'].diff(3)
    df['solar_cloud_interaction'] = df['solarradiation'] * (1 - (df['cloudcover'] / 100))
    # The small epsilon avoids division by zero on calm rows.
    df['wind_gust_ratio'] = df['windgust'] / (df['windspeed'] + 1e-6)

    # Cyclical encodings of month and day-of-year (only if a date is available).
    if 'datetime' in df.columns:
        month_angle = 2 * np.pi * df['datetime'].dt.month / 12
        day_angle = 2 * np.pi * df['datetime'].dt.day_of_year / 365.25
        df['month_sin'] = np.sin(month_angle)
        df['month_cos'] = np.cos(month_angle)
        df['day_sin'] = np.sin(day_angle)
        df['day_cos'] = np.cos(day_angle)

    # Columns summarised with rolling mean/variance.
    roll_cols = ['dew', 'humidity', 'precip', 'precipcover',
                 'windgust', 'windspeed', 'sealevelpressure',
                 'cloudcover', 'visibility', 'solarradiation',
                 'solarenergy', 'uvindex', 'conditions_Clear',
                 'conditions_Overcast', 'conditions_Partially cloudy',
                 'conditions_Rain', 'conditions_Rain, Overcast',
                 'conditions_Rain, Partially cloudy', 'spread',
                 'temp_dew_diff', 'feelslike_diff', 'is_heatwave',
                 'PET', 'daylight_duration_hours', 'wind_U', 'wind_V']

    # Columns exposed as plain lagged values.
    lag_cols = ['spread', 'humidity', 'dew', 'precip',
                'precipcover', 'solarradiation',
                'sealevelpressure', 'windspeed', 'winddir',
                'windgust', 'cloudcover', 'visibility']

    # Columns passed through unshifted (current-row values).
    passthrough_cols = ['day_cos', 'day_sin', 'month_cos', 'month_sin',
                        'wind_gust_ratio', 'solar_cloud_interaction',
                        'pressure_daily_change']

    # Collect every feature as a Series and concatenate once at the end.
    feature_series = []

    for col in roll_cols:
        if col not in df.columns:
            continue
        prefix = col.upper()
        # shift(1) keeps the current row out of its own window.
        previous = df[col].shift(1)
        for w in windows:
            window = previous.rolling(window=w)
            feature_series.append(window.mean().rename(f"{w}D_AVG_{prefix}"))
            feature_series.append(window.var().rename(f"{w}D_VAR_{prefix}"))

    for col in lag_cols:
        if col not in df.columns:
            continue
        prefix = col.upper()
        for lag in lags:
            feature_series.append(df[col].shift(lag).rename(f"{prefix}_LAG_{lag}"))

    feature_series.extend(df[col] for col in passthrough_cols if col in df.columns)

    features_df = pd.concat(feature_series, axis=1)

    # Targets: the value of target_col h rows ahead, for every horizon h.
    target_data = {f"y_{target_col}_{h}": df[target_col].shift(-h) for h in horizons}
    target_cols = list(target_data.keys())
    targets_df = pd.DataFrame(target_data, index=df.index)

    # Keep only rows where every feature and every target is available.
    full_df = pd.concat([features_df, targets_df], axis=1).dropna()

    X = full_df.drop(columns=target_cols)
    y = full_df[target_cols]

    return X, y

In [19]:
# Engineer features and targets for each split independently. Rows without a
# full rolling/lag history (start of each split) or without every future
# target (end of each split) are dropped inside create_features_and_split.
X_train, y_train = create_features_and_split(train_data)
X_test, y_test = create_features_and_split(test_data)

In [20]:
# Align the test features with the training schema: columns present only in
# X_train are added to X_test filled with 0, columns present only in X_test
# are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Hyper-parameters shared by every per-horizon model.
RF_PARAMS = {
    'n_estimators': 300,
    'max_depth': 10,
    'random_state': 42,
    'n_jobs': -1,
}

target_cols = list(y_train.columns)
models = {}   # fitted estimator per target column
metrics = {}  # {'MAE': ..., 'MSE': ...} per target column

# Fit and score one independent random forest per target column.
for target_col in target_cols:
    model = RandomForestRegressor(**RF_PARAMS)
    model.fit(X_train, y_train[target_col])
    models[target_col] = model

    y_test_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test[target_col], y_test_pred)
    mse = mean_squared_error(y_test[target_col], y_test_pred)

    metrics[target_col] = {'MAE': mae, 'MSE': mse}
    print(f"{target_col}: MSE={mse:.3f}, MAE={mae:.3f}")

# Aggregate across targets from the recorded metrics instead of running totals.
total_mae = sum(scores['MAE'] for scores in metrics.values())
total_mse = sum(scores['MSE'] for scores in metrics.values())

avg_mae = total_mae / len(target_cols)
avg_mse = total_mse / len(target_cols)

print(f"Average MSE: {avg_mse:.3f}")
print(f"Average MAE: {avg_mae:.3f}")

y_temp_1: MSE=4.179, MAE=1.633
y_temp_2: MSE=5.993, MAE=1.955
y_temp_3: MSE=6.976, MAE=2.100
y_temp_4: MSE=7.130, MAE=2.111
y_temp_5: MSE=7.153, MAE=2.116
Average MSE: 6.286
Average MAE: 1.983


In [21]:
# TODO: save the engineered train/test sets for training

# II. Hourly data

In [22]:
# Read the processed hourly splits together with their X/y workbooks
train_data_h = pd.read_excel(f'{read_dir}train_data_h.xlsx')
test_data_h = pd.read_excel(f'{read_dir}test_data_h.xlsx')
X_train_h = pd.read_excel(f'{read_dir}X_train_h.xlsx')
y_train_h = pd.read_excel(f'{read_dir}y_train_h.xlsx')
X_test_h = pd.read_excel(f'{read_dir}X_test_h.xlsx')
y_test_h = pd.read_excel(f'{read_dir}y_test_h.xlsx')

In [23]:
# TODO: compute the features

In [24]:
# TODO: save the engineered train/test sets for training