# Feature Engineering

In this notebook, we will create and refine features to improve model performance. 

In [1]:
import pandas as pd
import numpy as np

# I. Daily data

In [2]:
# Load the preprocessed daily splits.
# `read_dir` is kept as a top-level name because later cells reuse it.
read_dir = '../data/processed/'

# Variables on the left are bound in the same order as the file names below.
train_data, test_data, X_train, y_train, X_test, y_test = (
    pd.read_excel(read_dir + workbook)
    for workbook in (
        'train_data.xlsx',
        'test_data.xlsx',
        'X_train.xlsx',
        'y_train.xlsx',
        'X_test.xlsx',
        'y_test.xlsx',
    )
)

In [3]:
def add_daily_features(df, heatwave_threshold=35, heatwave_days=3):
    """Add derived daily weather features to ``df`` in place and return it.

    Reads the columns ``tempmax``, ``tempmin``, ``temp``, ``dew``,
    ``feelslike``, ``humidity``, ``windgust``, ``windspeed``, ``winddir``,
    ``solarenergy``, ``sunrise`` and ``sunset`` and writes the columns
    ``spread``, ``temp_dew_diff``, ``feelslike_diff``, ``is_heatwave``,
    ``air_saturation``, ``wind_gust_ratio``, ``PET``,
    ``daylight_duration_hours``, ``wind_U`` and ``wind_V``.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily observations. The heatwave flag uses a positional rolling
        window, so rows are assumed to be consecutive days in chronological
        order -- TODO confirm against the preprocessing notebook.
        ``sunrise`` and ``sunset`` must be datetime-like (their difference is
        accessed through ``.dt``).
    heatwave_threshold : float, default 35
        A row is flagged when every ``tempmax`` in its window is >= this value.
    heatwave_days : int, default 3
        Length of the trailing window used for the heatwave flag.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns added.
    """

    def _safe_ratio(numerator, denominator):
        # A zero denominator makes pandas return +/-inf; store NaN instead so
        # the value is handled as missing rather than as an extreme number.
        return (numerator / denominator).replace([np.inf, -np.inf], np.nan)

    # Simple differences between temperature-like columns.
    df['spread'] = df['tempmax'] - df['tempmin']
    df['temp_dew_diff'] = df['temp'] - df['dew']
    df['feelslike_diff'] = df['feelslike'] - df['temp']

    # min_periods=1 means the first rows are judged on fewer than
    # `heatwave_days` observations instead of being left undefined.
    coolest_recent_max = df['tempmax'].rolling(window=heatwave_days, min_periods=1).min()
    df['is_heatwave'] = (coolest_recent_max >= heatwave_threshold).astype(int)

    # Ratios, guarded against division by zero.
    df['air_saturation'] = _safe_ratio(df['humidity'], df['dew'])
    df['wind_gust_ratio'] = _safe_ratio(df['windgust'], df['windspeed'])

    # Same functional form as the Hargreaves reference-evapotranspiration
    # equation, with `solarenergy` used as the radiation term.
    # NOTE(review): confirm the units of `solarenergy` match what 0.408 expects.
    df['PET'] = (0.0023 * df['solarenergy'] * 0.408 * np.sqrt(df['spread']) * (df['temp'] + 17.8))

    df['daylight_duration_hours'] = (df['sunset'] - df['sunrise']).dt.total_seconds() / 3600

    # Split wind speed into sine/cosine components of the direction angle
    # (`winddir` is treated as degrees: it is divided by 360 and scaled by 2*pi).
    wind_angle = 2 * np.pi * df['winddir'] / 360
    df['wind_U'] = df['windspeed'] * np.sin(wind_angle)
    df['wind_V'] = df['windspeed'] * np.cos(wind_angle)

    return df


# Assigning the result back keeps the cell from echoing the whole frame;
# `train_data` is still the same object, now carrying the extra columns.
train_data = add_daily_features(train_data)

In [5]:
# Columns of `train_data` that receive rolling statistics.
cols_to_roll = [
    # daily measurements
    'dew', 'humidity', 'precip', 'precipcover', 'windgust', 'windspeed',
    'winddir', 'sealevelpressure', 'cloudcover', 'visibility',
    'solarradiation', 'solarenergy', 'uvindex', 'severerisk', 'moonphase',
    # `conditions_*` columns
    'conditions_Clear', 'conditions_Overcast', 'conditions_Partially cloudy',
    'conditions_Rain', 'conditions_Rain, Overcast',
    'conditions_Rain, Partially cloudy',
    # features engineered earlier in this notebook
    'spread', 'temp_dew_diff', 'feelslike_diff', 'is_heatwave',
    'air_saturation', 'wind_gust_ratio', 'PET', 'daylight_duration_hours',
    'wind_U', 'wind_V',
]

# Window lengths in rows: 7, 14, ..., 91 (every multiple of 7 up to 91).
windows = list(range(7, 92, 7))

# Start from the datetime column; the statistics are attached in one concat
# at the end rather than inserted one column at a time.
features_df = pd.DataFrame({'datetime': train_data['datetime']})
all_new_columns = []

for base_col in cols_to_roll:
    label = base_col.upper()
    source = train_data[base_col]
    for span in windows:
        # With an integer window and no min_periods, the first `span - 1`
        # rows of each statistic are NaN.
        roller = source.rolling(window=span)
        all_new_columns.extend([
            roller.mean().rename(f"{span}D_AVG_{label}"),
            roller.var().rename(f"{span}D_VAR_{label}"),
        ])

features_df = pd.concat([features_df, *all_new_columns], axis=1)

features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3134 entries, 0 to 3133
Columns: 807 entries, datetime to 91D_VAR_WIND_V
dtypes: datetime64[ns](1), float64(806)
memory usage: 19.3 MB


In [64]:
# TODO: save the train/test datasets for training

# II. Hourly data

In [65]:
# Load the preprocessed hourly splits from the same directory as the daily
# ones (`read_dir` is defined in the daily loading cell above).
# Variables on the left are bound in the same order as the file names below.
train_data_h, test_data_h, X_train_h, y_train_h, X_test_h, y_test_h = (
    pd.read_excel(read_dir + workbook)
    for workbook in (
        'train_data_h.xlsx',
        'test_data_h.xlsx',
        'X_train_h.xlsx',
        'y_train_h.xlsx',
        'X_test_h.xlsx',
        'y_test_h.xlsx',
    )
)

In [66]:
# TODO: compute the features

In [67]:
# TODO: save the train/test datasets for training