# Feature Engineering

In this notebook, we will create and refine features to improve model performance. 

In [14]:
import pandas as pd
import numpy as np

# I. Data daily

In [15]:
# Load the dataset
read_dir = '../data/processed/'
train_data = pd.read_excel(read_dir + 'train_data.xlsx')
test_data = pd.read_excel(read_dir + 'test_data.xlsx')
X_train = pd.read_excel(read_dir + 'X_train.xlsx')
y_train = pd.read_excel(read_dir + 'y_train.xlsx')
X_test = pd.read_excel(read_dir + 'X_test.xlsx')
y_test = pd.read_excel(read_dir + 'y_test.xlsx')

In [16]:
train_data['spread'] = train_data['tempmax'] - train_data['tempmin']
train_data['temp_dew_diff'] = train_data['temp'] - train_data['dew']
train_data['feelslike_diff'] = train_data['feelslike'] - train_data['temp']
train_data['is_heatwave'] = (train_data['tempmax'].rolling(window=3, min_periods=1).min() >= 35).astype(int)
train_data['PET'] = (0.0023 * train_data['solarenergy'] * 0.408 * np.sqrt(train_data['tempmax'] - train_data['tempmin']) * (train_data['temp'] + 17.8))
train_data['daylight_duration_hours'] = (train_data['sunset'] - train_data['sunrise']).dt.total_seconds() / 3600
train_data['wind_U'] = train_data['windspeed'] * np.sin(2 * np.pi * train_data['winddir'] / 360)
train_data['wind_V'] = train_data['windspeed'] * np.cos(2 * np.pi * train_data['winddir'] / 360)
train_data['pressure_daily_change'] = train_data['sealevelpressure'].diff(3)
train_data['solar_cloud_interaction'] = train_data['solarradiation'] * (1 - (train_data['cloudcover'] / 100))
train_data['wind_gust_ratio'] = train_data['windgust'] / (train_data['windspeed'] + 1e-6)
train_data['month_sin'] = np.sin(2 * np.pi * train_data['datetime'].dt.month / 12)
train_data['month_cos'] = np.cos(2 * np.pi * train_data['datetime'].dt.month / 12)
train_data['day_sin'] = np.sin(2 * np.pi * train_data['datetime'].dt.day_of_year / 365.25)
train_data['day_cos'] = np.cos(2 * np.pi * train_data['datetime'].dt.day_of_year / 365.25)

In [17]:
roll_cols = ['dew', 'humidity', 'precip', 'precipcover', 
            'windgust', 'windspeed',  'sealevelpressure', 
            'cloudcover', 'visibility', 'solarradiation', 
            'solarenergy', 'uvindex', 'conditions_Clear', 
            'conditions_Overcast', 'conditions_Partially cloudy', 
            'conditions_Rain', 'conditions_Rain, Overcast', 
            'conditions_Rain, Partially cloudy','spread', 
            'temp_dew_diff', 'feelslike_diff', 'is_heatwave', 
            'PET', 'daylight_duration_hours', 'wind_U', 'wind_V']

lag_cols = ['spread', 'humidity', 'dew', 'precip',
            'precipcover', 'solarradiation',
            'sealevelpressure', 'windspeed', 'winddir',
            'windgust', 'cloudcover', 'visibility']

cols = ['day_cos', 'day_sin', 'month_cos', 'month_sin', 
        'wind_gust_ratio', 'solar_cloud_interaction', 
        'pressure_daily_change']

windows = [7, 28, 56, 91]
lags = [1, 3, 5, 7]

features_df = pd.DataFrame(index=train_data.index)

all_new_columns = []

for col in roll_cols:
    col_name_upper = col.upper()
    for w in windows:
        mean_name = f"{w}D_AVG_{col_name_upper}"
        var_name = f"{w}D_VAR_{col_name_upper}"
        
        mean_series = train_data[col].shift(1).rolling(window=w).mean()
        mean_series.name = mean_name
        
        var_series = train_data[col].shift(1).rolling(window=w).var()
        var_series.name = var_name
        
        all_new_columns.append(mean_series)
        all_new_columns.append(var_series)

for col in lag_cols:
    col_name_upper = col.upper()
    for l in lags:
        lag_name = f"{col_name_upper}_LAG_{l}"
        
        lag_series = train_data[col].shift(l)
        lag_series.name = lag_name
        
        all_new_columns.append(lag_series)

features_df = pd.concat([features_df] + all_new_columns, axis=1)

for col in cols:
    features_df[col] = train_data[col] 

features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3134 entries, 0 to 3133
Columns: 263 entries, 7D_AVG_DEW to pressure_daily_change
dtypes: float64(263)
memory usage: 6.3 MB


  features_df[col] = train_data[col]
  features_df[col] = train_data[col]
  features_df[col] = train_data[col]
  features_df[col] = train_data[col]
  features_df[col] = train_data[col]
  features_df[col] = train_data[col]
  features_df[col] = train_data[col]


In [18]:
# Lưu dữ liệu tập train test để training

# II. Data hourly

In [19]:
# Load the dataset
train_data_h = pd.read_excel(read_dir + 'train_data_h.xlsx')
test_data_h = pd.read_excel(read_dir + 'test_data_h.xlsx')
X_train_h = pd.read_excel(read_dir + 'X_train_h.xlsx')
y_train_h = pd.read_excel(read_dir + 'y_train_h.xlsx')
X_test_h = pd.read_excel(read_dir + 'X_test_h.xlsx')
y_test_h = pd.read_excel(read_dir + 'y_test_h.xlsx')

In [20]:
# Tính toán các feature

In [21]:
# Lưu dữ liệu tập train test để training