Trying a new approach to feature selection, based mainly on the feature importance scores of different models.  
The plan is to build a pipeline for each model, select features and tune hyperparameters for each model separately, and then combine the models using stacking (so each model can use a different set of features!).

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
import sys

import data_preprocess as dp # data_preprocess.py

# Keep the notebook output readable: unless warning options were given
# explicitly (sys.warnoptions is non-empty), ignore every warning.
_no_warning_options_given = not sys.warnoptions
if _no_warning_options_given:
    warnings.simplefilter(action="ignore")

In [2]:
# Load the preprocessed data set; the location is requested in one-hot form
# (see the 'A', 'B', 'C' entries at the end of the feature list).
data = dp.data_preprocess(one_hot_location=True)

# Every raw input column handed to the models, one per line.
# 'time' is kept so that calendar features can be derived from it later.
features = [
    "absolute_humidity_2m:gm3",
    "air_density_2m:kgm3",
    "ceiling_height_agl:m",
    "clear_sky_energy_1h:J",
    "clear_sky_rad:W",
    "cloud_base_agl:m",
    "dew_or_rime:idx",
    "dew_point_2m:K",
    "diffuse_rad:W",
    "diffuse_rad_1h:J",
    "direct_rad:W",
    "direct_rad_1h:J",
    "effective_cloud_cover:p",
    "elevation:m",
    "fresh_snow_12h:cm",
    "fresh_snow_1h:cm",
    "fresh_snow_24h:cm",
    "fresh_snow_3h:cm",
    "fresh_snow_6h:cm",
    "is_day:idx",
    "is_in_shadow:idx",
    "msl_pressure:hPa",
    "precip_5min:mm",
    "precip_type_5min:idx",
    "pressure_100m:hPa",
    "pressure_50m:hPa",
    "prob_rime:p",
    "rain_water:kgm2",
    "relative_humidity_1000hPa:p",
    "sfc_pressure:hPa",
    "snow_density:kgm3",
    "snow_depth:cm",
    "snow_drift:idx",
    "snow_melt_10min:mm",
    "snow_water:kgm2",
    "sun_azimuth:d",
    "sun_elevation:d",
    "super_cooled_liquid_water:kgm2",
    "t_1000hPa:K",
    "total_cloud_cover:p",
    "visibility:m",
    "wind_speed_10m:ms",
    "wind_speed_u_10m:ms",
    "wind_speed_v_10m:ms",
    "wind_speed_w_1000hPa:ms",
    "A",
    "B",
    "C",
    "time",
]

# Split the selected columns into the model inputs and the regression targets.
X_train, targets = dp.get_training_data(data, features)

In [3]:
# Column groups used by the preprocessing step. Each group is routed to a
# different transformer.

# Columns that get one-hot encoded. 'month', 'hour' and 'day' are not raw
# inputs; they are derived from the 'time' column before encoding.
cat_cols = [
    "is_day:idx",
    "is_in_shadow:idx",
    "month",
    "hour",
    "day",
]

# Location indicators that already arrive one-hot encoded.
one_hot_cols = ["A", "B", "C"]

# Columns that are treated as numerical and rescaled.
num_cols = [
    "absolute_humidity_2m:gm3",
    "air_density_2m:kgm3",
    "ceiling_height_agl:m",
    "clear_sky_energy_1h:J",
    "clear_sky_rad:W",
    "cloud_base_agl:m",
    "dew_or_rime:idx",
    "dew_point_2m:K",
    "diffuse_rad:W",
    "diffuse_rad_1h:J",
    "direct_rad:W",
    "direct_rad_1h:J",
    "effective_cloud_cover:p",
    "elevation:m",
    "fresh_snow_12h:cm",
    "fresh_snow_1h:cm",
    "fresh_snow_24h:cm",
    "fresh_snow_3h:cm",
    "fresh_snow_6h:cm",
    "msl_pressure:hPa",
    "precip_5min:mm",
    "precip_type_5min:idx",
    "pressure_100m:hPa",
    "pressure_50m:hPa",
    "prob_rime:p",
    "rain_water:kgm2",
    "relative_humidity_1000hPa:p",
    "sfc_pressure:hPa",
    "snow_density:kgm3",
    "snow_depth:cm",
    "snow_drift:idx",
    "snow_melt_10min:mm",
    "snow_water:kgm2",
    "sun_azimuth:d",
    "sun_elevation:d",
    "super_cooled_liquid_water:kgm2",
    "t_1000hPa:K",
    "total_cloud_cover:p",
    "visibility:m",
    "wind_speed_10m:ms",
    "wind_speed_u_10m:ms",
    "wind_speed_v_10m:ms",
    "wind_speed_w_1000hPa:ms",
]

In [11]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor

In [23]:
# Per-group preprocessing: one-hot encode the categorical columns, min-max
# scale the numerical ones and pass the location indicators through untouched.
# Any column not named in one of the three groups (e.g. 'time') is dropped.
# NOTE(review): the encoder uses its default handling of categories that were
# not seen during fit -- confirm this is acceptable for the prediction data.
column_trans = ColumnTransformer(
    transformers=[
        ('categories', OneHotEncoder(dtype='int'), cat_cols),
        ('numerical', MinMaxScaler(), num_cols),
        ('one_hot_allready', 'passthrough', one_hot_cols),
    ],
    remainder='drop',
    verbose_feature_names_out=True,
)

class TimeFeatureAdder(BaseEstimator, TransformerMixin):
    """Add calendar features derived from the ``time`` column.

    When ``add_features`` is true, ``transform`` returns a copy of the input
    with three extra columns -- ``month``, ``hour`` and ``day`` -- read from
    each timestamp in ``X["time"]``. The ``time`` column itself is left in
    place. When ``add_features`` is false, an unmodified copy is returned and
    the ``time`` column is not accessed at all.

    Parameters
    ----------
    add_features : bool, default=True
        Whether to add the calendar columns.
    """

    # Timestamp attributes to expose, in the order the columns are added.
    _TIME_PARTS = ("month", "hour", "day")

    def __init__(self, add_features=True):
        self.add_features = add_features

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X):
        """Return a copy of ``X``, optionally extended with calendar columns.

        Raises
        ------
        KeyError
            If ``add_features`` is true and ``X`` has no ``"time"`` column.
        """
        # Always work on a copy so the caller's frame is never mutated.
        X_copy = X.copy()

        if not self.add_features:
            return X_copy

        timestamps = X["time"]
        for part in self._TIME_PARTS:
            # Bind ``part`` as a default argument so each lambda reads its own
            # attribute instead of the loop variable's final value.
            X_copy[part] = timestamps.apply(
                lambda ts, part=part: getattr(ts, part)
            )
        return X_copy

In [24]:
# Preprocessing shared by every model: derive the calendar columns, apply the
# per-group column transforms, then replace any remaining NaN with 0.
data_process_pipeline = Pipeline(steps=[
    ('add_features', TimeFeatureAdder()),
    ('column_transform', column_trans),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
])


def _with_preprocessing(step_name, estimator):
    """Return a pipeline that runs the shared preprocessing, then `estimator`."""
    return Pipeline(steps=[
        ('data_process', data_process_pipeline),
        (step_name, estimator),
    ])


# NOTE(review): all four pipelines hold the very same `data_process_pipeline`
# object, so fitting one of them refits the preprocessing seen by the others.
# NOTE(review): the SGD and SVR pipelines reuse the step name 'grad_boost';
# it is kept as-is so that existing step lookups keep working.
random_forest_pipeline = _with_preprocessing('random_forest', RandomForestRegressor(random_state=42))

grad_boost_pipeline = _with_preprocessing('grad_boost', GradientBoostingRegressor(random_state=42))

sgd_regressor_pipeline = _with_preprocessing('grad_boost', SGDRegressor(random_state=42))

svr_regressor_pipeline = _with_preprocessing('grad_boost', SVR())

## RANDOM FOREST

In [25]:
from evaluate import mean_squared_error

# Fit preprocessing + random forest, then predict on the very rows it was
# trained on. The resulting error is therefore a training error, not an
# estimate of performance on unseen data.
train_predictions = random_forest_pipeline.fit(X_train, targets).predict(X_train)

# Last expression of the cell: displays the training MSE.
mean_squared_error(train_predictions, targets)

In [18]:
import pandas as pd

# Fitted random forest: the final step of the fitted pipeline.
random_forest_model = random_forest_pipeline.named_steps['random_forest']

# Reach the fitted ColumnTransformer through the fitted pipeline itself, so
# the names below always describe the data this model was trained on.
preprocessor = random_forest_pipeline.named_steps['data_process'].named_steps['column_transform']

# Names of the one-hot columns only (kept for inspection).
feature_names_after_encoding = preprocessor.named_transformers_['categories'].get_feature_names_out()

# Names of ALL columns leaving the ColumnTransformer, in output order. The
# model never sees the raw X_train columns: 'time' is dropped and every
# categorical column is expanded into several one-hot columns, so combining
# X_train.columns with the one-hot names gives a list of the wrong length.
all_feature_names = list(preprocessor.get_feature_names_out())

# One importance score per column the model was trained on.
feature_importances = random_forest_model.feature_importances_

# Fail early with a readable message instead of pandas' generic
# "arrays must be of the same length" error.
if len(all_feature_names) != len(feature_importances):
    raise ValueError(
        f"Got {len(all_feature_names)} feature names but "
        f"{len(feature_importances)} importance scores; the preprocessing "
        "output no longer matches the fitted model."
    )

# Most important features first.
feature_importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances,
}).sort_values(by='Importance', ascending=False)

In [22]:
# Number of raw input columns requested for training (this count includes 'time').
len(features)

49

In [21]:
# Number of columns routed through the ColumnTransformer. It differs from
# len(features): 'time' is in none of the three groups, while the derived
# 'month', 'hour' and 'day' columns are part of cat_cols.
len(cat_cols) + len(one_hot_cols) + len(num_cols)

51

In [20]:
# Display the raw importance scores taken from the fitted random forest.
feature_importances

array([4.50547221e-07, 2.05535855e-07, 1.94338500e-04, 9.60739821e-05,
       2.19957262e-05, 5.27315333e-04, 2.15676180e-04, 2.68515560e-04,
       3.40682690e-04, 3.84149165e-04, 4.93846579e-04, 6.84295144e-04,
       2.42054366e-04, 1.92463078e-04, 1.17250395e-04, 4.49485817e-06,
       1.09420265e-07, 3.28872278e-07, 6.57594462e-07, 3.88019754e-06,
       2.20252053e-05, 1.01493097e-04, 1.14143332e-04, 2.73186118e-04,
       6.41975735e-04, 5.76849495e-04, 5.23582884e-04, 3.83122212e-04,
       2.61766650e-04, 3.27226672e-04, 2.27468605e-04, 1.05161894e-04,
       1.11226550e-04, 1.95835743e-05, 1.44925183e-05, 1.07054191e-06,
       6.24202573e-07, 5.85902383e-07, 2.53556687e-07, 2.51028764e-07,
       6.11106919e-04, 2.56970697e-04, 2.47056751e-04, 3.57943827e-04,
       2.42068615e-04, 7.65304747e-04, 4.52422432e-04, 3.22236474e-04,
       4.56187494e-04, 2.10305959e-04, 2.87497633e-04, 2.15120174e-04,
       3.30742187e-04, 2.93685024e-04, 3.02439549e-04, 5.30187024e-04,
      

## GRADIENT BOOSTING

## STOCHASTIC GRADIENT DESCENT REGRESSOR

## SUPPORT VECTOR MACHINE REGRESSOR