Trying a new approach to feature selection, based mainly on the feature-importance scores of different models.  
I will run a pipeline for each model, select a subset of features and tune the hyperparameters per model, and then combine the models using stacking (so each model can use different features!).

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
import sys

import data_preprocess as dp # data_preprocess.py

# Silence all warnings for the rest of the notebook, unless warning options
# were passed to the interpreter explicitly (sys.warnoptions is then non-empty).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
# Load the preprocessed data through the project-local helper module.
# NOTE(review): one_hot_location=True presumably produces the one-hot location
# columns 'A', 'B', 'C' requested below - confirm in data_preprocess.py.
data = dp.data_preprocess(one_hot_location=True)

# Columns requested as model input: the weather/solar measurements, the three
# location flags and the raw 'time' stamp (month/hour/day are derived from it
# later inside the pipeline).
features = ['absolute_humidity_2m:gm3',
       'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J',
       'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx',
       'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W',
       'direct_rad_1h:J', 'effective_cloud_cover:p', 'elevation:m',
       'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm',
       'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'is_day:idx',
       'is_in_shadow:idx', 'msl_pressure:hPa', 'precip_5min:mm',
       'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa',
       'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p',
       'sfc_pressure:hPa', 'snow_density:kgm3', 'snow_depth:cm',
       'snow_drift:idx', 'snow_melt_10min:mm', 'snow_water:kgm2',
       'sun_azimuth:d', 'sun_elevation:d', 'super_cooled_liquid_water:kgm2',
       't_1000hPa:K', 'total_cloud_cover:p', 'visibility:m',
       'wind_speed_10m:ms', 'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms',
       'wind_speed_w_1000hPa:ms', 'A', 'B', 'C', 'time']

# Model inputs restricted to `features`, together with the matching targets.
X_train, targets = dp.get_training_data(data, features)

In [3]:
# Column groups used by the ColumnTransformer (categorical, already one-hot, numerical).

# Categorical columns, one-hot encoded in the pipeline. 'month', 'hour' and
# 'day' are not in the raw data: TimeFeatureAdder derives them from 'time'.
cat_cols = ['is_day:idx', 'is_in_shadow:idx','month', 'hour', 'day']

# Location flags that are already one-hot encoded and are passed through as-is.
one_hot_cols = ['A', 'B', 'C']

# Numerical columns, min-max scaled in the pipeline. This is `features` minus
# the categorical columns, the location flags and the raw 'time' stamp.
# NOTE(review): 'dew_or_rime:idx', 'precip_type_5min:idx' and 'snow_drift:idx'
# look like index/category codes but are scaled as numbers here - confirm intended.
num_cols = ['absolute_humidity_2m:gm3',
       'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J',
       'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx',
       'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W',
       'direct_rad_1h:J', 'effective_cloud_cover:p', 'elevation:m',
       'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm',
       'fresh_snow_3h:cm', 'fresh_snow_6h:cm','msl_pressure:hPa', 'precip_5min:mm',
       'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa',
       'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p',
       'sfc_pressure:hPa', 'snow_density:kgm3', 'snow_depth:cm',
       'snow_drift:idx', 'snow_melt_10min:mm', 'snow_water:kgm2',
       'sun_azimuth:d', 'sun_elevation:d', 'super_cooled_liquid_water:kgm2',
       't_1000hPa:K', 'total_cloud_cover:p', 'visibility:m',
       'wind_speed_10m:ms', 'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms',
       'wind_speed_w_1000hPa:ms']

In [11]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor

In [71]:
# Column-wise preprocessing:
#   'categories'       -> one-hot encode the categorical columns
#   'numerical'        -> min-max scale the numerical columns
#   'one_hot_allready' -> location flags are already 0/1, pass them through
# Columns that are not listed (e.g. the raw 'time' stamp) are dropped.
column_trans = ColumnTransformer(
    [
        # handle_unknown='ignore': a category that was not seen during fit is
        # encoded as all zeros instead of raising an error at predict time.
        ('categories', OneHotEncoder(dtype='int', handle_unknown='ignore'), cat_cols),
        ('numerical', MinMaxScaler(), num_cols),
        ('one_hot_allready', 'passthrough', one_hot_cols),
    ],
    remainder='drop',
    # Always return a dense ndarray. With the default threshold the stacked
    # output silently becomes a scipy sparse matrix whenever it is sparse
    # enough, and the np.delete call in ColumnDropper cannot handle that.
    sparse_threshold=0,
    verbose_feature_names_out=True,
)

class TimeFeatureAdder(BaseEstimator, TransformerMixin):
    """Add the calendar features ``month``, ``hour`` and ``day`` taken from the ``time`` column.

    Parameters
    ----------
    add_features : bool, default=True
        When True, ``transform`` appends the three columns. When False,
        ``transform`` returns an unmodified copy of its input.
    """

    def __init__(self, add_features=True):
        self.add_features = add_features

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X``, optionally extended with ``month``, ``hour`` and ``day``.

        ``X`` itself is never modified.
        Assumes ``X`` is a pandas DataFrame whose ``"time"`` entries expose
        ``.month``, ``.hour`` and ``.day`` - TODO confirm the column dtype.
        """
        X_copy = X.copy()

        # Leave early when disabled, so the "time" column is only required
        # (and only processed) when the features are actually added.
        if not self.add_features:
            return X_copy

        timestamps = X["time"]
        # Element-wise attribute access is kept (rather than the `.dt`
        # accessor) because it also works on object-dtype columns.
        X_copy["month"] = timestamps.apply(lambda ts: ts.month)
        X_copy["hour"] = timestamps.apply(lambda ts: ts.hour)
        X_copy["day"] = timestamps.apply(lambda ts: ts.day)
        return X_copy
        
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Drop columns from a 2-D array by positional index.

    Parameters
    ----------
    drop_cols : sequence of int or None, default=None
        Integer positions (not names) of the columns to remove. ``None`` or an
        empty sequence drops nothing. The default is ``None`` instead of a
        shared mutable ``[]``.
    """

    def __init__(self, drop_cols=None):
        # Stored untouched so that get_params / set_params / clone keep working.
        self.drop_cols = drop_cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new dense array equal to ``X`` without the ``drop_cols`` columns."""
        # np.delete does not understand scipy sparse matrices; densify first.
        if hasattr(X, "toarray"):
            X = X.toarray()
        X_arr = np.asarray(X)

        # Previously drop_cols=None left the result variable unassigned.
        if self.drop_cols is None:
            return X_arr.copy()

        # np.delete always returns a new array, so the input is left untouched.
        return np.delete(X_arr, self.drop_cols, axis=1)

In [72]:
# Imported in this cell because only the pipeline definitions below need it.
from sklearn.base import clone

# Preprocessing recipe: add month/hour/day, transform the column groups,
# replace remaining NaNs with 0 and finally drop the selected columns
# (none by default).
data_process_pipeline = Pipeline([
    ('add_features', TimeFeatureAdder()),
    ('column_transform', column_trans),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ('column_dropper', ColumnDropper())
])

# The random forest keeps using `data_process_pipeline` itself, so its fitted
# column transformer stays reachable through that name.
random_forest_pipeline = Pipeline([
    ('data_process', data_process_pipeline),
    ('random_forest', RandomForestRegressor(random_state=42))
])

# Every other model gets its own unfitted copy of the preprocessing. Sharing a
# single instance would make a `drop_cols` choice for one model apply to all
# of them, which defeats per-model feature selection.
grad_boost_pipeline = Pipeline([
    ('data_process', clone(data_process_pipeline)),
    ('grad_boost', GradientBoostingRegressor(random_state=42))
])

# The final steps of the next two pipelines used to be called 'grad_boost' as
# well (copy-paste); they are now named after the estimator they hold.
sgd_regressor_pipeline = Pipeline([
    ('data_process', clone(data_process_pipeline)),
    ('sgd_regressor', SGDRegressor(random_state=42))
])

svr_regressor_pipeline = Pipeline([
    ('data_process', clone(data_process_pipeline)),
    ('svr', SVR())
])

## RANDOM FOREST

In [25]:
from evaluate import mean_squared_error

# Fit on the full training set and predict that same data again, so the score
# below is an in-sample (training) error.
train_predictions = random_forest_pipeline.fit(X_train, targets).predict(X_train)
mean_squared_error(train_predictions, targets)

9387.61875852462

In [62]:
# Rank the transformed features by random-forest importance and single out
# the weakest ones as candidates for dropping.

import pandas as pd

N_LEAST_IMPORTANT = 30  # number of lowest-ranked features to select

# Fitted model and fitted column transformer, both taken from the pipeline
# that is being inspected.
random_forest_model = random_forest_pipeline.named_steps['random_forest']
preprocessor = random_forest_pipeline.named_steps['data_process'].named_steps['column_transform']

# Feature names in the order the ColumnTransformer lists its transformers:
# one-hot encoded categories, scaled numerical columns, passed-through flags.
all_feature_names = (
    list(preprocessor.named_transformers_['categories'].get_feature_names_out())
    + list(preprocessor.named_transformers_['numerical'].get_feature_names_out())
    + one_hot_cols
)

feature_importances = random_forest_model.feature_importances_

# The lengths differ when the model was fitted with columns already dropped
# (e.g. this cell is re-run after setting drop_cols); fail with a clear message.
if len(all_feature_names) != len(feature_importances):
    raise ValueError(
        f"{len(all_feature_names)} feature names but {len(feature_importances)} "
        "importances: refit the random forest without dropped columns first."
    )

# The row labels stay 0..n-1 in the original feature order, so after sorting
# the index still gives each feature's position in all_feature_names.
feature_importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

least_important = feature_importance_df.tail(N_LEAST_IMPORTANT)
rf_least_important_features = list(least_important["Feature"])
rf_least_important_feature_index = list(least_important.index)

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [66]:
# Positions (row labels of the unsorted importance table) of the 30 least important features.
rf_least_important_feature_index

[21,
 3,
 85,
 116,
 100,
 89,
 20,
 4,
 33,
 102,
 34,
 15,
 19,
 88,
 95,
 113,
 77,
 86,
 35,
 18,
 36,
 37,
 0,
 17,
 38,
 39,
 1,
 16,
 101,
 99]

In [52]:
# Names of the 30 least important features, ordered by decreasing importance.
rf_least_important_features

['hour_5',
 'is_in_shadow:idx_1.0',
 'fresh_snow_12h:cm',
 'C',
 'snow_depth:cm',
 'fresh_snow_6h:cm',
 'hour_4',
 'month_1',
 'hour_17',
 'snow_melt_10min:mm',
 'hour_18',
 'month_12',
 'hour_3',
 'fresh_snow_3h:cm',
 'prob_rime:p',
 'wind_speed_w_1000hPa:ms',
 'dew_or_rime:idx',
 'fresh_snow_1h:cm',
 'hour_19',
 'hour_2',
 'hour_20',
 'hour_21',
 'is_day:idx_0.0',
 'hour_1',
 'hour_22',
 'hour_23',
 'is_day:idx_1.0',
 'hour_0',
 'snow_drift:idx',
 'snow_density:kgm3']

Second try: refit the random forest after dropping the 30 least important columns.

In [73]:
# Hand the column positions to the ColumnDropper inside the random-forest
# pipeline, using scikit-learn's nested "<step>__<parameter>" addressing.
random_forest_pipeline.set_params(
    data_process__column_dropper__drop_cols=rf_least_important_feature_index
)

In [74]:
# Refit with the current column selection and score on the training data
# again, so the number is directly comparable to the first run.
train_predictions = random_forest_pipeline.fit(X_train, targets).predict(X_train)
mean_squared_error(train_predictions, targets)

9367.48673555216

## GRADIENT BOOSTING

## STOCHASTIC GRADIENT DESCENT REGRESSOR

## SUPPORT VECTOR MACHINE REGRESSOR