notebook used to generate a random forest model with all the features, then using the highest-important to get a new random forest with just those features. best-performing model as of 19.10.23

In [1]:
from data_preprocess import data_preprocess, get_input_data, get_training_data

data = data_preprocess(one_hot_location=True)



In [2]:
# add a column for month of the year
month = data['time'].apply(lambda x: x.month)

In [3]:
features = ['absolute_humidity_2m:gm3',
       'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J',
       'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx',
       'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W',
       'direct_rad_1h:J', 'effective_cloud_cover:p', 'elevation:m',
       'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm',
       'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'is_day:idx',
       'is_in_shadow:idx', 'msl_pressure:hPa', 'precip_5min:mm',
       'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa',
       'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p',
       'sfc_pressure:hPa', 'snow_density:kgm3', 'snow_depth:cm',
       'snow_drift:idx', 'snow_melt_10min:mm', 'snow_water:kgm2',
       'sun_azimuth:d', 'sun_elevation:d', 'super_cooled_liquid_water:kgm2',
       't_1000hPa:K', 'total_cloud_cover:p', 'visibility:m',
       'wind_speed_10m:ms', 'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms',
       'wind_speed_w_1000hPa:ms', 'A', 'B', 'C']

In [4]:
X_train, targets = get_training_data(data, features)

In [5]:
X_train["month"] = month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["month"] = month


In [6]:
X_train["ceiling_height_agl:m"].fillna(0, inplace=True)
X_train["cloud_base_agl:m"].fillna(0, inplace=True)
X_train["snow_density:kgm3"].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["ceiling_height_agl:m"].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["cloud_base_agl:m"].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["snow_density:kgm3"].fillna(0, inplace=True)


In [55]:
from sklearn.ensemble import RandomForestRegressor

X = X_train.values
y = targets

forest_reg = RandomForestRegressor()
forest_reg.fit(X, y)

In [56]:
import pandas as pd

names_list = []
scores_list = []

for name, score, in zip(features, forest_reg.feature_importances_):
    names_list.append(name)
    scores_list.append(score)

feature_importance = pd.DataFrame({"feature": names_list, "relative_importance": scores_list})

feature_importance.sort_values("relative_importance", ascending=False)

Unnamed: 0,feature,relative_importance
10,direct_rad:W,0.389946
13,elevation:m,0.216673
45,A,0.153387
8,diffuse_rad:W,0.062372
35,sun_azimuth:d,0.039919
36,sun_elevation:d,0.011239
4,clear_sky_rad:W,0.009573
42,wind_speed_u_10m:ms,0.008431
43,wind_speed_v_10m:ms,0.00737
11,direct_rad_1h:J,0.006998


In [57]:
from evaluate import training_mse

predictions = forest_reg.predict(X)
training_mse(y, predictions)

MSE on training data: 9582.367251541513


In [60]:
X_test = get_input_data()
X_test['month'] = X_test['time'].apply(lambda x: x.month)
X_test_features = X_test[features].fillna(0)
X_test_features['month'] = X_test['month']



In [61]:
from data_preprocess import prepare_submission, get_input_data

predictions = forest_reg.predict(X_test_features.values)

In [62]:
submission = prepare_submission(X_test, predictions)

In [63]:
submission.to_csv('submissions/random_forest_all_features_plus_month.csv', index=False)

# some minimal feature engineering - dropping the least important features and re-running same model!

In [26]:
drop_cols = ["precip_type_5min:idx", "is_in_shadow:idx", "rain_water:kgm2", "B", "fresh_snow_12h:cm", "C", "fresh_snow_6h:cm",
             "snow_depth:cm", "snow_density:kgm3", "snow_melt_10min:mm", "prob_rime:p", "fresh_snow_3h:cm", "dew_or_rime:idx",
             "wind_speed_w_1000hPa:ms", "fresh_snow_1h:cm", "is_day:idx", "snow_drift:idx"]

In [27]:
X_train_drop = X_train.drop(columns=drop_cols)

In [28]:
from sklearn.ensemble import RandomForestRegressor

X = X_train_drop.values
y = targets

forest_reg = RandomForestRegressor()
forest_reg.fit(X, y)

In [33]:
len(drop_cols)

30

In [29]:
new_features = [f for f in features if f not in drop_cols]

In [30]:
X_test = get_input_data()
X_test['month'] = X_test['time'].apply(lambda x: x.month)
X_test_features = X_test[new_features].fillna(0)
X_test_features['month'] = X_test['month']

from data_preprocess import prepare_submission, get_input_data

predictions = forest_reg.predict(X_test_features.values)

submission = prepare_submission(X_test, predictions)



In [31]:
submission.to_csv('submissions/random_forest_friday_fewer_ft.csv', index=False)

In [19]:
len(new_features)

31

In [25]:
import pandas as pd

names_list = []
scores_list = []

for name, score, in zip(new_features, forest_reg.feature_importances_):
    names_list.append(name)
    scores_list.append(score)

feature_importance = pd.DataFrame({"feature": names_list, "relative_importance": scores_list})

feature_importance.sort_values("relative_importance", ascending=False)

Unnamed: 0,feature,relative_importance
9,direct_rad:W,0.391001
12,elevation:m,0.185892
30,A,0.184983
7,diffuse_rad:W,0.062772
21,sun_azimuth:d,0.039727
22,sun_elevation:d,0.010675
4,clear_sky_rad:W,0.009763
28,wind_speed_u_10m:ms,0.008023
10,direct_rad_1h:J,0.007018
26,visibility:m,0.006935
