In [35]:
import pandas as pd
import numpy as np
from functions import load_data, get_train_targets, get_test_data, prepare_submission, remove_ouliers

# Load the three per-location frames and pass each one through remove_ouliers
# (presumably outlier filtering, judging by the helper's name - confirm in functions.py).
data_a, data_b, data_c = (remove_ouliers(raw) for raw in load_data())

# Split every processed frame into its training features and targets.
(X_train_a, targets_a), (X_train_b, targets_b), (X_train_c, targets_c) = (
    get_train_targets(frame) for frame in (data_a, data_b, data_c)
)

# Test-set inputs, one object per location.
X_test_a, X_test_b, X_test_c = get_test_data()

In [36]:
# Column labels removed from the feature frame by the ColumnDropper pipeline step.
drop_cols = [
    'time',
    'date_calc',
    'elevation:m',
    'fresh_snow_1h:cm',
    'wind_speed_u_10m:ms',
    'wind_speed_v_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'prob_rime:p',
    'fresh_snow_12h:cm',
    'fresh_snow_24h:cm',
    'fresh_snow_6h:cm',
    'super_cooled_liquid_water:kgm2',
]

# Alternative drop list, currently disabled and kept for reference only:
# drop_cols = [
#     'time', 'date_calc',
#     "snow_drift:idx", "snow_depth:cm", "fresh_snow_6h:cm", "fresh_snow_3h:cm",
#     "is_in_shadow:idx", "fresh_snow_1h:cm", "elevation:m", "dew_or_rime:idx",
#     "wind_speed_w_1000hPa:ms", "fresh_snow_12h:cm", "prob_rime:p", "is_day:idx",
#     "snow_density:kgm3",
# ]


In [47]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import catboost as cb

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from its input.

    Parameters
    ----------
    drop_cols : label or list of labels, optional
        Columns passed to ``X.drop(columns=...)`` in ``transform``.
        ``None`` (the default) means no column is dropped.
    """

    def __init__(self, drop_cols=None):
        # A ``None`` default avoids sharing one mutable list between all
        # instances. The value is stored untouched so that the parameter
        # round-trips through get_params()/clone() unchanged.
        self.drop_cols = drop_cols

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return a new object equal to ``X`` without the configured columns.

        ``X`` itself is not modified: ``drop`` already returns a new object,
        so no explicit copy is needed beforehand. Any error raised by
        ``X.drop`` (e.g. for a label that is not present) propagates.
        """
        columns = [] if self.drop_cols is None else self.drop_cols
        return X.drop(columns=columns)

def make_data_process_pipeline():
    """Build a fresh, unfitted preprocessing pipeline.

    Steps: drop the labels listed in ``drop_cols``, then replace NaN values
    with the constant 0.
    """
    return Pipeline([
        ('drop_cols', ColumnDropper(drop_cols=drop_cols)),
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ])


def make_location_pipeline(iterations):
    """Build an unfitted preprocessing + CatBoost pipeline for one location.

    Parameters
    ----------
    iterations : int
        Value forwarded to ``CatBoostRegressor(iterations=...)``.

    Each call creates its own preprocessing pipeline. Pipeline.fit fits its
    steps in place, so sharing one preprocessing instance between several
    location pipelines would make every fit overwrite the fitted state the
    other pipelines rely on.
    """
    return Pipeline([
        ('data_process', make_data_process_pipeline()),
        ('cat_boost', cb.CatBoostRegressor(iterations=iterations, random_state=2)),
    ])


# Stand-alone preprocessing pipeline; kept under its original name for any
# cell that still refers to it. It is no longer shared with the pipelines below.
data_process_pipeline = make_data_process_pipeline()

locA_pipeline = make_location_pipeline(iterations=200)
locB_pipeline = make_location_pipeline(iterations=200)
locC_pipeline = make_location_pipeline(iterations=300)

In [48]:
# Fit the location-A pipeline, then predict on the A test frame with the
# "id", "prediction" and "location" columns removed.
pred_a = locA_pipeline.fit(X_train_a, targets_a).predict(
    X_test_a.drop(columns=["id", "prediction", "location"])
)

Learning rate set to 0.264541
0:	learn: 928.4019059	total: 8.21ms	remaining: 1.63s
1:	learn: 770.8792743	total: 14.7ms	remaining: 1.46s
2:	learn: 661.2777399	total: 21.8ms	remaining: 1.43s
3:	learn: 587.6983094	total: 29ms	remaining: 1.42s
4:	learn: 541.6029351	total: 35.8ms	remaining: 1.4s
5:	learn: 513.7208101	total: 42.9ms	remaining: 1.39s
6:	learn: 494.1428336	total: 52.1ms	remaining: 1.44s
7:	learn: 481.3591955	total: 58.4ms	remaining: 1.4s
8:	learn: 472.5884211	total: 67.4ms	remaining: 1.43s
9:	learn: 466.1367274	total: 74.8ms	remaining: 1.42s
10:	learn: 462.8251882	total: 82.2ms	remaining: 1.41s
11:	learn: 459.5063186	total: 89.4ms	remaining: 1.4s
12:	learn: 456.9094510	total: 97.9ms	remaining: 1.41s
13:	learn: 455.1474538	total: 107ms	remaining: 1.42s
14:	learn: 453.3734176	total: 113ms	remaining: 1.4s
15:	learn: 451.4098568	total: 119ms	remaining: 1.37s
16:	learn: 449.9397440	total: 126ms	remaining: 1.35s
17:	learn: 448.1385541	total: 133ms	remaining: 1.35s
18:	learn: 446.6921

In [28]:
# Fetch the CatBoost step of the location-A pipeline.
locA_model = locA_pipeline.named_steps["cat_boost"]

# Pair each remaining training column with the model's importance score.
# NOTE(review): the names are rebuilt from the *current* drop_cols, so this assumes
# drop_cols has not changed since the pipeline was fitted - confirm before trusting the labels.
feature_importance_A = pd.DataFrame(
    {
        "feature": list(X_train_a.drop(columns=drop_cols).columns),
        "score": locA_model.feature_importances_,
    }
)
feature_importance_A.sort_values(by="score", ascending=False)

Unnamed: 0,feature,score
9,direct_rad:W,33.355166
23,sun_azimuth:d,11.133698
24,sun_elevation:d,7.252454
7,diffuse_rad:W,6.641624
4,clear_sky_rad:W,6.446661
30,wind_speed_u_10m:ms,3.445258
10,direct_rad_1h:J,2.542057
8,diffuse_rad_1h:J,2.333777
5,cloud_base_agl:m,2.133909
28,visibility:m,2.031796


In [49]:
# Fit the location-B pipeline, then predict on the B test frame with the
# "id", "prediction" and "location" columns removed.
pred_b = locB_pipeline.fit(X_train_b, targets_b).predict(
    X_test_b.drop(columns=["id", "prediction", "location"])
)

Learning rate set to 0.255224
0:	learn: 167.5568773	total: 24.4ms	remaining: 4.86s
1:	learn: 137.8425038	total: 38.3ms	remaining: 3.79s
2:	learn: 117.4484118	total: 50.3ms	remaining: 3.3s
3:	learn: 102.8981089	total: 57.5ms	remaining: 2.81s
4:	learn: 93.0344710	total: 65.5ms	remaining: 2.55s
5:	learn: 86.1790644	total: 72ms	remaining: 2.33s
6:	learn: 81.7900012	total: 79.1ms	remaining: 2.18s
7:	learn: 78.8643340	total: 86.5ms	remaining: 2.08s
8:	learn: 76.7203433	total: 92.2ms	remaining: 1.96s
9:	learn: 75.4584811	total: 100ms	remaining: 1.91s
10:	learn: 74.4739653	total: 106ms	remaining: 1.82s
11:	learn: 73.7202260	total: 113ms	remaining: 1.77s
12:	learn: 73.1248864	total: 121ms	remaining: 1.74s
13:	learn: 72.6041725	total: 127ms	remaining: 1.68s
14:	learn: 72.2379963	total: 135ms	remaining: 1.66s
15:	learn: 71.8241195	total: 140ms	remaining: 1.61s
16:	learn: 71.3615089	total: 147ms	remaining: 1.59s
17:	learn: 71.1157575	total: 154ms	remaining: 1.55s
18:	learn: 70.8678356	total: 159ms

In [30]:
# Fetch the CatBoost step of the location-B pipeline.
locB_model = locB_pipeline.named_steps["cat_boost"]

# Pair each remaining training column with the model's importance score.
# NOTE(review): the names are rebuilt from the *current* drop_cols, so this assumes
# drop_cols has not changed since the pipeline was fitted - confirm before trusting the labels.
feature_importance_B = pd.DataFrame(
    {
        "feature": list(X_train_b.drop(columns=drop_cols).columns),
        "score": locB_model.feature_importances_,
    }
)
feature_importance_B.sort_values(by="score", ascending=False)

Unnamed: 0,feature,score
9,direct_rad:W,22.303368
24,sun_elevation:d,16.341335
4,clear_sky_rad:W,13.108703
23,sun_azimuth:d,7.381431
7,diffuse_rad:W,4.360548
11,effective_cloud_cover:p,3.891058
5,cloud_base_agl:m,2.993527
30,wind_speed_u_10m:ms,2.598756
10,direct_rad_1h:J,2.444001
8,diffuse_rad_1h:J,2.286231


In [50]:
# Fit the location-C pipeline, then predict on the C test frame with the
# "id", "prediction" and "location" columns removed.
pred_c = locC_pipeline.fit(X_train_c, targets_c).predict(
    X_test_c.drop(columns=["id", "prediction", "location"])
)

Learning rate set to 0.177439
0:	learn: 151.8363123	total: 7.17ms	remaining: 2.15s
1:	learn: 132.0333907	total: 12.9ms	remaining: 1.92s
2:	learn: 115.7600821	total: 21.5ms	remaining: 2.13s
3:	learn: 103.2899011	total: 41.4ms	remaining: 3.06s
4:	learn: 93.3550287	total: 49.7ms	remaining: 2.93s
5:	learn: 85.9860820	total: 56.3ms	remaining: 2.76s
6:	learn: 80.0883330	total: 63ms	remaining: 2.64s
7:	learn: 75.4137285	total: 69.8ms	remaining: 2.55s
8:	learn: 71.8567060	total: 75ms	remaining: 2.42s
9:	learn: 69.3523007	total: 85.3ms	remaining: 2.47s
10:	learn: 67.2239788	total: 103ms	remaining: 2.71s
11:	learn: 65.5187319	total: 113ms	remaining: 2.71s
12:	learn: 64.2608008	total: 129ms	remaining: 2.85s
13:	learn: 63.2911323	total: 141ms	remaining: 2.87s
14:	learn: 62.4695382	total: 152ms	remaining: 2.89s
15:	learn: 61.8358301	total: 158ms	remaining: 2.81s
16:	learn: 61.3470085	total: 170ms	remaining: 2.83s
17:	learn: 60.8234273	total: 178ms	remaining: 2.79s
18:	learn: 60.4685793	total: 186ms

In [32]:
# Fetch the CatBoost step of the location-C pipeline.
locC_model = locC_pipeline.named_steps["cat_boost"]

# Pair each remaining training column with the model's importance score.
# NOTE(review): the names are rebuilt from the *current* drop_cols, so this assumes
# drop_cols has not changed since the pipeline was fitted - confirm before trusting the labels.
feature_importance_C = pd.DataFrame(
    {
        "feature": list(X_train_c.drop(columns=drop_cols).columns),
        "score": locC_model.feature_importances_,
    }
)
feature_importance_C.sort_values(by="score", ascending=False)

Unnamed: 0,feature,score
9,direct_rad:W,20.694655
4,clear_sky_rad:W,15.692912
24,sun_elevation:d,12.17636
3,clear_sky_energy_1h:J,5.150643
26,t_1000hPa:K,5.017872
7,diffuse_rad:W,4.880834
10,direct_rad_1h:J,4.777215
8,diffuse_rad_1h:J,3.131264
23,sun_azimuth:d,2.970497
30,wind_speed_u_10m:ms,2.774694


In [51]:
# Combine the three test frames and their predictions into one submission frame.
submission = prepare_submission(X_test_a, X_test_b, X_test_c, pred_a, pred_b, pred_c)

# Set every prediction below 0.05 (negatives included) to 0.
# Series.mask is vectorised, unlike the previous per-row lambda, and keeps the
# column's dtype stable. NaN values are left as NaN because NaN < 0.05 is False.
submission['prediction'] = submission['prediction'].mask(submission['prediction'] < 0.05, 0)

In [42]:
# Read an earlier saved submission from disk (presumably to compare it with the
# predictions produced in this notebook - confirm).
catboost_original = pd.read_csv("submissions/catboost.csv")

In [46]:
# Display the "prediction" column of the submission read from submissions/catboost.csv.
catboost_original["prediction"]

0         0.000000
1         0.000000
2         0.000000
3        22.880650
4       369.844314
           ...    
2155     45.601744
2156     21.496748
2157      5.472147
2158      3.333924
2159      3.849325
Name: prediction, Length: 2160, dtype: float64

In [52]:
# Display the "prediction" column of the submission frame built in this notebook.
submission["prediction"]

0        0.000000
1        0.000000
2        7.963181
3       10.716886
4      253.258250
          ...    
715     33.501934
716     17.606249
717      3.030321
718      5.406594
719      3.140792
Name: prediction, Length: 2160, dtype: float64

In [45]:
# Display the "prediction" column of the submission frame.
# NOTE(review): this repeats the previous display cell but carries an older execution
# count, so its stored output likely comes from an earlier run - confirm and consider removing.
submission["prediction"]

0        0.000000
1        0.000000
2        1.339933
3       16.297397
4      282.161716
          ...    
715     45.641325
716     23.382621
717     11.387922
718      4.268377
719      4.002117
Name: prediction, Length: 2160, dtype: float64

In [34]:
# Write the submission frame to CSV without the index column.
# NOTE(review): this cell's execution count is older than the cell that builds
# `submission`, so the file on disk may not match the current frame - re-run after a full Run All.
submission.to_csv('submissions/catboost_4_nov.csv', index=False)