In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

from functions import load_data, get_train_targets, get_test_data, prepare_submission, remove_ouliers

# Load the three per-location frames and pass each one through
# remove_ouliers, in A, B, C order.
data_a, data_b, data_c = (remove_ouliers(frame) for frame in load_data())

# get_train_targets yields a (features, targets) pair for every location.
(X_train_a, targets_a), (X_train_b, targets_b), (X_train_c, targets_c) = (
    get_train_targets(cleaned) for cleaned in (data_a, data_b, data_c)
)

# Test-set frames, one per location.
X_test_a, X_test_b, X_test_c = get_test_data()

In [3]:
# Column labels removed from the feature frames before modelling
# (consumed by ColumnDropper and by the feature-importance tables).
drop_cols = [
    'time',
    'date_calc',
    'elevation:m',
    'fresh_snow_1h:cm',
    'wind_speed_u_10m:ms',
    'wind_speed_v_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'prob_rime:p',
    'fresh_snow_12h:cm',
    'fresh_snow_24h:cm',
    'fresh_snow_6h:cm',
    'super_cooled_liquid_water:kgm2',
]

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import catboost as cb

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    drop_cols : list of str or None, default=None
        Column labels to remove in ``transform``. ``None`` (the default)
        means no column is removed.
    """

    def __init__(self, drop_cols=None):
        # A ``[]`` default would be a single list object shared by every
        # instance created without arguments; ``None`` avoids that. The
        # argument is stored untouched and resolved in ``transform``.
        self.drop_cols = drop_cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        ``DataFrame.drop`` is called without ``inplace``, so the result is a
        new frame and the caller's ``X`` is left unmodified; no explicit copy
        is needed beforehand.

        Raises
        ------
        KeyError
            If a label in ``drop_cols`` is not a column of ``X`` (default
            behaviour of ``DataFrame.drop``).
        """
        columns_to_drop = [] if self.drop_cols is None else self.drop_cols
        return X.drop(columns=columns_to_drop)

def make_data_process_pipeline():
    """Build a fresh, unfitted preprocessing pipeline.

    Steps: drop the labels listed in ``drop_cols``, then replace NaN with 0.

    A new object is returned on every call on purpose: ``Pipeline`` fits its
    steps in place rather than cloning them, so a single preprocessing
    instance embedded in several pipelines would be re-fitted (and its state
    overwritten) each time any of those pipelines is fitted.
    """
    return Pipeline([
        ('drop_cols', ColumnDropper(drop_cols=drop_cols)),
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ])


def make_location_pipeline(depth, iterations, l2_leaf_reg, learning_rate=0.05, random_state=42):
    """Build the preprocessing + CatBoost pipeline for one location.

    The step names ``'data_process'`` and ``'cat_boost'`` are relied on by
    later cells (``named_steps["cat_boost"]``) and must not change.
    """
    return Pipeline([
        ('data_process', make_data_process_pipeline()),
        ('cat_boost', cb.CatBoostRegressor(
            depth=depth,
            iterations=iterations,
            l2_leaf_reg=l2_leaf_reg,
            learning_rate=learning_rate,
            random_state=random_state,
        )),
    ])


# Standalone preprocessing pipeline, kept under its original name for any
# code that refers to it; the location pipelines below each own a separate copy.
data_process_pipeline = make_data_process_pipeline()

# Per-location hyper-parameters (learning_rate and random_state are shared).
locA_pipeline = make_location_pipeline(depth=8, iterations=200, l2_leaf_reg=4)
locB_pipeline = make_location_pipeline(depth=6, iterations=200, l2_leaf_reg=0.5)
locC_pipeline = make_location_pipeline(depth=4, iterations=300, l2_leaf_reg=1)

In [5]:
# Fit the location-A pipeline on its training split.
locA_pipeline.fit(X_train_a, targets_a)

# The test frame carries bookkeeping columns that are not model inputs;
# remove them before predicting.
test_features_a = X_test_a.drop(columns=["id", "prediction", "location"])
pred_a = locA_pipeline.predict(test_features_a)

0:	learn: 1121.3671509	total: 76.6ms	remaining: 15.2s
1:	learn: 1078.1782585	total: 94ms	remaining: 9.3s
2:	learn: 1037.0637165	total: 109ms	remaining: 7.16s
3:	learn: 998.1215522	total: 125ms	remaining: 6.13s
4:	learn: 961.4589052	total: 142ms	remaining: 5.54s
5:	learn: 927.8610968	total: 157ms	remaining: 5.07s
6:	learn: 895.7259281	total: 170ms	remaining: 4.69s
7:	learn: 865.7413007	total: 184ms	remaining: 4.43s
8:	learn: 837.0739290	total: 198ms	remaining: 4.2s
9:	learn: 810.9190062	total: 212ms	remaining: 4.03s
10:	learn: 785.5634542	total: 225ms	remaining: 3.87s
11:	learn: 762.2177039	total: 239ms	remaining: 3.75s
12:	learn: 740.3948179	total: 252ms	remaining: 3.63s
13:	learn: 719.8364876	total: 266ms	remaining: 3.53s
14:	learn: 700.3794990	total: 280ms	remaining: 3.45s
15:	learn: 682.7157755	total: 296ms	remaining: 3.4s
16:	learn: 665.7323274	total: 318ms	remaining: 3.42s
17:	learn: 650.0844609	total: 347ms	remaining: 3.51s
18:	learn: 635.3864886	total: 368ms	remaining: 3.5s
19:	

In [7]:
# Pull the fitted CatBoost step out of the location-A pipeline and pair its
# importance scores with the column names left after dropping drop_cols.
# NOTE(review): assumes the imputer step keeps column count and order - confirm.
locA_model = locA_pipeline.named_steps["cat_boost"]
feature_names_a = list(X_train_a.drop(columns=drop_cols).columns)
feature_importance_A = pd.DataFrame(
    {
        "feature": feature_names_a,
        "score": locA_model.feature_importances_,
    }
)
feature_importance_A.sort_values(by="score", ascending=False)

Unnamed: 0,feature,score
10,direct_rad:W,30.21527
29,sun_azimuth:d,17.511232
4,clear_sky_rad:W,8.333558
30,sun_elevation:d,5.40239
11,direct_rad_1h:J,4.756253
8,diffuse_rad:W,3.668869
12,effective_cloud_cover:p,3.185394
34,wind_speed_10m:ms,3.118144
3,clear_sky_energy_1h:J,2.343191
5,cloud_base_agl:m,2.03041


In [8]:
# Fit the location-B pipeline on its training split.
locB_pipeline.fit(X_train_b, targets_b)

# The test frame carries bookkeeping columns that are not model inputs;
# remove them before predicting.
test_features_b = X_test_b.drop(columns=["id", "prediction", "location"])
pred_b = locB_pipeline.predict(test_features_b)

0:	learn: 200.6670824	total: 10ms	remaining: 2s
1:	learn: 192.7608089	total: 18.5ms	remaining: 1.83s
2:	learn: 185.3519311	total: 25ms	remaining: 1.64s
3:	learn: 178.4899151	total: 31.2ms	remaining: 1.53s
4:	learn: 171.8597031	total: 37.5ms	remaining: 1.46s
5:	learn: 165.5479930	total: 43.5ms	remaining: 1.41s
6:	learn: 159.4973034	total: 48.8ms	remaining: 1.34s
7:	learn: 153.9693642	total: 55.4ms	remaining: 1.33s
8:	learn: 148.7176883	total: 61ms	remaining: 1.29s
9:	learn: 143.7756105	total: 68.2ms	remaining: 1.29s
10:	learn: 139.1719894	total: 74.2ms	remaining: 1.27s
11:	learn: 134.8654715	total: 80.8ms	remaining: 1.26s
12:	learn: 130.7033795	total: 86.9ms	remaining: 1.25s
13:	learn: 126.8443935	total: 93.9ms	remaining: 1.25s
14:	learn: 123.1119581	total: 105ms	remaining: 1.29s
15:	learn: 119.8042820	total: 111ms	remaining: 1.27s
16:	learn: 116.6076738	total: 121ms	remaining: 1.3s
17:	learn: 113.5331304	total: 138ms	remaining: 1.4s
18:	learn: 110.6622222	total: 148ms	remaining: 1.41s


In [9]:
# Pull the fitted CatBoost step out of the location-B pipeline and pair its
# importance scores with the column names left after dropping drop_cols.
# NOTE(review): assumes the imputer step keeps column count and order - confirm.
locB_model = locB_pipeline.named_steps["cat_boost"]
feature_names_b = list(X_train_b.drop(columns=drop_cols).columns)
feature_importance_B = pd.DataFrame(
    {
        "feature": feature_names_b,
        "score": locB_model.feature_importances_,
    }
)
feature_importance_B.sort_values(by="score", ascending=False)

Unnamed: 0,feature,score
10,direct_rad:W,25.133334
4,clear_sky_rad:W,20.895965
30,sun_elevation:d,12.856613
29,sun_azimuth:d,9.055172
11,direct_rad_1h:J,5.490817
8,diffuse_rad:W,3.242044
5,cloud_base_agl:m,3.016837
9,diffuse_rad_1h:J,2.34172
33,visibility:m,2.150569
31,t_1000hPa:K,1.737157


In [10]:
# Fit the location-C pipeline on its training split.
locC_pipeline.fit(X_train_c, targets_c)

# The test frame carries bookkeeping columns that are not model inputs;
# remove them before predicting.
test_features_c = X_test_c.drop(columns=["id", "prediction", "location"])
pred_c = locC_pipeline.predict(test_features_c)

0:	learn: 170.1419776	total: 13.2ms	remaining: 3.94s
1:	learn: 163.6018454	total: 17.1ms	remaining: 2.55s
2:	learn: 157.2345961	total: 20.8ms	remaining: 2.06s
3:	learn: 151.3032278	total: 26.5ms	remaining: 1.96s
4:	learn: 145.8345562	total: 31.5ms	remaining: 1.86s
5:	learn: 140.5411308	total: 35.9ms	remaining: 1.76s
6:	learn: 135.6307177	total: 39.4ms	remaining: 1.65s
7:	learn: 131.0184482	total: 43.9ms	remaining: 1.6s
8:	learn: 126.6452923	total: 48.1ms	remaining: 1.55s
9:	learn: 122.4271963	total: 52ms	remaining: 1.51s
10:	learn: 118.5106777	total: 56.2ms	remaining: 1.48s
11:	learn: 114.8653397	total: 59.2ms	remaining: 1.42s
12:	learn: 111.3647492	total: 62.6ms	remaining: 1.38s
13:	learn: 108.1880930	total: 65.4ms	remaining: 1.33s
14:	learn: 105.2019299	total: 68.2ms	remaining: 1.29s
15:	learn: 102.3851062	total: 72ms	remaining: 1.28s
16:	learn: 99.7101508	total: 74.8ms	remaining: 1.25s
17:	learn: 97.1655538	total: 77.8ms	remaining: 1.22s
18:	learn: 94.8338001	total: 81ms	remaining: 

In [11]:
# Pull the fitted CatBoost step out of the location-C pipeline and pair its
# importance scores with the column names left after dropping drop_cols.
# NOTE(review): assumes the imputer step keeps column count and order - confirm.
locC_model = locC_pipeline.named_steps["cat_boost"]
feature_names_c = list(X_train_c.drop(columns=drop_cols).columns)
feature_importance_C = pd.DataFrame(
    {
        "feature": feature_names_c,
        "score": locC_model.feature_importances_,
    }
)
feature_importance_C.sort_values(by="score", ascending=False)

Unnamed: 0,feature,score
10,direct_rad:W,35.256284
30,sun_elevation:d,17.756567
4,clear_sky_rad:W,16.687874
11,direct_rad_1h:J,6.179424
31,t_1000hPa:K,4.923974
8,diffuse_rad:W,4.662769
29,sun_azimuth:d,2.064448
3,clear_sky_energy_1h:J,1.628774
7,dew_point_2m:K,1.158499
9,diffuse_rad_1h:J,1.073965


In [12]:
# Combine the per-location test frames and predictions into one submission frame.
submission = prepare_submission(X_test_a, X_test_b, X_test_c, pred_a, pred_b, pred_c)

# Replace every prediction below 0.05 (negatives included) with 0. The
# vectorised mask avoids a Python-level call per row and keeps the column's
# numeric dtype even when every value falls under the threshold.
# NOTE(review): assumes 'prediction' is a numeric column - confirm.
below_threshold = submission['prediction'] < 0.05
submission['prediction'] = submission['prediction'].mask(below_threshold, 0)

In [13]:
# Write the submission file. The output directory is created first so a
# fresh checkout (where 'submissions/' may not exist yet) does not fail here.
submission_path = Path('submissions/catboost_optimised_params.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)