In [28]:
from pathlib import Path

import numpy as np
import pandas as pd

from functions import load_data, get_train_targets, get_test_data, prepare_submission, remove_ouliers

# Load the three per-location datasets and run each one through remove_ouliers,
# in the order a, b, c.
data_a, data_b, data_c = map(remove_ouliers, load_data())

# Split each processed dataset into its training frame and targets.
(X_train_a, targets_a), (X_train_b, targets_b), (X_train_c, targets_c) = map(
    get_train_targets, (data_a, data_b, data_c)
)

# Test frames, one per location.
X_test_a, X_test_b, X_test_c = get_test_data()

In [29]:
# Columns removed from the feature frames by the ColumnDropper pipeline step.
drop_cols = [
    'time',
    'date_calc',
    'elevation:m',
    'fresh_snow_1h:cm',
    'wind_speed_u_10m:ms',
    'wind_speed_v_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'prob_rime:p',
    'fresh_snow_12h:cm',
    'fresh_snow_24h:cm',
    'fresh_snow_6h:cm',
    'super_cooled_liquid_water:kgm2',
]

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import catboost as cb

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    drop_cols : list of str or None, default=None
        Column labels to remove in ``transform``. ``None`` drops nothing.
    """

    def __init__(self, drop_cols=None):
        # Stored exactly as passed. A ``None`` default replaces the former
        # mutable ``[]`` default, which was shared between all instances
        # created without an explicit argument.
        self.drop_cols = drop_cols

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer has no fitted state."""
        return self

    def transform(self, X):
        """Return a new frame equal to ``X`` without the configured columns.

        ``X`` itself is not modified: ``DataFrame.drop`` returns a new object,
        so no explicit copy is needed beforehand.
        """
        columns = [] if self.drop_cols is None else self.drop_cols
        return X.drop(columns=columns)

def make_data_process_pipeline():
    """Build a fresh, unfitted preprocessing pipeline.

    Steps: drop the columns listed in ``drop_cols``, then replace NaN values
    with the constant 0.
    """
    return Pipeline([
        ('drop_cols', ColumnDropper(drop_cols=drop_cols)),
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ])


def make_location_pipeline():
    """Build a fresh preprocessing + CatBoost regression pipeline for one location.

    Every call creates its own preprocessing and model instances, so fitting
    one location's pipeline never refits an object held by another pipeline.
    """
    return Pipeline([
        ('data_process', make_data_process_pipeline()),
        ('cat_boost', cb.CatBoostRegressor(loss_function='RMSE', iterations=300, random_state=42)),
    ])


# Stand-alone preprocessing pipeline, kept under its original name for any
# cell that uses it directly. It is no longer shared with the pipelines below.
data_process_pipeline = make_data_process_pipeline()

# One independent pipeline per location.
locA_pipeline = make_location_pipeline()
locB_pipeline = make_location_pipeline()
locC_pipeline = make_location_pipeline()

In [37]:
# Fit the location-A pipeline, then predict on the A test frame with the
# id / prediction / location columns removed. Pipeline.fit returns the
# pipeline itself, so the two steps can be chained.
pred_a = locA_pipeline.fit(X_train_a, targets_a).predict(
    X_test_a.drop(columns=["id", "prediction", "location"])
)

Learning rate set to 0.071487
0:	learn: 1101.3607628	total: 9.02ms	remaining: 9.01s
1:	learn: 1040.5277870	total: 21.6ms	remaining: 10.8s
2:	learn: 984.8137576	total: 31.7ms	remaining: 10.5s
3:	learn: 934.4844720	total: 40.3ms	remaining: 10s
4:	learn: 888.9697955	total: 52.1ms	remaining: 10.4s
5:	learn: 846.4520873	total: 60.4ms	remaining: 10s
6:	learn: 808.6459926	total: 70.5ms	remaining: 9.99s
7:	learn: 774.3212800	total: 79.9ms	remaining: 9.91s
8:	learn: 742.4434294	total: 87ms	remaining: 9.58s
9:	learn: 713.5524795	total: 95ms	remaining: 9.41s
10:	learn: 687.2490907	total: 103ms	remaining: 9.26s
11:	learn: 663.3304118	total: 110ms	remaining: 9.09s
12:	learn: 641.8701275	total: 120ms	remaining: 9.08s
13:	learn: 622.4358405	total: 127ms	remaining: 8.91s
14:	learn: 604.9896387	total: 133ms	remaining: 8.75s
15:	learn: 589.7734808	total: 139ms	remaining: 8.58s
16:	learn: 576.0470847	total: 147ms	remaining: 8.52s
17:	learn: 563.2308941	total: 154ms	remaining: 8.39s
18:	learn: 552.1583365

In [22]:
# Fit the location-B pipeline, then predict on the B test frame with the
# id / prediction / location columns removed. Pipeline.fit returns the
# pipeline itself, so the two steps can be chained.
pred_b = locB_pipeline.fit(X_train_b, targets_b).predict(
    X_test_b.drop(columns=["id", "prediction", "location"])
)

Learning rate set to 0.06897
0:	learn: 197.6051333	total: 35.5ms	remaining: 35.5s
1:	learn: 187.0238196	total: 79.6ms	remaining: 39.7s
2:	learn: 177.4492830	total: 117ms	remaining: 38.8s
3:	learn: 168.5973214	total: 152ms	remaining: 37.7s
4:	learn: 160.2470473	total: 176ms	remaining: 35s
5:	learn: 152.4505397	total: 243ms	remaining: 40.2s
6:	learn: 145.2987170	total: 254ms	remaining: 36s
7:	learn: 138.8846448	total: 268ms	remaining: 33.3s
8:	learn: 133.0559306	total: 280ms	remaining: 30.8s
9:	learn: 127.6706903	total: 289ms	remaining: 28.6s
10:	learn: 122.5655020	total: 299ms	remaining: 26.9s
11:	learn: 117.7897605	total: 308ms	remaining: 25.4s
12:	learn: 113.5541270	total: 317ms	remaining: 24s
13:	learn: 109.7021175	total: 326ms	remaining: 22.9s
14:	learn: 106.2872394	total: 332ms	remaining: 21.8s
15:	learn: 103.0697131	total: 340ms	remaining: 20.9s
16:	learn: 100.1831766	total: 348ms	remaining: 20.1s
17:	learn: 97.5762043	total: 355ms	remaining: 19.4s
18:	learn: 95.1451968	total: 364

In [24]:
# Fit the location-C pipeline, then predict on the C test frame with the
# id / prediction / location columns removed. Pipeline.fit returns the
# pipeline itself, so the two steps can be chained.
pred_c = locC_pipeline.fit(X_train_c, targets_c).predict(
    X_test_c.drop(columns=["id", "prediction", "location"])
)

Learning rate set to 0.066673
0:	learn: 167.5067547	total: 8.61ms	remaining: 8.6s
1:	learn: 158.7095105	total: 15.9ms	remaining: 7.91s
2:	learn: 150.4976315	total: 21.8ms	remaining: 7.26s
3:	learn: 142.9865015	total: 27.8ms	remaining: 6.93s
4:	learn: 136.0157471	total: 36.5ms	remaining: 7.26s
5:	learn: 129.5390157	total: 42.3ms	remaining: 7s
6:	learn: 123.6160002	total: 51.7ms	remaining: 7.33s
7:	learn: 118.1397742	total: 57.7ms	remaining: 7.15s
8:	learn: 113.0114629	total: 64.3ms	remaining: 7.08s
9:	learn: 108.3721412	total: 70ms	remaining: 6.92s
10:	learn: 104.1658532	total: 75.6ms	remaining: 6.8s
11:	learn: 100.1351542	total: 82.3ms	remaining: 6.78s
12:	learn: 96.5005947	total: 88.2ms	remaining: 6.69s
13:	learn: 93.1716261	total: 93.7ms	remaining: 6.6s
14:	learn: 90.1831443	total: 101ms	remaining: 6.62s
15:	learn: 87.4226423	total: 106ms	remaining: 6.54s
16:	learn: 85.0023583	total: 112ms	remaining: 6.5s
17:	learn: 82.6979554	total: 118ms	remaining: 6.45s
18:	learn: 80.6294946	total

In [26]:
# Predictions strictly below this value (including negative ones) are written as 0.
MIN_PREDICTION = 0.05

# Combine the three test frames with their predictions into one submission frame.
submission = prepare_submission(X_test_a, X_test_b, X_test_c, pred_a, pred_b, pred_c)

# Vectorised replacement for the former row-wise ``apply``: values below the
# threshold become 0; everything else, including NaN, is left unchanged.
submission['prediction'] = submission['prediction'].mask(
    submission['prediction'] < MIN_PREDICTION, 0
)

In [27]:
submission_path = Path('submissions/catboost_all_features.csv')
# to_csv does not create missing directories, so make sure the target folder exists.
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)