In [2]:
import pandas as pd
import numpy as np
from functions import load_data, get_train_targets, get_test_data, prepare_submission, remove_ouliers

# Load the three per-location datasets and pass each one through the
# outlier-removal helper, in the order A, B, C.
data_a, data_b, data_c = (remove_ouliers(raw) for raw in load_data())

# Split every cleaned dataset into its training features and targets.
(X_train_a, targets_a), (X_train_b, targets_b), (X_train_c, targets_c) = map(
    get_train_targets, (data_a, data_b, data_c)
)

# Test-set features, one frame per location.
X_test_a, X_test_b, X_test_c = get_test_data()

In [3]:
# Column labels removed from the feature frames by the preprocessing step.
drop_cols = [
    'time',
    'date_calc',
    'elevation:m',
    'fresh_snow_1h:cm',
    'wind_speed_u_10m:ms',
    'wind_speed_v_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'prob_rime:p',
    'fresh_snow_12h:cm',
    'fresh_snow_24h:cm',
    'fresh_snow_6h:cm',
    'super_cooled_liquid_water:kgm2',
]

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import catboost as cb

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    drop_cols : label or list of labels, optional
        Columns to remove in ``transform``. ``None`` (the default) means
        that no columns are dropped.
    """

    def __init__(self, drop_cols=None):
        # The default is None instead of [] so that default-constructed
        # instances never share a single mutable list object. The value is
        # stored unmodified; it is only interpreted in transform().
        self.drop_cols = drop_cols

    def fit(self, X, y=None):
        """Do nothing (there is no state to learn) and return ``self``."""
        return self

    def transform(self, X):
        """Return a new frame equal to ``X`` without the configured columns.

        ``X`` itself is not modified. With pandas' default ``errors='raise'``
        a label that is absent from ``X`` raises ``KeyError``.
        """
        cols = [] if self.drop_cols is None else self.drop_cols
        # DataFrame.drop (without inplace) already returns a new object, so
        # no explicit copy of X is needed beforehand.
        return X.drop(columns=cols)

def make_data_process_pipeline():
    """Build a fresh, unfitted preprocessing pipeline.

    Steps: drop the labels listed in ``drop_cols``, then replace NaN with 0.
    """
    return Pipeline([
        ('drop_cols', ColumnDropper(drop_cols=drop_cols)),
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ])


def make_location_pipeline():
    """Build a fresh preprocessing + CatBoost (RMSE loss) pipeline.

    Every call creates its own preprocessing steps. Pipeline.fit fits its
    steps in place, so sharing one preprocessing object between several
    location pipelines would let each fit overwrite the state the others
    rely on at predict time.
    """
    return Pipeline([
        ('data_process', make_data_process_pipeline()),
        ('cat_boost', cb.CatBoostRegressor(loss_function='RMSE')),
    ])


# Stand-alone preprocessing pipeline, kept under its original name.
# NOTE: it is no longer a step of the location pipelines below, so fitting
# those does not fit this object.
data_process_pipeline = make_data_process_pipeline()

# One independent model pipeline per location.
locA_pipeline = make_location_pipeline()
locB_pipeline = make_location_pipeline()
locC_pipeline = make_location_pipeline()

In [5]:
# Fit the location-A pipeline, then predict on the A test frame with the
# "id", "prediction" and "location" columns removed.
pred_a = (
    locA_pipeline
    .fit(X_train_a, targets_a)
    .predict(X_test_a.drop(columns=["id", "prediction", "location"]))
)

Learning rate set to 0.071487
0:	learn: 1101.3607628	total: 71.5ms	remaining: 1m 11s
1:	learn: 1040.5277870	total: 81.8ms	remaining: 40.8s
2:	learn: 984.8137576	total: 90.6ms	remaining: 30.1s
3:	learn: 934.4844720	total: 98.8ms	remaining: 24.6s
4:	learn: 888.9697955	total: 106ms	remaining: 21s
5:	learn: 846.4520873	total: 112ms	remaining: 18.5s
6:	learn: 808.6459926	total: 121ms	remaining: 17.1s
7:	learn: 774.3212800	total: 128ms	remaining: 15.9s
8:	learn: 742.4434294	total: 135ms	remaining: 14.8s
9:	learn: 713.5524795	total: 143ms	remaining: 14.1s
10:	learn: 687.2490907	total: 153ms	remaining: 13.7s
11:	learn: 663.3304118	total: 165ms	remaining: 13.6s
12:	learn: 641.8701275	total: 176ms	remaining: 13.4s
13:	learn: 622.4358405	total: 190ms	remaining: 13.4s
14:	learn: 604.9896387	total: 223ms	remaining: 14.7s
15:	learn: 589.7734808	total: 249ms	remaining: 15.3s
16:	learn: 576.0470847	total: 275ms	remaining: 15.9s
17:	learn: 563.2308941	total: 305ms	remaining: 16.6s
18:	learn: 552.158336

In [6]:
# Fit the location-B pipeline, then predict on the B test frame with the
# "id", "prediction" and "location" columns removed.
pred_b = (
    locB_pipeline
    .fit(X_train_b, targets_b)
    .predict(X_test_b.drop(columns=["id", "prediction", "location"]))
)

Learning rate set to 0.06897
0:	learn: 197.5075803	total: 27.7ms	remaining: 27.7s
1:	learn: 187.1004316	total: 52.5ms	remaining: 26.2s
2:	learn: 177.3139286	total: 74.4ms	remaining: 24.7s
3:	learn: 168.2990534	total: 96ms	remaining: 23.9s
4:	learn: 160.0225816	total: 107ms	remaining: 21.3s
5:	learn: 152.1731871	total: 116ms	remaining: 19.2s
6:	learn: 144.9717658	total: 126ms	remaining: 17.9s
7:	learn: 138.4864322	total: 137ms	remaining: 17s
8:	learn: 132.4690991	total: 147ms	remaining: 16.2s
9:	learn: 127.1703049	total: 157ms	remaining: 15.6s
10:	learn: 122.1599005	total: 164ms	remaining: 14.8s
11:	learn: 117.4957682	total: 175ms	remaining: 14.4s
12:	learn: 113.3366189	total: 187ms	remaining: 14.2s
13:	learn: 109.4194543	total: 194ms	remaining: 13.7s
14:	learn: 105.8628515	total: 203ms	remaining: 13.3s
15:	learn: 102.6869637	total: 222ms	remaining: 13.6s
16:	learn: 99.8312045	total: 238ms	remaining: 13.7s
17:	learn: 97.1822139	total: 250ms	remaining: 13.6s
18:	learn: 94.7578505	total: 

In [7]:
# Fit the location-C pipeline, then predict on the C test frame with the
# "id", "prediction" and "location" columns removed.
pred_c = (
    locC_pipeline
    .fit(X_train_c, targets_c)
    .predict(X_test_c.drop(columns=["id", "prediction", "location"]))
)

Learning rate set to 0.066673
0:	learn: 167.6269557	total: 10.5ms	remaining: 10.5s
1:	learn: 158.8828033	total: 16.7ms	remaining: 8.32s
2:	learn: 150.5268448	total: 22.1ms	remaining: 7.33s
3:	learn: 143.1455878	total: 27.3ms	remaining: 6.79s
4:	learn: 136.1622449	total: 32.3ms	remaining: 6.42s
5:	learn: 129.7831115	total: 37.7ms	remaining: 6.25s
6:	learn: 123.8633038	total: 42.7ms	remaining: 6.06s
7:	learn: 118.3174534	total: 47.8ms	remaining: 5.93s
8:	learn: 113.3177256	total: 53.2ms	remaining: 5.86s
9:	learn: 108.6706453	total: 58.3ms	remaining: 5.77s
10:	learn: 104.4989678	total: 63.1ms	remaining: 5.67s
11:	learn: 100.4864721	total: 71.6ms	remaining: 5.9s
12:	learn: 96.9572531	total: 76.7ms	remaining: 5.82s
13:	learn: 93.6133358	total: 81.8ms	remaining: 5.76s
14:	learn: 90.5622939	total: 87ms	remaining: 5.71s
15:	learn: 87.6957896	total: 91.9ms	remaining: 5.65s
16:	learn: 85.1766366	total: 96.8ms	remaining: 5.6s
17:	learn: 82.8793283	total: 102ms	remaining: 5.57s
18:	learn: 80.71252

In [9]:
# Combine the per-location test frames and predictions into one submission.
submission = prepare_submission(X_test_a, X_test_b, X_test_c, pred_a, pred_b, pred_c)

# Predictions strictly below this threshold are replaced by 0.
MIN_PREDICTION = 0.05

# Vectorised replacement instead of a per-element Python lambda. As with the
# original comparison, NaN values do not satisfy `< MIN_PREDICTION` and are
# therefore left unchanged.
submission['prediction'] = submission['prediction'].mask(
    submission['prediction'] < MIN_PREDICTION, 0
)

In [10]:
from pathlib import Path

# Create the output folder first: to_csv does not create missing parent
# directories, so a fresh checkout without it would fail here.
submission_path = Path('submissions/catboost.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)

# Write the submission without the DataFrame index column.
submission.to_csv(submission_path, index=False)