In [1]:
import pandas as pd
import numpy as np
from functions import load_data, get_train_targets, get_test_data, prepare_submission, remove_ouliers

data_a, data_b, data_c = load_data()

data_a = remove_ouliers(data_a)
data_b = remove_ouliers(data_b)
data_c = remove_ouliers(data_c)

X_train_a, targets_a = get_train_targets(data_a)
X_train_b, targets_b = get_train_targets(data_b)
X_train_c, targets_c = get_train_targets(data_c)

X_test_a, X_test_b, X_test_c = get_test_data()

In [2]:
locA_features = ['clear_sky_energy_1h:J', 'cloud_base_agl:m', 'dew_point_2m:K', 
                 'diffuse_rad_1h:J', 'direct_rad_1h:J', 'sun_azimuth:d', 'sun_elevation:d', 
                 'wind_speed_u_10m:ms','sfc_pressure:hPa','snow_water:kgm2','precip_5min:mm', 
                 'fresh_snow_3h:cm']

drop_colsA = list(set(list(X_train_a.columns)) - set(locA_features))

log_transformA = ['cloud_base_agl:m', 'dew_point_2m:K', 'diffuse_rad_1h:J', 'direct_rad_1h:J', 'sun_elevation:d', 
                 'snow_water:kgm2','precip_5min:mm', 'fresh_snow_3h:cm','dew_point_2m:K']
log_transformB = ['cloud_base_agl:m', 'dew_point_2m:K', 'diffuse_rad:W', 'direct_rad:W', 'sun_elevation:d', 
                 'snow_water:kgm2','precip_5min:mm', 'fresh_snow_3h:cm']
log_transformC = ['cloud_base_agl:m', 'dew_point_2m:K', 'diffuse_rad:W', 'direct_rad:W', 'sun_elevation:d', 
                 'snow_water:kgm2','precip_5min:mm', 'fresh_snow_3h:cm','dew_point_2m:K']

locB_features = ['clear_sky_rad:W', 'cloud_base_agl:m',
                 'diffuse_rad:W', 'direct_rad:W', 'sun_azimuth:d', 'sun_elevation:d', 
                 'wind_speed_u_10m:ms','sfc_pressure:hPa','snow_water:kgm2','precip_5min:mm', 
                 'fresh_snow_3h:cm','dew_point_2m:K']

drop_colsB = list(set(list(X_train_b.columns)) - set(locB_features))


locC_features = ['clear_sky_rad:W', 'cloud_base_agl:m',
                 'diffuse_rad:W', 'direct_rad:W', 'sun_azimuth:d', 'sun_elevation:d', 
                 'wind_speed_u_10m:ms','sfc_pressure:hPa','snow_water:kgm2','precip_5min:mm', 
                 'fresh_snow_3h:cm','dew_point_2m:K']

drop_colsC = list(set(list(X_train_c.columns)) - set(locC_features))

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Drops columns from the data."""

    def __init__(self, drop_cols = []):
        self.drop_cols = drop_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        return X_copy.drop(columns=self.drop_cols)
    
class LogTransformer(BaseEstimator, TransformerMixin):
    """Does a log transform on specified columns."""

    def __init__(self, cols = []):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        X_copy[self.cols].apply(lambda x: np.log(x))
        return X_copy.drop(columns=self.cols)


locA_pipeline = Pipeline([
    ('drop_cols', ColumnDropper(drop_cols=drop_colsA)),
    ('log_transform', LogTransformer(cols=log_transformA)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ('standard_scaler', StandardScaler()),
    ('svr', SVR())
])

locB_pipeline = Pipeline([
    ('drop_cols', ColumnDropper(drop_cols=drop_colsB)),
    ('log_transform', LogTransformer(cols=log_transformB)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ('standard_scaler', StandardScaler()),
    ('svr', SVR())
])

locC_pipeline = Pipeline([
    ('drop_cols', ColumnDropper(drop_cols=drop_colsC)),
    ('log_transform', LogTransformer(cols=log_transformC)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ('standard_scaler', StandardScaler()),
    ('svr', SVR())
])

In [4]:
locA_pipeline.fit(X_train_a, targets_a)
pred_a = locA_pipeline.predict(X_test_a.drop(columns=["id", "prediction", "location"]))

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
locB_pipeline.fit(X_train_b, targets_b)
pred_b = locB_pipeline.predict(X_test_b.drop(columns=["id", "prediction", "location"]))

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [6]:
locC_pipeline.fit(X_train_c, targets_c)
pred_c = locC_pipeline.predict(X_test_c.drop(columns=["id", "prediction", "location"]))

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [7]:
from sklearn.metrics import mean_squared_error

train_predA = locA_pipeline.predict(X_train_a)
# train_predB = locB_pipeline.predict(X_train_b)
# train_predC = locC_pipeline.predict(X_train_c)

# print("MSA A:", mean_squared_error(train_predA, targets_a))
# print("MSA B:", mean_squared_error(train_predB, targets_b))
# print("MSA C:", mean_squared_error(train_predC, targets_c))

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [7]:
submission = prepare_submission(X_test_a, X_test_b, X_test_c, pred_a, pred_b, pred_c)
submission['prediction'] = submission['prediction'].apply(lambda x: 0 if x < 0.05 else x)

In [8]:
submission

Unnamed: 0,id,prediction
0,0,0.158619
1,1,0.000000
2,2,1.419603
3,3,4.101368
4,4,45.666944
...,...,...
715,2155,22.152547
716,2156,5.763198
717,2157,0.491911
718,2158,0.103863


In [9]:
submission.to_csv('submissions/SVR_default_params_few_features.csv', index=False)